diff options
author | Roman Tereshin <rtereshin@apple.com> | 2018-07-19 19:42:43 +0000 |
---|---|---|
committer | Roman Tereshin <rtereshin@apple.com> | 2018-07-19 19:42:43 +0000 |
commit | b2f9f92413fa4c9669f5f49716745c5f90d3fd61 (patch) | |
tree | 71586a90bab6c0fe6d9d26608f78b1e670e39e28 /test | |
parent | f9fb677cc2434e62c164c1322dbe7eacb47617f3 (diff) |
[LSV] Refactoring + supporting bitcasts to a type of different size
This is mostly preparatory work for adding limited support for
select instructions. That proved difficult to do because of the size and
irregularity of Vectorizer::isConsecutiveAccess; I believe this is fixed
here.
It also turned out that these changes make it simpler to finish one of
the TODOs and fix a number of other small issues, namely:
1. Looking through bitcasts to a type of a different size (requires
careful tracking of the original load/store size and some math
converting sizes in bytes to expected differences in indices of GEPs).
2. Reusing the partial analysis of pointers done by the first attempt at
proving them consecutive instead of starting from scratch. This added
limited support for nested GEPs co-existing with difficult sext/zext
instructions. This also required careful handling of negative
differences between the constant parts of offsets.
3. Handling a case where the first pointer index is not an add, but
something else (a function parameter for instance).
I observe an increased number of successful vectorizations on a large
set of shader programs. Only a few shaders are affected, but those that
are affected sport >5% fewer loads and stores than before the patch.
Reviewed By: rampitec
Differential-Revision: https://reviews.llvm.org/D49342
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@337489 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'test')
-rw-r--r-- | test/CodeGen/X86/loadStore_vectorizer.ll | 18 | ||||
-rw-r--r-- | test/Transforms/LoadStoreVectorizer/AMDGPU/gep-bitcast.ll | 21 |
2 files changed, 35 insertions, 4 deletions
diff --git a/test/CodeGen/X86/loadStore_vectorizer.ll b/test/CodeGen/X86/loadStore_vectorizer.ll index 03f6ccce6c4..48f31563c24 100644 --- a/test/CodeGen/X86/loadStore_vectorizer.ll +++ b/test/CodeGen/X86/loadStore_vectorizer.ll @@ -1,8 +1,9 @@ -; RUN: opt -load-store-vectorizer < %s -S | FileCheck %s +; RUN: opt -mtriple x86_64-- -load-store-vectorizer < %s -S | FileCheck %s %struct_render_pipeline_state = type opaque -define fastcc void @main(%struct_render_pipeline_state addrspace(1)* %pso) unnamed_addr { +define fastcc void @test1(%struct_render_pipeline_state addrspace(1)* %pso) unnamed_addr { +; CHECK-LABEL: @test1 ; CHECK: load i16 ; CHECK: load i16 entry: @@ -14,3 +15,16 @@ entry: %tmp4 = load i16, i16 addrspace(1)* %tmp3, align 2 ret void } + +define fastcc void @test2(%struct_render_pipeline_state addrspace(1)* %pso) unnamed_addr { +; CHECK-LABEL: @test2 +; CHECK: load <2 x i16> +entry: + %tmp = bitcast %struct_render_pipeline_state addrspace(1)* %pso to i16 addrspace(1)* + %tmp1 = load i16, i16 addrspace(1)* %tmp, align 2 + %tmp2 = bitcast %struct_render_pipeline_state addrspace(1)* %pso to i8 addrspace(1)* + %sunkaddr51 = getelementptr i8, i8 addrspace(1)* %tmp2, i64 2 + %tmp3 = bitcast i8 addrspace(1)* %sunkaddr51 to i16 addrspace(1)* + %tmp4 = load i16, i16 addrspace(1)* %tmp3, align 2 + ret void +} diff --git a/test/Transforms/LoadStoreVectorizer/AMDGPU/gep-bitcast.ll b/test/Transforms/LoadStoreVectorizer/AMDGPU/gep-bitcast.ll index b67dc058453..a9c3fbf7b64 100644 --- a/test/Transforms/LoadStoreVectorizer/AMDGPU/gep-bitcast.ll +++ b/test/Transforms/LoadStoreVectorizer/AMDGPU/gep-bitcast.ll @@ -56,8 +56,8 @@ define void @vect_zext_bitcast_i8_st1_to_i32_idx(i8 addrspace(1)* %arg1, i32 %ba ret void } -; TODO: This can be vectorized, but currently vectorizer unable to do it. 
; CHECK-LABEL: @vect_zext_bitcast_i8_st4_to_i32_idx +; CHECK: load <4 x i32> define void @vect_zext_bitcast_i8_st4_to_i32_idx(i8 addrspace(1)* %arg1, i32 %base) { %add1 = add nuw i32 %base, 0 %zext1 = zext i32 %add1 to i64 @@ -74,10 +74,27 @@ define void @vect_zext_bitcast_i8_st4_to_i32_idx(i8 addrspace(1)* %arg1, i32 %ba %gep3 = getelementptr inbounds i8, i8 addrspace(1)* %arg1, i64 %zext3 %f2i3 = bitcast i8 addrspace(1)* %gep3 to i32 addrspace(1)* %load3 = load i32, i32 addrspace(1)* %f2i3, align 4 - %add4 = add nuw i32 %base, 16 + %add4 = add nuw i32 %base, 12 %zext4 = zext i32 %add4 to i64 %gep4 = getelementptr inbounds i8, i8 addrspace(1)* %arg1, i64 %zext4 %f2i4 = bitcast i8 addrspace(1)* %gep4 to i32 addrspace(1)* %load4 = load i32, i32 addrspace(1)* %f2i4, align 4 ret void } + +; CHECK-LABEL: @vect_zext_bitcast_negative_ptr_delta +; CHECK: load <2 x i32> +define void @vect_zext_bitcast_negative_ptr_delta(i32 addrspace(1)* %p, i32 %base) { + %p.bitcasted = bitcast i32 addrspace(1)* %p to i16 addrspace(1)* + %a.offset = add nuw i32 %base, 4 + %t.offset.zexted = zext i32 %base to i64 + %a.offset.zexted = zext i32 %a.offset to i64 + %t.ptr = getelementptr inbounds i16, i16 addrspace(1)* %p.bitcasted, i64 %t.offset.zexted + %a.ptr = getelementptr inbounds i16, i16 addrspace(1)* %p.bitcasted, i64 %a.offset.zexted + %b.ptr = getelementptr inbounds i16, i16 addrspace(1)* %t.ptr, i64 6 + %a.ptr.bitcasted = bitcast i16 addrspace(1)* %a.ptr to i32 addrspace(1)* + %b.ptr.bitcasted = bitcast i16 addrspace(1)* %b.ptr to i32 addrspace(1)* + %a.val = load i32, i32 addrspace(1)* %a.ptr.bitcasted + %b.val = load i32, i32 addrspace(1)* %b.ptr.bitcasted + ret void +} |