diff options
author | Matt Arsenault <Matthew.Arsenault@amd.com> | 2017-11-13 23:24:26 +0000 |
---|---|---|
committer | Matt Arsenault <Matthew.Arsenault@amd.com> | 2017-11-13 23:24:26 +0000 |
commit | dde12849b2e8fa78ec57164cbfa88e2041328924 (patch) | |
tree | ca9cf15106b9e536ac3bec305fdaac27083fba55 /test/CodeGen/AMDGPU | |
parent | 8658d31e24983e1b81b0de4c5ee019c57cbc3207 (diff) | |
AMDGPU: Fix not converting d16 load/stores to offset
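Fixes a missed optimization with the new d16 MUBUF load/store instructions: when the address is a local stack object at a known offset, the offen form was not being converted to the immediate offset form.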
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@318106 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'test/CodeGen/AMDGPU')
-rw-r--r-- | test/CodeGen/AMDGPU/load-hi16.ll | 60 |
-rw-r--r-- | test/CodeGen/AMDGPU/load-lo16.ll | 63 |
-rw-r--r-- | test/CodeGen/AMDGPU/store-hi16.ll | 39 |
3 files changed, 156 insertions, 6 deletions
diff --git a/test/CodeGen/AMDGPU/load-hi16.ll b/test/CodeGen/AMDGPU/load-hi16.ll index b239f39ea30..4d0f5864810 100644 --- a/test/CodeGen/AMDGPU/load-hi16.ll +++ b/test/CodeGen/AMDGPU/load-hi16.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s -; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s ; GCN-LABEL: {{^}}load_local_hi_v2i16_undeflo: ; GCN: s_waitcnt @@ -503,6 +503,62 @@ entry: ret void } +; Local object gives known offset, so requires converting from offen +; to offset variant. + +; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_to_offset: +; GFX9: buffer_store_dword +; GFX9-NEXT: buffer_load_short_d16_hi v{{[0-9]+}}, off, s[0:3], s5 offset:4094 +define void @load_private_hi_v2i16_reglo_vreg_to_offset(i16 %reg) #0 { +entry: + %obj0 = alloca [10 x i32], align 4 + %obj1 = alloca [4096 x i16], align 2 + %bc = bitcast [10 x i32]* %obj0 to i32* + store volatile i32 123, i32* %bc + %gep = getelementptr inbounds [4096 x i16], [4096 x i16]* %obj1, i32 0, i32 2025 + %load = load i16, i16* %gep + %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0 + %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1 + store <2 x i16> %build1, <2 x i16> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_sexti8_to_offset: +; GFX9: buffer_store_dword +; GFX9-NEXT: buffer_load_sbyte_d16_hi v{{[0-9]+}}, off, s[0:3], s5 offset:4095 +define void @load_private_hi_v2i16_reglo_vreg_sexti8_to_offset(i16 %reg) #0 { +entry: + %obj0 = alloca [10 x i32], align 4 + %obj1 = alloca [4096 x i8], align 2 + %bc = bitcast [10 x i32]* %obj0 to i32* + 
store volatile i32 123, i32* %bc + %gep = getelementptr inbounds [4096 x i8], [4096 x i8]* %obj1, i32 0, i32 4051 + %load = load i8, i8* %gep + %ext = sext i8 %load to i16 + %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0 + %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1 + store <2 x i16> %build1, <2 x i16> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_zexti8_to_offset: +; GFX9: buffer_store_dword +; GFX9-NEXT: buffer_load_ubyte_d16_hi v{{[0-9]+}}, off, s[0:3], s5 offset:4095 +define void @load_private_hi_v2i16_reglo_vreg_zexti8_to_offset(i16 %reg) #0 { +entry: + %obj0 = alloca [10 x i32], align 4 + %obj1 = alloca [4096 x i8], align 2 + %bc = bitcast [10 x i32]* %obj0 to i32* + store volatile i32 123, i32* %bc + %gep = getelementptr inbounds [4096 x i8], [4096 x i8]* %obj1, i32 0, i32 4051 + %load = load i8, i8* %gep + %ext = zext i8 %load to i16 + %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0 + %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1 + store <2 x i16> %build1, <2 x i16> addrspace(1)* undef + ret void +} + ; FIXME: Remove m0 init and waitcnt between reads ; FIXME: Is there a cost to using the extload over not? 
; GCN-LABEL: {{^}}load_local_v2i16_split: diff --git a/test/CodeGen/AMDGPU/load-lo16.ll b/test/CodeGen/AMDGPU/load-lo16.ll index 67d5fcc8a45..e34441653e3 100644 --- a/test/CodeGen/AMDGPU/load-lo16.ll +++ b/test/CodeGen/AMDGPU/load-lo16.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s -; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s ; GCN-LABEL: {{^}}load_local_lo_v2i16_undeflo: ; GCN: s_waitcnt @@ -588,4 +588,63 @@ entry: ret void } +; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg_to_offset: +; GFX9: buffer_store_dword +; GFX9-NEXT: buffer_load_short_d16 v0, off, s[0:3], s5 offset:4094 + +; VI: buffer_load_ushort v +define void @load_private_lo_v2i16_reglo_vreg_to_offset(i32 %reg) #0 { +entry: + %obj0 = alloca [10 x i32], align 4 + %obj1 = alloca [4096 x i16], align 2 + %reg.bc = bitcast i32 %reg to <2 x i16> + %bc = bitcast [10 x i32]* %obj0 to i32* + store volatile i32 123, i32* %bc + %gep = getelementptr inbounds [4096 x i16], [4096 x i16]* %obj1, i32 0, i32 2025 + %load = load volatile i16, i16* %gep + %build1 = insertelement <2 x i16> %reg.bc, i16 %load, i32 0 + store <2 x i16> %build1, <2 x i16> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg_sexti8_to_offset: +; GFX9: buffer_store_dword +; GFX9-NEXT: buffer_load_sbyte_d16 v0, off, s[0:3], s5 offset:4095 + +; VI: buffer_load_sbyte v +define void @load_private_lo_v2i16_reglo_vreg_sexti8_to_offset(i32 %reg) #0 { +entry: + %obj0 = alloca [10 x i32], align 4 + %obj1 = alloca [4096 x i8], align 2 + %reg.bc = bitcast i32 %reg to <2 x i16> + %bc = bitcast [10 x i32]* 
%obj0 to i32* + store volatile i32 123, i32* %bc + %gep = getelementptr inbounds [4096 x i8], [4096 x i8]* %obj1, i32 0, i32 4051 + %load = load volatile i8, i8* %gep + %load.ext = sext i8 %load to i16 + %build1 = insertelement <2 x i16> %reg.bc, i16 %load.ext, i32 0 + store <2 x i16> %build1, <2 x i16> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg_zexti8_to_offset: +; GFX9: buffer_store_dword +; GFX9-NEXT: buffer_load_ubyte_d16 v0, off, s[0:3], s5 offset:4095 + +; VI: buffer_load_ubyte v +define void @load_private_lo_v2i16_reglo_vreg_zexti8_to_offset(i32 %reg) #0 { +entry: + %obj0 = alloca [10 x i32], align 4 + %obj1 = alloca [4096 x i8], align 2 + %reg.bc = bitcast i32 %reg to <2 x i16> + %bc = bitcast [10 x i32]* %obj0 to i32* + store volatile i32 123, i32* %bc + %gep = getelementptr inbounds [4096 x i8], [4096 x i8]* %obj1, i32 0, i32 4051 + %load = load volatile i8, i8* %gep + %load.ext = zext i8 %load to i16 + %build1 = insertelement <2 x i16> %reg.bc, i16 %load.ext, i32 0 + store <2 x i16> %build1, <2 x i16> addrspace(1)* undef + ret void +} + attributes #0 = { nounwind } diff --git a/test/CodeGen/AMDGPU/store-hi16.ll b/test/CodeGen/AMDGPU/store-hi16.ll index 99af332949a..1749e53930e 100644 --- a/test/CodeGen/AMDGPU/store-hi16.ll +++ b/test/CodeGen/AMDGPU/store-hi16.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s -; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s ; GCN-LABEL: {{^}}store_global_hi_v2i16: ; GCN: s_waitcnt @@ -591,4 +591,39 @@ entry: ret void } +; GCN-LABEL: {{^}}store_private_hi_v2i16_to_offset: +; 
GCN: s_waitcnt +; GFX9: buffer_store_dword +; GFX9-NEXT: buffer_store_short_d16_hi v0, off, s[0:3], s5 offset:4094 +define void @store_private_hi_v2i16_to_offset(i32 %arg) #0 { +entry: + %obj0 = alloca [10 x i32], align 4 + %obj1 = alloca [4096 x i16], align 2 + %bc = bitcast [10 x i32]* %obj0 to i32* + store volatile i32 123, i32* %bc + %value = bitcast i32 %arg to <2 x i16> + %hi = extractelement <2 x i16> %value, i32 1 + %gep = getelementptr inbounds [4096 x i16], [4096 x i16]* %obj1, i32 0, i32 2025 + store i16 %hi, i16* %gep + ret void +} + +; GCN-LABEL: {{^}}store_private_hi_v2i16_i8_to_offset: +; GCN: s_waitcnt +; GFX9: buffer_store_dword +; GFX9-NEXT: buffer_store_byte_d16_hi v0, off, s[0:3], s5 offset:4095 +define void @store_private_hi_v2i16_i8_to_offset(i32 %arg) #0 { +entry: + %obj0 = alloca [10 x i32], align 4 + %obj1 = alloca [4096 x i8], align 2 + %bc = bitcast [10 x i32]* %obj0 to i32* + store volatile i32 123, i32* %bc + %value = bitcast i32 %arg to <2 x i16> + %hi = extractelement <2 x i16> %value, i32 1 + %gep = getelementptr inbounds [4096 x i8], [4096 x i8]* %obj1, i32 0, i32 4051 + %trunc = trunc i16 %hi to i8 + store i8 %trunc, i8* %gep + ret void +} + attributes #0 = { nounwind } |