summary | refs | log | tree | commit | diff
path: root/test/CodeGen/AMDGPU
diff options
context:
space:
mode:
author: Matt Arsenault <Matthew.Arsenault@amd.com> 2017-11-13 23:24:26 +0000
committer: Matt Arsenault <Matthew.Arsenault@amd.com> 2017-11-13 23:24:26 +0000
commit: dde12849b2e8fa78ec57164cbfa88e2041328924 (patch)
tree: ca9cf15106b9e536ac3bec305fdaac27083fba55 /test/CodeGen/AMDGPU
parent: 8658d31e24983e1b81b0de4c5ee019c57cbc3207 (diff)
AMDGPU: Fix not converting d16 load/stores to offset
Fixes missed optimization with new MUBUF instructions.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@318106 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'test/CodeGen/AMDGPU')
-rw-r--r-- test/CodeGen/AMDGPU/load-hi16.ll 60
-rw-r--r-- test/CodeGen/AMDGPU/load-lo16.ll 63
-rw-r--r-- test/CodeGen/AMDGPU/store-hi16.ll 39
3 files changed, 156 insertions, 6 deletions
diff --git a/test/CodeGen/AMDGPU/load-hi16.ll b/test/CodeGen/AMDGPU/load-hi16.ll
index b239f39ea30..4d0f5864810 100644
--- a/test/CodeGen/AMDGPU/load-hi16.ll
+++ b/test/CodeGen/AMDGPU/load-hi16.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
-; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s
; GCN-LABEL: {{^}}load_local_hi_v2i16_undeflo:
; GCN: s_waitcnt
@@ -503,6 +503,62 @@ entry:
ret void
}
+; Local object gives known offset, so requires converting from offen
+; to offset variant.
+
+; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_to_offset:
+; GFX9: buffer_store_dword
+; GFX9-NEXT: buffer_load_short_d16_hi v{{[0-9]+}}, off, s[0:3], s5 offset:4094
+define void @load_private_hi_v2i16_reglo_vreg_to_offset(i16 %reg) #0 { ; loads an i16 from a fixed stack slot into element 1 of a <2 x i16>
+entry:
+ %obj0 = alloca [10 x i32], align 4 ; 10 * 4 = 40-byte object declared ahead of the array that is loaded from
+ %obj1 = alloca [4096 x i16], align 2
+ %bc = bitcast [10 x i32]* %obj0 to i32*
+ store volatile i32 123, i32* %bc ; presumably the store matched by the buffer_store_dword check line
+ %gep = getelementptr inbounds [4096 x i16], [4096 x i16]* %obj1, i32 0, i32 2025 ; constant index: 2025 * 2 = byte 4050 of %obj1
+ %load = load i16, i16* %gep ; check line expects offset:4094; 40 + 4050 = 4090, remaining 4 presumably from frame layout -- TODO confirm
+ %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0 ; element 0 comes from the register argument
+ %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1 ; element 1 comes from the stack load
+ store <2 x i16> %build1, <2 x i16> addrspace(1)* undef ; result is stored through an undef addrspace(1) pointer
+ ret void
+}
+
+; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_sexti8_to_offset:
+; GFX9: buffer_store_dword
+; GFX9-NEXT: buffer_load_sbyte_d16_hi v{{[0-9]+}}, off, s[0:3], s5 offset:4095
+define void @load_private_hi_v2i16_reglo_vreg_sexti8_to_offset(i16 %reg) #0 { ; loads an i8 from a fixed stack slot, sign-extends it, and inserts it as element 1
+entry:
+ %obj0 = alloca [10 x i32], align 4 ; 10 * 4 = 40-byte object declared ahead of the array that is loaded from
+ %obj1 = alloca [4096 x i8], align 2
+ %bc = bitcast [10 x i32]* %obj0 to i32*
+ store volatile i32 123, i32* %bc ; presumably the store matched by the buffer_store_dword check line
+ %gep = getelementptr inbounds [4096 x i8], [4096 x i8]* %obj1, i32 0, i32 4051 ; constant index: byte 4051 of %obj1
+ %load = load i8, i8* %gep ; check line expects offset:4095; 40 + 4051 = 4091, remaining 4 presumably from frame layout -- TODO confirm
+ %ext = sext i8 %load to i16 ; sign extension; the check line expects the sbyte form of the load
+ %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0 ; element 0 comes from the register argument
+ %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1 ; element 1 comes from the extended stack load
+ store <2 x i16> %build1, <2 x i16> addrspace(1)* undef ; result is stored through an undef addrspace(1) pointer
+ ret void
+}
+
+; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_zexti8_to_offset:
+; GFX9: buffer_store_dword
+; GFX9-NEXT: buffer_load_ubyte_d16_hi v{{[0-9]+}}, off, s[0:3], s5 offset:4095
+define void @load_private_hi_v2i16_reglo_vreg_zexti8_to_offset(i16 %reg) #0 { ; loads an i8 from a fixed stack slot, zero-extends it, and inserts it as element 1
+entry:
+ %obj0 = alloca [10 x i32], align 4 ; 10 * 4 = 40-byte object declared ahead of the array that is loaded from
+ %obj1 = alloca [4096 x i8], align 2
+ %bc = bitcast [10 x i32]* %obj0 to i32*
+ store volatile i32 123, i32* %bc ; presumably the store matched by the buffer_store_dword check line
+ %gep = getelementptr inbounds [4096 x i8], [4096 x i8]* %obj1, i32 0, i32 4051 ; constant index: byte 4051 of %obj1
+ %load = load i8, i8* %gep ; check line expects offset:4095; 40 + 4051 = 4091, remaining 4 presumably from frame layout -- TODO confirm
+ %ext = zext i8 %load to i16 ; zero extension; the check line expects the ubyte form of the load
+ %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0 ; element 0 comes from the register argument
+ %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1 ; element 1 comes from the extended stack load
+ store <2 x i16> %build1, <2 x i16> addrspace(1)* undef ; result is stored through an undef addrspace(1) pointer
+ ret void
+}
+
; FIXME: Remove m0 init and waitcnt between reads
; FIXME: Is there a cost to using the extload over not?
; GCN-LABEL: {{^}}load_local_v2i16_split:
diff --git a/test/CodeGen/AMDGPU/load-lo16.ll b/test/CodeGen/AMDGPU/load-lo16.ll
index 67d5fcc8a45..e34441653e3 100644
--- a/test/CodeGen/AMDGPU/load-lo16.ll
+++ b/test/CodeGen/AMDGPU/load-lo16.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
-; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s
; GCN-LABEL: {{^}}load_local_lo_v2i16_undeflo:
; GCN: s_waitcnt
@@ -588,4 +588,63 @@ entry:
ret void
}
+; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg_to_offset:
+; GFX9: buffer_store_dword
+; GFX9-NEXT: buffer_load_short_d16 v0, off, s[0:3], s5 offset:4094
+
+; VI: buffer_load_ushort v
+define void @load_private_lo_v2i16_reglo_vreg_to_offset(i32 %reg) #0 { ; loads an i16 from a fixed stack slot into element 0 of the <2 x i16> view of %reg
+entry:
+ %obj0 = alloca [10 x i32], align 4 ; 10 * 4 = 40-byte object declared ahead of the array that is loaded from
+ %obj1 = alloca [4096 x i16], align 2
+ %reg.bc = bitcast i32 %reg to <2 x i16> ; reinterpret the i32 argument as two i16 lanes
+ %bc = bitcast [10 x i32]* %obj0 to i32*
+ store volatile i32 123, i32* %bc ; presumably the store matched by the buffer_store_dword check line
+ %gep = getelementptr inbounds [4096 x i16], [4096 x i16]* %obj1, i32 0, i32 2025 ; constant index: 2025 * 2 = byte 4050 of %obj1
+ %load = load volatile i16, i16* %gep ; check line expects offset:4094; 40 + 4050 = 4090, remaining 4 presumably from frame layout -- TODO confirm
+ %build1 = insertelement <2 x i16> %reg.bc, i16 %load, i32 0 ; replaces element 0 only; element 1 of %reg.bc is kept
+ store <2 x i16> %build1, <2 x i16> addrspace(1)* undef ; result is stored through an undef addrspace(1) pointer
+ ret void
+}
+
+; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg_sexti8_to_offset:
+; GFX9: buffer_store_dword
+; GFX9-NEXT: buffer_load_sbyte_d16 v0, off, s[0:3], s5 offset:4095
+
+; VI: buffer_load_sbyte v
+define void @load_private_lo_v2i16_reglo_vreg_sexti8_to_offset(i32 %reg) #0 { ; loads an i8 from a fixed stack slot, sign-extends it, and inserts it as element 0
+entry:
+ %obj0 = alloca [10 x i32], align 4 ; 10 * 4 = 40-byte object declared ahead of the array that is loaded from
+ %obj1 = alloca [4096 x i8], align 2
+ %reg.bc = bitcast i32 %reg to <2 x i16> ; reinterpret the i32 argument as two i16 lanes
+ %bc = bitcast [10 x i32]* %obj0 to i32*
+ store volatile i32 123, i32* %bc ; presumably the store matched by the buffer_store_dword check line
+ %gep = getelementptr inbounds [4096 x i8], [4096 x i8]* %obj1, i32 0, i32 4051 ; constant index: byte 4051 of %obj1
+ %load = load volatile i8, i8* %gep ; check line expects offset:4095; 40 + 4051 = 4091, remaining 4 presumably from frame layout -- TODO confirm
+ %load.ext = sext i8 %load to i16 ; sign extension; the check lines expect the sbyte form of the load
+ %build1 = insertelement <2 x i16> %reg.bc, i16 %load.ext, i32 0 ; replaces element 0 only; element 1 of %reg.bc is kept
+ store <2 x i16> %build1, <2 x i16> addrspace(1)* undef ; result is stored through an undef addrspace(1) pointer
+ ret void
+}
+
+; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg_zexti8_to_offset:
+; GFX9: buffer_store_dword
+; GFX9-NEXT: buffer_load_ubyte_d16 v0, off, s[0:3], s5 offset:4095
+
+; VI: buffer_load_ubyte v
+define void @load_private_lo_v2i16_reglo_vreg_zexti8_to_offset(i32 %reg) #0 { ; loads an i8 from a fixed stack slot, zero-extends it, and inserts it as element 0
+entry:
+ %obj0 = alloca [10 x i32], align 4 ; 10 * 4 = 40-byte object declared ahead of the array that is loaded from
+ %obj1 = alloca [4096 x i8], align 2
+ %reg.bc = bitcast i32 %reg to <2 x i16> ; reinterpret the i32 argument as two i16 lanes
+ %bc = bitcast [10 x i32]* %obj0 to i32*
+ store volatile i32 123, i32* %bc ; presumably the store matched by the buffer_store_dword check line
+ %gep = getelementptr inbounds [4096 x i8], [4096 x i8]* %obj1, i32 0, i32 4051 ; constant index: byte 4051 of %obj1
+ %load = load volatile i8, i8* %gep ; check line expects offset:4095; 40 + 4051 = 4091, remaining 4 presumably from frame layout -- TODO confirm
+ %load.ext = zext i8 %load to i16 ; zero extension; the check lines expect the ubyte form of the load
+ %build1 = insertelement <2 x i16> %reg.bc, i16 %load.ext, i32 0 ; replaces element 0 only; element 1 of %reg.bc is kept
+ store <2 x i16> %build1, <2 x i16> addrspace(1)* undef ; result is stored through an undef addrspace(1) pointer
+ ret void
+}
+
attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/store-hi16.ll b/test/CodeGen/AMDGPU/store-hi16.ll
index 99af332949a..1749e53930e 100644
--- a/test/CodeGen/AMDGPU/store-hi16.ll
+++ b/test/CodeGen/AMDGPU/store-hi16.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
-; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s
; GCN-LABEL: {{^}}store_global_hi_v2i16:
; GCN: s_waitcnt
@@ -591,4 +591,39 @@ entry:
ret void
}
+; GCN-LABEL: {{^}}store_private_hi_v2i16_to_offset:
+; GCN: s_waitcnt
+; GFX9: buffer_store_dword
+; GFX9-NEXT: buffer_store_short_d16_hi v0, off, s[0:3], s5 offset:4094
+define void @store_private_hi_v2i16_to_offset(i32 %arg) #0 { ; stores element 1 of the <2 x i16> view of %arg to a fixed stack slot
+entry:
+ %obj0 = alloca [10 x i32], align 4 ; 10 * 4 = 40-byte object declared ahead of the array that is stored to
+ %obj1 = alloca [4096 x i16], align 2
+ %bc = bitcast [10 x i32]* %obj0 to i32*
+ store volatile i32 123, i32* %bc ; presumably the store matched by the buffer_store_dword check line
+ %value = bitcast i32 %arg to <2 x i16> ; reinterpret the i32 argument as two i16 lanes
+ %hi = extractelement <2 x i16> %value, i32 1 ; element 1 is the value that gets stored
+ %gep = getelementptr inbounds [4096 x i16], [4096 x i16]* %obj1, i32 0, i32 2025 ; constant index: 2025 * 2 = byte 4050 of %obj1
+ store i16 %hi, i16* %gep ; check line expects offset:4094; 40 + 4050 = 4090, remaining 4 presumably from frame layout -- TODO confirm
+ ret void
+}
+
+; GCN-LABEL: {{^}}store_private_hi_v2i16_i8_to_offset:
+; GCN: s_waitcnt
+; GFX9: buffer_store_dword
+; GFX9-NEXT: buffer_store_byte_d16_hi v0, off, s[0:3], s5 offset:4095
+define void @store_private_hi_v2i16_i8_to_offset(i32 %arg) #0 { ; stores the low byte of element 1 of the <2 x i16> view of %arg to a fixed stack slot
+entry:
+ %obj0 = alloca [10 x i32], align 4 ; 10 * 4 = 40-byte object declared ahead of the array that is stored to
+ %obj1 = alloca [4096 x i8], align 2
+ %bc = bitcast [10 x i32]* %obj0 to i32*
+ store volatile i32 123, i32* %bc ; presumably the store matched by the buffer_store_dword check line
+ %value = bitcast i32 %arg to <2 x i16> ; reinterpret the i32 argument as two i16 lanes
+ %hi = extractelement <2 x i16> %value, i32 1 ; element 1 is the source of the stored byte
+ %gep = getelementptr inbounds [4096 x i8], [4096 x i8]* %obj1, i32 0, i32 4051 ; constant index: byte 4051 of %obj1
+ %trunc = trunc i16 %hi to i8 ; keep only the low 8 bits of element 1
+ store i8 %trunc, i8* %gep ; check line expects offset:4095; 40 + 4051 = 4091, remaining 4 presumably from frame layout -- TODO confirm
+ ret void
+}
+
attributes #0 = { nounwind }