diff options
author | Marek Olsak <marek.olsak@amd.com> | 2017-11-09 01:52:48 +0000 |
---|---|---|
committer | Marek Olsak <marek.olsak@amd.com> | 2017-11-09 01:52:48 +0000 |
commit | aa75d4aeb0067c76f0df50b1b1a38e7ceb853a7c (patch) | |
tree | 9ed1185b5c8f9b51018b9d6aafa2485693e3e8ff /test/CodeGen/AMDGPU | |
parent | e79e4fb9f114fbe54a06e0462d588c698acbd461 (diff) |
AMDGPU: Lower buffer store and atomic intrinsics manually
Summary:
Without this, SIMemoryLegalizer inserts s_waitcnt vmcnt(0) before every
buffer store and atomic instruction.
Reviewers: arsenm, nhaehnle
Subscribers: kzhuravl, wdng, yaxunl, dstuttard, tpr, llvm-commits, t-tye
Differential Revision: https://reviews.llvm.org/D39060
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317754 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'test/CodeGen/AMDGPU')
-rw-r--r-- | test/CodeGen/AMDGPU/llvm.amdgcn.buffer.atomic.ll | 3 | ||||
-rw-r--r-- | test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.ll | 9 | ||||
-rw-r--r-- | test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll | 9 | ||||
-rw-r--r-- | test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.ll | 7 | ||||
-rw-r--r-- | test/CodeGen/AMDGPU/llvm.amdgcn.image.ll | 14 | ||||
-rw-r--r-- | test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll | 2 |
6 files changed, 44 insertions, 0 deletions
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.atomic.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.atomic.ll index b6f72a114d9..0ce7d3efe45 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.atomic.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.atomic.ll @@ -2,6 +2,7 @@ ;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=VI ;CHECK-LABEL: {{^}}test1: +;CHECK-NOT: s_waitcnt ;CHECK: buffer_atomic_swap v0, off, s[0:3], 0 glc ;VI: s_movk_i32 [[SOFS:s[0-9]+]], 0x1ffc ;CHECK: s_waitcnt vmcnt(0) @@ -32,6 +33,7 @@ main_body: } ;CHECK-LABEL: {{^}}test2: +;CHECK-NOT: s_waitcnt ;CHECK: buffer_atomic_add v0, v1, s[0:3], 0 idxen glc ;CHECK: s_waitcnt vmcnt(0) ;CHECK: buffer_atomic_sub v0, v1, s[0:3], 0 idxen glc @@ -69,6 +71,7 @@ main_body: ; create copies which we don't bother to track here. ; ;CHECK-LABEL: {{^}}test3: +;CHECK-NOT: s_waitcnt ;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 glc ;CHECK: s_waitcnt vmcnt(0) ;VI: s_movk_i32 [[SOFS:s[0-9]+]], 0x1ffc diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.ll index e50455f6f9a..7fc7acdff92 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.ll @@ -2,6 +2,7 @@ ;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s ;CHECK-LABEL: {{^}}buffer_store: +;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_format_xyzw v[0:3], off, s[0:3], 0 ;CHECK: buffer_store_format_xyzw v[4:7], off, s[0:3], 0 glc ;CHECK: buffer_store_format_xyzw v[8:11], off, s[0:3], 0 slc @@ -14,6 +15,7 @@ main_body: } ;CHECK-LABEL: {{^}}buffer_store_immoffs: +;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_format_xyzw v[0:3], off, s[0:3], 0 offset:42 define amdgpu_ps void @buffer_store_immoffs(<4 x i32> inreg, <4 x float>) { main_body: @@ -22,6 +24,7 @@ main_body: } ;CHECK-LABEL: {{^}}buffer_store_idx: +;CHECK-NOT: s_waitcnt 
;CHECK: buffer_store_format_xyzw v[0:3], v4, s[0:3], 0 idxen define amdgpu_ps void @buffer_store_idx(<4 x i32> inreg, <4 x float>, i32) { main_body: @@ -30,6 +33,7 @@ main_body: } ;CHECK-LABEL: {{^}}buffer_store_ofs: +;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_format_xyzw v[0:3], v4, s[0:3], 0 offen define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float>, i32) { main_body: @@ -38,6 +42,7 @@ main_body: } ;CHECK-LABEL: {{^}}buffer_store_both: +;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_format_xyzw v[0:3], v[4:5], s[0:3], 0 idxen offen define amdgpu_ps void @buffer_store_both(<4 x i32> inreg, <4 x float>, i32, i32) { main_body: @@ -47,6 +52,7 @@ main_body: ;CHECK-LABEL: {{^}}buffer_store_both_reversed: ;CHECK: v_mov_b32_e32 v6, v4 +;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_format_xyzw v[0:3], v[5:6], s[0:3], 0 idxen offen define amdgpu_ps void @buffer_store_both_reversed(<4 x i32> inreg, <4 x float>, i32, i32) { main_body: @@ -57,6 +63,7 @@ main_body: ; Ideally, the register allocator would avoid the wait here ; ;CHECK-LABEL: {{^}}buffer_store_wait: +;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_format_xyzw v[0:3], v4, s[0:3], 0 idxen ;CHECK: s_waitcnt expcnt(0) ;CHECK: buffer_load_format_xyzw v[0:3], v5, s[0:3], 0 idxen @@ -71,6 +78,7 @@ main_body: } ;CHECK-LABEL: {{^}}buffer_store_x1: +;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_format_x v0, v1, s[0:3], 0 idxen define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %index) { main_body: @@ -79,6 +87,7 @@ main_body: } ;CHECK-LABEL: {{^}}buffer_store_x2: +;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_format_xy v[0:1], v2, s[0:3], 0 idxen define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data, i32 %index) { main_body: diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll index 81597516d5f..c6200cacbe8 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll +++ 
b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll @@ -2,6 +2,7 @@ ;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s ;CHECK-LABEL: {{^}}buffer_store: +;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ;CHECK: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 glc ;CHECK: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 slc @@ -14,6 +15,7 @@ main_body: } ;CHECK-LABEL: {{^}}buffer_store_immoffs: +;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:42 define amdgpu_ps void @buffer_store_immoffs(<4 x i32> inreg, <4 x float>) { main_body: @@ -22,6 +24,7 @@ main_body: } ;CHECK-LABEL: {{^}}buffer_store_idx: +;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen define amdgpu_ps void @buffer_store_idx(<4 x i32> inreg, <4 x float>, i32) { main_body: @@ -30,6 +33,7 @@ main_body: } ;CHECK-LABEL: {{^}}buffer_store_ofs: +;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float>, i32) { main_body: @@ -38,6 +42,7 @@ main_body: } ;CHECK-LABEL: {{^}}buffer_store_both: +;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 idxen offen define amdgpu_ps void @buffer_store_both(<4 x i32> inreg, <4 x float>, i32, i32) { main_body: @@ -47,6 +52,7 @@ main_body: ;CHECK-LABEL: {{^}}buffer_store_both_reversed: ;CHECK: v_mov_b32_e32 v6, v4 +;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 idxen offen define amdgpu_ps void @buffer_store_both_reversed(<4 x i32> inreg, <4 x float>, i32, i32) { main_body: @@ -57,6 +63,7 @@ main_body: ; Ideally, the register allocator would avoid the wait here ; ;CHECK-LABEL: {{^}}buffer_store_wait: +;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen ;CHECK: s_waitcnt expcnt(0) ;CHECK: buffer_load_dwordx4 v[0:3], v5, s[0:3], 0 idxen @@ -71,6 +78,7 @@ main_body: } ;CHECK-LABEL: {{^}}buffer_store_x1: 
+;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_dword v0, v1, s[0:3], 0 idxen define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %index) { main_body: @@ -79,6 +87,7 @@ main_body: } ;CHECK-LABEL: {{^}}buffer_store_x2: +;CHECK-NOT: s_waitcnt ;CHECK: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 idxen define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data, i32 %index) #0 { main_body: diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.ll index 87d83872788..5dec4ad9c1e 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.ll @@ -2,6 +2,7 @@ ;RUN: llc < %s -march=amdgcn -mcpu=tonga -show-mc-encoding -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=VI ;CHECK-LABEL: {{^}}image_atomic_swap: +;CHECK-NOT: s_waitcnt ;SI: image_atomic_swap v4, v[0:3], s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x3c,0xf0,0x00,0x04,0x00,0x00] ;VI: image_atomic_swap v4, v[0:3], s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x40,0xf0,0x00,0x04,0x00,0x00] ;CHECK: s_waitcnt vmcnt(0) @@ -13,6 +14,7 @@ main_body: } ;CHECK-LABEL: {{^}}image_atomic_swap_v2i32: +;CHECK-NOT: s_waitcnt ;SI: image_atomic_swap v2, v[0:1], s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x3c,0xf0,0x00,0x02,0x00,0x00] ;VI: image_atomic_swap v2, v[0:1], s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x40,0xf0,0x00,0x02,0x00,0x00] ;CHECK: s_waitcnt vmcnt(0) @@ -24,6 +26,7 @@ main_body: } ;CHECK-LABEL: {{^}}image_atomic_swap_i32: +;CHECK-NOT: s_waitcnt ;SI: image_atomic_swap v1, v0, s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x3c,0xf0,0x00,0x01,0x00,0x00] ;VI: image_atomic_swap v1, v0, s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x40,0xf0,0x00,0x01,0x00,0x00] ;CHECK: s_waitcnt vmcnt(0) @@ -35,6 +38,7 @@ main_body: } ;CHECK-LABEL: {{^}}image_atomic_cmpswap: +;CHECK-NOT: s_waitcnt ;SI: image_atomic_cmpswap v[4:5], v[0:3], 
s[0:7] dmask:0x3 unorm glc ; encoding: [0x00,0x33,0x40,0xf0,0x00,0x04,0x00,0x00] ;VI: image_atomic_cmpswap v[4:5], v[0:3], s[0:7] dmask:0x3 unorm glc ; encoding: [0x00,0x33,0x44,0xf0,0x00,0x04,0x00,0x00] ;CHECK: s_waitcnt vmcnt(0) @@ -47,6 +51,7 @@ main_body: } ;CHECK-LABEL: {{^}}image_atomic_add: +;CHECK-NOT: s_waitcnt ;SI: image_atomic_add v4, v[0:3], s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x44,0xf0,0x00,0x04,0x00,0x00] ;VI: image_atomic_add v4, v[0:3], s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x48,0xf0,0x00,0x04,0x00,0x00] ;CHECK: s_waitcnt vmcnt(0) @@ -58,6 +63,7 @@ main_body: } ;CHECK-LABEL: {{^}}image_atomic_sub: +;CHECK-NOT: s_waitcnt ;SI: image_atomic_sub v4, v[0:3], s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x48,0xf0,0x00,0x04,0x00,0x00] ;VI: image_atomic_sub v4, v[0:3], s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x4c,0xf0,0x00,0x04,0x00,0x00] ;CHECK: s_waitcnt vmcnt(0) @@ -69,6 +75,7 @@ main_body: } ;CHECK-LABEL: {{^}}image_atomic_unchanged: +;CHECK-NOT: s_waitcnt ;CHECK: image_atomic_smin v4, v[0:3], s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x50,0xf0,0x00,0x04,0x00,0x00] ;CHECK: s_waitcnt vmcnt(0) ;CHECK: image_atomic_umin v4, v[0:3], s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x54,0xf0,0x00,0x04,0x00,0x00] diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.image.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.image.ll index a289f7b0cfb..42c87056746 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.image.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.image.ll @@ -2,6 +2,7 @@ ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s ; GCN-LABEL: {{^}}image_load_v4i32: +; GCN-NOT: s_waitcnt ; GCN: image_load v[0:3], v[0:3], s[0:7] dmask:0xf unorm ; GCN: s_waitcnt vmcnt(0) define amdgpu_ps <4 x float> @image_load_v4i32(<8 x i32> inreg %rsrc, <4 x i32> %c) #0 { @@ -11,6 +12,7 @@ main_body: } ; GCN-LABEL: {{^}}image_load_v2i32: +; GCN-NOT: s_waitcnt ; GCN: image_load v[0:3], v[0:1], s[0:7] 
dmask:0xf unorm ; GCN: s_waitcnt vmcnt(0) define amdgpu_ps <4 x float> @image_load_v2i32(<8 x i32> inreg %rsrc, <2 x i32> %c) #0 { @@ -20,6 +22,7 @@ main_body: } ; GCN-LABEL: {{^}}image_load_i32: +; GCN-NOT: s_waitcnt ; GCN: image_load v[0:3], v0, s[0:7] dmask:0xf unorm ; GCN: s_waitcnt vmcnt(0) define amdgpu_ps <4 x float> @image_load_i32(<8 x i32> inreg %rsrc, i32 %c) #0 { @@ -29,6 +32,7 @@ main_body: } ; GCN-LABEL: {{^}}image_load_mip: +; GCN-NOT: s_waitcnt ; GCN: image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf unorm ; GCN: s_waitcnt vmcnt(0) define amdgpu_ps <4 x float> @image_load_mip(<8 x i32> inreg %rsrc, <4 x i32> %c) #0 { @@ -38,6 +42,7 @@ main_body: } ; GCN-LABEL: {{^}}image_load_1: +; GCN-NOT: s_waitcnt ; GCN: image_load v0, v[0:3], s[0:7] dmask:0x1 unorm ; GCN: s_waitcnt vmcnt(0) define amdgpu_ps float @image_load_1(<8 x i32> inreg %rsrc, <4 x i32> %c) #0 { @@ -48,6 +53,7 @@ main_body: } ; GCN-LABEL: {{^}}image_load_f32_v2i32: +; GCN-NOT: s_waitcnt ; GCN: image_load {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 unorm ; GCN: s_waitcnt vmcnt(0) define amdgpu_ps float @image_load_f32_v2i32(<8 x i32> inreg %rsrc, <2 x i32> %c) #0 { @@ -57,6 +63,7 @@ main_body: } ; GCN-LABEL: {{^}}image_load_v2f32_v4i32: +; GCN-NOT: s_waitcnt ; GCN: image_load {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x3 unorm ; GCN: s_waitcnt vmcnt(0) define amdgpu_ps <2 x float> @image_load_v2f32_v4i32(<8 x i32> inreg %rsrc, <4 x i32> %c) #0 { @@ -66,6 +73,7 @@ main_body: } ; GCN-LABEL: {{^}}image_store_v4i32: +; GCN-NOT: s_waitcnt ; GCN: image_store v[0:3], v[4:7], s[0:7] dmask:0xf unorm define amdgpu_ps void @image_store_v4i32(<8 x i32> inreg %rsrc, <4 x float> %data, <4 x i32> %coords) #0 { main_body: @@ -74,6 +82,7 @@ main_body: } ; GCN-LABEL: {{^}}image_store_v2i32: +; GCN-NOT: s_waitcnt ; GCN: image_store v[0:3], v[4:5], s[0:7] dmask:0xf unorm define amdgpu_ps void @image_store_v2i32(<8 x i32> inreg %rsrc, <4 x float> %data, 
<2 x i32> %coords) #0 { main_body: @@ -82,6 +91,7 @@ main_body: } ; GCN-LABEL: {{^}}image_store_i32: +; GCN-NOT: s_waitcnt ; GCN: image_store v[0:3], v4, s[0:7] dmask:0xf unorm define amdgpu_ps void @image_store_i32(<8 x i32> inreg %rsrc, <4 x float> %data, i32 %coords) #0 { main_body: @@ -90,6 +100,7 @@ main_body: } ; GCN-LABEL: {{^}}image_store_f32_i32: +; GCN-NOT: s_waitcnt ; GCN: image_store {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 unorm define amdgpu_ps void @image_store_f32_i32(<8 x i32> inreg %rsrc, float %data, i32 %coords) #0 { main_body: @@ -98,6 +109,7 @@ main_body: } ; GCN-LABEL: {{^}}image_store_v2f32_v4i32: +; GCN-NOT: s_waitcnt ; GCN: image_store {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x3 unorm define amdgpu_ps void @image_store_v2f32_v4i32(<8 x i32> inreg %rsrc, <2 x float> %data, <4 x i32> %coords) #0 { main_body: @@ -106,6 +118,7 @@ main_body: } ; GCN-LABEL: {{^}}image_store_mip: +; GCN-NOT: s_waitcnt ; GCN: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf unorm define amdgpu_ps void @image_store_mip(<8 x i32> inreg %rsrc, <4 x float> %data, <4 x i32> %coords) #0 { main_body: @@ -114,6 +127,7 @@ main_body: } ; GCN-LABEL: {{^}}getresinfo: +; GCN-NOT: s_waitcnt ; GCN: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf define amdgpu_ps void @getresinfo() #0 { main_body: diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll index a466671d8c5..f6c2cb44c99 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll @@ -2,6 +2,7 @@ ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s ; CHECK-LABEL: {{^}}test1: +; CHECK-NOT: s_waitcnt ; CHECK: image_store ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0){{$}} ; CHECK-NEXT: image_store @@ -17,6 +18,7 @@ define amdgpu_ps void @test1(<8 x i32> inreg %rsrc, <4 x float> %d0, <4 x float> ; emitted as 
late as possible. ; ; CHECK-LABEL: {{^}}test2: +; CHECK-NOT: s_waitcnt ; CHECK: image_load ; CHECK-NEXT: s_waitcnt ; CHECK: s_waitcnt vmcnt(0){{$}}