summary | refs | log | tree | commit | diff
path: root/test/CodeGen/AMDGPU
diff options
context:
space:
mode:
author: Marek Olsak <marek.olsak@amd.com> 2017-11-09 01:52:48 +0000
committer: Marek Olsak <marek.olsak@amd.com> 2017-11-09 01:52:48 +0000
commit: aa75d4aeb0067c76f0df50b1b1a38e7ceb853a7c (patch)
tree: 9ed1185b5c8f9b51018b9d6aafa2485693e3e8ff /test/CodeGen/AMDGPU
parent: e79e4fb9f114fbe54a06e0462d588c698acbd461 (diff)
AMDGPU: Lower buffer store and atomic intrinsics manually
Summary:
Without this, SIMemoryLegalizer inserts s_waitcnt vmcnt(0) before every buffer store and atomic instruction.

Reviewers: arsenm, nhaehnle

Subscribers: kzhuravl, wdng, yaxunl, dstuttard, tpr, llvm-commits, t-tye

Differential Revision: https://reviews.llvm.org/D39060

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317754 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'test/CodeGen/AMDGPU')
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.buffer.atomic.ll 3
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.ll 9
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll 9
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.ll 7
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.image.ll 14
-rw-r--r-- test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll 2
6 files changed, 44 insertions, 0 deletions
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.atomic.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.atomic.ll
index b6f72a114d9..0ce7d3efe45 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.atomic.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.atomic.ll
@@ -2,6 +2,7 @@
;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=VI
;CHECK-LABEL: {{^}}test1:
+;CHECK-NOT: s_waitcnt
;CHECK: buffer_atomic_swap v0, off, s[0:3], 0 glc
;VI: s_movk_i32 [[SOFS:s[0-9]+]], 0x1ffc
;CHECK: s_waitcnt vmcnt(0)
@@ -32,6 +33,7 @@ main_body:
}
;CHECK-LABEL: {{^}}test2:
+;CHECK-NOT: s_waitcnt
;CHECK: buffer_atomic_add v0, v1, s[0:3], 0 idxen glc
;CHECK: s_waitcnt vmcnt(0)
;CHECK: buffer_atomic_sub v0, v1, s[0:3], 0 idxen glc
@@ -69,6 +71,7 @@ main_body:
; create copies which we don't bother to track here.
;
;CHECK-LABEL: {{^}}test3:
+;CHECK-NOT: s_waitcnt
;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 glc
;CHECK: s_waitcnt vmcnt(0)
;VI: s_movk_i32 [[SOFS:s[0-9]+]], 0x1ffc
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.ll
index e50455f6f9a..7fc7acdff92 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.ll
@@ -2,6 +2,7 @@
;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
;CHECK-LABEL: {{^}}buffer_store:
+;CHECK-NOT: s_waitcnt
;CHECK: buffer_store_format_xyzw v[0:3], off, s[0:3], 0
;CHECK: buffer_store_format_xyzw v[4:7], off, s[0:3], 0 glc
;CHECK: buffer_store_format_xyzw v[8:11], off, s[0:3], 0 slc
@@ -14,6 +15,7 @@ main_body:
}
;CHECK-LABEL: {{^}}buffer_store_immoffs:
+;CHECK-NOT: s_waitcnt
;CHECK: buffer_store_format_xyzw v[0:3], off, s[0:3], 0 offset:42
define amdgpu_ps void @buffer_store_immoffs(<4 x i32> inreg, <4 x float>) {
main_body:
@@ -22,6 +24,7 @@ main_body:
}
;CHECK-LABEL: {{^}}buffer_store_idx:
+;CHECK-NOT: s_waitcnt
;CHECK: buffer_store_format_xyzw v[0:3], v4, s[0:3], 0 idxen
define amdgpu_ps void @buffer_store_idx(<4 x i32> inreg, <4 x float>, i32) {
main_body:
@@ -30,6 +33,7 @@ main_body:
}
;CHECK-LABEL: {{^}}buffer_store_ofs:
+;CHECK-NOT: s_waitcnt
;CHECK: buffer_store_format_xyzw v[0:3], v4, s[0:3], 0 offen
define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float>, i32) {
main_body:
@@ -38,6 +42,7 @@ main_body:
}
;CHECK-LABEL: {{^}}buffer_store_both:
+;CHECK-NOT: s_waitcnt
;CHECK: buffer_store_format_xyzw v[0:3], v[4:5], s[0:3], 0 idxen offen
define amdgpu_ps void @buffer_store_both(<4 x i32> inreg, <4 x float>, i32, i32) {
main_body:
@@ -47,6 +52,7 @@ main_body:
;CHECK-LABEL: {{^}}buffer_store_both_reversed:
;CHECK: v_mov_b32_e32 v6, v4
+;CHECK-NOT: s_waitcnt
;CHECK: buffer_store_format_xyzw v[0:3], v[5:6], s[0:3], 0 idxen offen
define amdgpu_ps void @buffer_store_both_reversed(<4 x i32> inreg, <4 x float>, i32, i32) {
main_body:
@@ -57,6 +63,7 @@ main_body:
; Ideally, the register allocator would avoid the wait here
;
;CHECK-LABEL: {{^}}buffer_store_wait:
+;CHECK-NOT: s_waitcnt
;CHECK: buffer_store_format_xyzw v[0:3], v4, s[0:3], 0 idxen
;CHECK: s_waitcnt expcnt(0)
;CHECK: buffer_load_format_xyzw v[0:3], v5, s[0:3], 0 idxen
@@ -71,6 +78,7 @@ main_body:
}
;CHECK-LABEL: {{^}}buffer_store_x1:
+;CHECK-NOT: s_waitcnt
;CHECK: buffer_store_format_x v0, v1, s[0:3], 0 idxen
define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %index) {
main_body:
@@ -79,6 +87,7 @@ main_body:
}
;CHECK-LABEL: {{^}}buffer_store_x2:
+;CHECK-NOT: s_waitcnt
;CHECK: buffer_store_format_xy v[0:1], v2, s[0:3], 0 idxen
define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data, i32 %index) {
main_body:
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll
index 81597516d5f..c6200cacbe8 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll
@@ -2,6 +2,7 @@
;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
;CHECK-LABEL: {{^}}buffer_store:
+;CHECK-NOT: s_waitcnt
;CHECK: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
;CHECK: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 glc
;CHECK: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 slc
@@ -14,6 +15,7 @@ main_body:
}
;CHECK-LABEL: {{^}}buffer_store_immoffs:
+;CHECK-NOT: s_waitcnt
;CHECK: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:42
define amdgpu_ps void @buffer_store_immoffs(<4 x i32> inreg, <4 x float>) {
main_body:
@@ -22,6 +24,7 @@ main_body:
}
;CHECK-LABEL: {{^}}buffer_store_idx:
+;CHECK-NOT: s_waitcnt
;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen
define amdgpu_ps void @buffer_store_idx(<4 x i32> inreg, <4 x float>, i32) {
main_body:
@@ -30,6 +33,7 @@ main_body:
}
;CHECK-LABEL: {{^}}buffer_store_ofs:
+;CHECK-NOT: s_waitcnt
;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen
define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float>, i32) {
main_body:
@@ -38,6 +42,7 @@ main_body:
}
;CHECK-LABEL: {{^}}buffer_store_both:
+;CHECK-NOT: s_waitcnt
;CHECK: buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 idxen offen
define amdgpu_ps void @buffer_store_both(<4 x i32> inreg, <4 x float>, i32, i32) {
main_body:
@@ -47,6 +52,7 @@ main_body:
;CHECK-LABEL: {{^}}buffer_store_both_reversed:
;CHECK: v_mov_b32_e32 v6, v4
+;CHECK-NOT: s_waitcnt
;CHECK: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 idxen offen
define amdgpu_ps void @buffer_store_both_reversed(<4 x i32> inreg, <4 x float>, i32, i32) {
main_body:
@@ -57,6 +63,7 @@ main_body:
; Ideally, the register allocator would avoid the wait here
;
;CHECK-LABEL: {{^}}buffer_store_wait:
+;CHECK-NOT: s_waitcnt
;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen
;CHECK: s_waitcnt expcnt(0)
;CHECK: buffer_load_dwordx4 v[0:3], v5, s[0:3], 0 idxen
@@ -71,6 +78,7 @@ main_body:
}
;CHECK-LABEL: {{^}}buffer_store_x1:
+;CHECK-NOT: s_waitcnt
;CHECK: buffer_store_dword v0, v1, s[0:3], 0 idxen
define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %index) {
main_body:
@@ -79,6 +87,7 @@ main_body:
}
;CHECK-LABEL: {{^}}buffer_store_x2:
+;CHECK-NOT: s_waitcnt
;CHECK: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 idxen
define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data, i32 %index) #0 {
main_body:
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.ll
index 87d83872788..5dec4ad9c1e 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.ll
@@ -2,6 +2,7 @@
;RUN: llc < %s -march=amdgcn -mcpu=tonga -show-mc-encoding -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=VI
;CHECK-LABEL: {{^}}image_atomic_swap:
+;CHECK-NOT: s_waitcnt
;SI: image_atomic_swap v4, v[0:3], s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x3c,0xf0,0x00,0x04,0x00,0x00]
;VI: image_atomic_swap v4, v[0:3], s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x40,0xf0,0x00,0x04,0x00,0x00]
;CHECK: s_waitcnt vmcnt(0)
@@ -13,6 +14,7 @@ main_body:
}
;CHECK-LABEL: {{^}}image_atomic_swap_v2i32:
+;CHECK-NOT: s_waitcnt
;SI: image_atomic_swap v2, v[0:1], s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x3c,0xf0,0x00,0x02,0x00,0x00]
;VI: image_atomic_swap v2, v[0:1], s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x40,0xf0,0x00,0x02,0x00,0x00]
;CHECK: s_waitcnt vmcnt(0)
@@ -24,6 +26,7 @@ main_body:
}
;CHECK-LABEL: {{^}}image_atomic_swap_i32:
+;CHECK-NOT: s_waitcnt
;SI: image_atomic_swap v1, v0, s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x3c,0xf0,0x00,0x01,0x00,0x00]
;VI: image_atomic_swap v1, v0, s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x40,0xf0,0x00,0x01,0x00,0x00]
;CHECK: s_waitcnt vmcnt(0)
@@ -35,6 +38,7 @@ main_body:
}
;CHECK-LABEL: {{^}}image_atomic_cmpswap:
+;CHECK-NOT: s_waitcnt
;SI: image_atomic_cmpswap v[4:5], v[0:3], s[0:7] dmask:0x3 unorm glc ; encoding: [0x00,0x33,0x40,0xf0,0x00,0x04,0x00,0x00]
;VI: image_atomic_cmpswap v[4:5], v[0:3], s[0:7] dmask:0x3 unorm glc ; encoding: [0x00,0x33,0x44,0xf0,0x00,0x04,0x00,0x00]
;CHECK: s_waitcnt vmcnt(0)
@@ -47,6 +51,7 @@ main_body:
}
;CHECK-LABEL: {{^}}image_atomic_add:
+;CHECK-NOT: s_waitcnt
;SI: image_atomic_add v4, v[0:3], s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x44,0xf0,0x00,0x04,0x00,0x00]
;VI: image_atomic_add v4, v[0:3], s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x48,0xf0,0x00,0x04,0x00,0x00]
;CHECK: s_waitcnt vmcnt(0)
@@ -58,6 +63,7 @@ main_body:
}
;CHECK-LABEL: {{^}}image_atomic_sub:
+;CHECK-NOT: s_waitcnt
;SI: image_atomic_sub v4, v[0:3], s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x48,0xf0,0x00,0x04,0x00,0x00]
;VI: image_atomic_sub v4, v[0:3], s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x4c,0xf0,0x00,0x04,0x00,0x00]
;CHECK: s_waitcnt vmcnt(0)
@@ -69,6 +75,7 @@ main_body:
}
;CHECK-LABEL: {{^}}image_atomic_unchanged:
+;CHECK-NOT: s_waitcnt
;CHECK: image_atomic_smin v4, v[0:3], s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x50,0xf0,0x00,0x04,0x00,0x00]
;CHECK: s_waitcnt vmcnt(0)
;CHECK: image_atomic_umin v4, v[0:3], s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x54,0xf0,0x00,0x04,0x00,0x00]
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.image.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.image.ll
index a289f7b0cfb..42c87056746 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.image.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.image.ll
@@ -2,6 +2,7 @@
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s
; GCN-LABEL: {{^}}image_load_v4i32:
+; GCN-NOT: s_waitcnt
; GCN: image_load v[0:3], v[0:3], s[0:7] dmask:0xf unorm
; GCN: s_waitcnt vmcnt(0)
define amdgpu_ps <4 x float> @image_load_v4i32(<8 x i32> inreg %rsrc, <4 x i32> %c) #0 {
@@ -11,6 +12,7 @@ main_body:
}
; GCN-LABEL: {{^}}image_load_v2i32:
+; GCN-NOT: s_waitcnt
; GCN: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm
; GCN: s_waitcnt vmcnt(0)
define amdgpu_ps <4 x float> @image_load_v2i32(<8 x i32> inreg %rsrc, <2 x i32> %c) #0 {
@@ -20,6 +22,7 @@ main_body:
}
; GCN-LABEL: {{^}}image_load_i32:
+; GCN-NOT: s_waitcnt
; GCN: image_load v[0:3], v0, s[0:7] dmask:0xf unorm
; GCN: s_waitcnt vmcnt(0)
define amdgpu_ps <4 x float> @image_load_i32(<8 x i32> inreg %rsrc, i32 %c) #0 {
@@ -29,6 +32,7 @@ main_body:
}
; GCN-LABEL: {{^}}image_load_mip:
+; GCN-NOT: s_waitcnt
; GCN: image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf unorm
; GCN: s_waitcnt vmcnt(0)
define amdgpu_ps <4 x float> @image_load_mip(<8 x i32> inreg %rsrc, <4 x i32> %c) #0 {
@@ -38,6 +42,7 @@ main_body:
}
; GCN-LABEL: {{^}}image_load_1:
+; GCN-NOT: s_waitcnt
; GCN: image_load v0, v[0:3], s[0:7] dmask:0x1 unorm
; GCN: s_waitcnt vmcnt(0)
define amdgpu_ps float @image_load_1(<8 x i32> inreg %rsrc, <4 x i32> %c) #0 {
@@ -48,6 +53,7 @@ main_body:
}
; GCN-LABEL: {{^}}image_load_f32_v2i32:
+; GCN-NOT: s_waitcnt
; GCN: image_load {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 unorm
; GCN: s_waitcnt vmcnt(0)
define amdgpu_ps float @image_load_f32_v2i32(<8 x i32> inreg %rsrc, <2 x i32> %c) #0 {
@@ -57,6 +63,7 @@ main_body:
}
; GCN-LABEL: {{^}}image_load_v2f32_v4i32:
+; GCN-NOT: s_waitcnt
; GCN: image_load {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x3 unorm
; GCN: s_waitcnt vmcnt(0)
define amdgpu_ps <2 x float> @image_load_v2f32_v4i32(<8 x i32> inreg %rsrc, <4 x i32> %c) #0 {
@@ -66,6 +73,7 @@ main_body:
}
; GCN-LABEL: {{^}}image_store_v4i32:
+; GCN-NOT: s_waitcnt
; GCN: image_store v[0:3], v[4:7], s[0:7] dmask:0xf unorm
define amdgpu_ps void @image_store_v4i32(<8 x i32> inreg %rsrc, <4 x float> %data, <4 x i32> %coords) #0 {
main_body:
@@ -74,6 +82,7 @@ main_body:
}
; GCN-LABEL: {{^}}image_store_v2i32:
+; GCN-NOT: s_waitcnt
; GCN: image_store v[0:3], v[4:5], s[0:7] dmask:0xf unorm
define amdgpu_ps void @image_store_v2i32(<8 x i32> inreg %rsrc, <4 x float> %data, <2 x i32> %coords) #0 {
main_body:
@@ -82,6 +91,7 @@ main_body:
}
; GCN-LABEL: {{^}}image_store_i32:
+; GCN-NOT: s_waitcnt
; GCN: image_store v[0:3], v4, s[0:7] dmask:0xf unorm
define amdgpu_ps void @image_store_i32(<8 x i32> inreg %rsrc, <4 x float> %data, i32 %coords) #0 {
main_body:
@@ -90,6 +100,7 @@ main_body:
}
; GCN-LABEL: {{^}}image_store_f32_i32:
+; GCN-NOT: s_waitcnt
; GCN: image_store {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 unorm
define amdgpu_ps void @image_store_f32_i32(<8 x i32> inreg %rsrc, float %data, i32 %coords) #0 {
main_body:
@@ -98,6 +109,7 @@ main_body:
}
; GCN-LABEL: {{^}}image_store_v2f32_v4i32:
+; GCN-NOT: s_waitcnt
; GCN: image_store {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x3 unorm
define amdgpu_ps void @image_store_v2f32_v4i32(<8 x i32> inreg %rsrc, <2 x float> %data, <4 x i32> %coords) #0 {
main_body:
@@ -106,6 +118,7 @@ main_body:
}
; GCN-LABEL: {{^}}image_store_mip:
+; GCN-NOT: s_waitcnt
; GCN: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf unorm
define amdgpu_ps void @image_store_mip(<8 x i32> inreg %rsrc, <4 x float> %data, <4 x i32> %coords) #0 {
main_body:
@@ -114,6 +127,7 @@ main_body:
}
; GCN-LABEL: {{^}}getresinfo:
+; GCN-NOT: s_waitcnt
; GCN: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
define amdgpu_ps void @getresinfo() #0 {
main_body:
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll
index a466671d8c5..f6c2cb44c99 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll
@@ -2,6 +2,7 @@
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
; CHECK-LABEL: {{^}}test1:
+; CHECK-NOT: s_waitcnt
; CHECK: image_store
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0){{$}}
; CHECK-NEXT: image_store
@@ -17,6 +18,7 @@ define amdgpu_ps void @test1(<8 x i32> inreg %rsrc, <4 x float> %d0, <4 x float>
; emitted as late as possible.
;
; CHECK-LABEL: {{^}}test2:
+; CHECK-NOT: s_waitcnt
; CHECK: image_load
; CHECK-NEXT: s_waitcnt
; CHECK: s_waitcnt vmcnt(0){{$}}