summaryrefslogtreecommitdiff
path: root/test/CodeGen/AMDGPU
diff options
context:
space:
mode:
authorMatt Arsenault <Matthew.Arsenault@amd.com>2017-11-13 00:22:09 +0000
committerMatt Arsenault <Matthew.Arsenault@amd.com>2017-11-13 00:22:09 +0000
commitd8f8d0c326f41a944d67fe7ac0feb7c53c047e47 (patch)
tree58bfb18399bc881c875a802bce39076e4bacbc66 /test/CodeGen/AMDGPU
parent19f503cb0244cd2f66c41249b205ba0e082d9c04 (diff)
AMDGPU: Select d16 loads into low component of register
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@318005 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'test/CodeGen/AMDGPU')
-rw-r--r--test/CodeGen/AMDGPU/extract_vector_elt-i16.ll11
-rw-r--r--test/CodeGen/AMDGPU/load-hi16.ll98
-rw-r--r--test/CodeGen/AMDGPU/load-lo16.ll591
3 files changed, 695 insertions, 5 deletions
diff --git a/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll b/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll
index b3d6d79e8db..4dee500c842 100644
--- a/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll
+++ b/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll
@@ -100,15 +100,16 @@ define amdgpu_kernel void @extract_vector_elt_v4i16(i16 addrspace(1)* %out, <4 x
; SICIVI: buffer_store_short
; SICIVI: buffer_store_short
-; GFX9: buffer_load_ushort
+; SICIVI: buffer_load_ushort
+; SICIVI: buffer_store_short
+
; GFX9: buffer_load_ushort
; GFX9: global_load_short_d16_hi
-
+; GFX9: global_load_short_d16 v
; GFX9: buffer_store_dword
; GFX9: buffer_store_dword
-
-; GCN: buffer_load_ushort
-; GCN: buffer_store_short
+; GFX9: buffer_load_ushort
+; GFX9: buffer_store_short
define amdgpu_kernel void @dynamic_extract_vector_elt_v3i16(i16 addrspace(1)* %out, <3 x i16> %foo, i32 %idx) #0 {
%p0 = extractelement <3 x i16> %foo, i32 %idx
%out1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
diff --git a/test/CodeGen/AMDGPU/load-hi16.ll b/test/CodeGen/AMDGPU/load-hi16.ll
index 88a60935c74..b239f39ea30 100644
--- a/test/CodeGen/AMDGPU/load-hi16.ll
+++ b/test/CodeGen/AMDGPU/load-hi16.ll
@@ -503,4 +503,102 @@ entry:
ret void
}
+; FIXME: Remove m0 init and waitcnt between reads
+; FIXME: Is there a cost to using the extload over not?
+; GCN-LABEL: {{^}}load_local_v2i16_split:
+; GCN: s_waitcnt
+; GFX9-NEXT: s_mov_b32 m0, -1
+; GFX9-NEXT: ds_read_u16 v1, v0
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: ds_read_u16_d16_hi v1, v0 offset:2
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: v_mov_b32_e32 v0, v1
+; GFX9-NEXT: s_setpc_b64
+define <2 x i16> @load_local_v2i16_split(i16 addrspace(3)* %in) #0 {
+entry:
+ %gep = getelementptr inbounds i16, i16 addrspace(3)* %in, i32 1
+ %load0 = load volatile i16, i16 addrspace(3)* %in
+ %load1 = load volatile i16, i16 addrspace(3)* %gep
+ %build0 = insertelement <2 x i16> undef, i16 %load0, i32 0
+ %build1 = insertelement <2 x i16> %build0, i16 %load1, i32 1
+ ret <2 x i16> %build1
+}
+
+; FIXME: Remove waitcnt between reads
+; GCN-LABEL: {{^}}load_global_v2i16_split:
+; GCN: s_waitcnt
+; GFX9-NEXT: global_load_ushort v2
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: global_load_short_d16_hi v2
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-NEXT: s_setpc_b64
+define <2 x i16> @load_global_v2i16_split(i16 addrspace(1)* %in) #0 {
+entry:
+ %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 1
+ %load0 = load volatile i16, i16 addrspace(1)* %in
+ %load1 = load volatile i16, i16 addrspace(1)* %gep
+ %build0 = insertelement <2 x i16> undef, i16 %load0, i32 0
+ %build1 = insertelement <2 x i16> %build0, i16 %load1, i32 1
+ ret <2 x i16> %build1
+}
+
+; FIXME: Remove waitcnt between reads
+; GCN-LABEL: {{^}}load_flat_v2i16_split:
+; GCN: s_waitcnt
+; GFX9-NEXT: flat_load_ushort v2
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: flat_load_short_d16_hi v2
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-NEXT: s_setpc_b64
+define <2 x i16> @load_flat_v2i16_split(i16 addrspace(4)* %in) #0 {
+entry:
+ %gep = getelementptr inbounds i16, i16 addrspace(4)* %in, i64 1
+ %load0 = load volatile i16, i16 addrspace(4)* %in
+ %load1 = load volatile i16, i16 addrspace(4)* %gep
+ %build0 = insertelement <2 x i16> undef, i16 %load0, i32 0
+ %build1 = insertelement <2 x i16> %build0, i16 %load1, i32 1
+ ret <2 x i16> %build1
+}
+
+; FIXME: Remove waitcnt between reads
+; GCN-LABEL: {{^}}load_constant_v2i16_split:
+; GCN: s_waitcnt
+; GFX9-NEXT: global_load_ushort v2
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: global_load_short_d16_hi v2
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-NEXT: s_setpc_b64
+define <2 x i16> @load_constant_v2i16_split(i16 addrspace(2)* %in) #0 {
+entry:
+ %gep = getelementptr inbounds i16, i16 addrspace(2)* %in, i64 1
+ %load0 = load volatile i16, i16 addrspace(2)* %in
+ %load1 = load volatile i16, i16 addrspace(2)* %gep
+ %build0 = insertelement <2 x i16> undef, i16 %load0, i32 0
+ %build1 = insertelement <2 x i16> %build0, i16 %load1, i32 1
+ ret <2 x i16> %build1
+}
+
+; FIXME: Remove m0 init and waitcnt between reads
+; FIXME: Is there a cost to using the extload over not?
+; GCN-LABEL: {{^}}load_private_v2i16_split:
+; GCN: s_waitcnt
+; GFX9-NEXT: buffer_load_ushort v1, v0, s[0:3], s4 offen{{$}}
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: buffer_load_short_d16_hi v1, v0, s[0:3], s4 offen offset:2
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: v_mov_b32_e32 v0, v1
+; GFX9-NEXT: s_setpc_b64
+define <2 x i16> @load_private_v2i16_split(i16* %in) #0 {
+entry:
+ %gep = getelementptr inbounds i16, i16* %in, i32 1
+ %load0 = load volatile i16, i16* %in
+ %load1 = load volatile i16, i16* %gep
+ %build0 = insertelement <2 x i16> undef, i16 %load0, i32 0
+ %build1 = insertelement <2 x i16> %build0, i16 %load1, i32 1
+ ret <2 x i16> %build1
+}
+
attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/load-lo16.ll b/test/CodeGen/AMDGPU/load-lo16.ll
new file mode 100644
index 00000000000..67d5fcc8a45
--- /dev/null
+++ b/test/CodeGen/AMDGPU/load-lo16.ll
@@ -0,0 +1,591 @@
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s
+
+; GCN-LABEL: {{^}}load_local_lo_v2i16_undeflo:
+; GCN: s_waitcnt
+; GFX9-NEXT: ds_read_u16_d16 v0, v0
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: s_setpc_b64
+
+; VI: ds_read_u16
+define <2 x i16> @load_local_lo_v2i16_undeflo(i16 addrspace(3)* %in) #0 {
+entry:
+ %load = load i16, i16 addrspace(3)* %in
+ %build = insertelement <2 x i16> undef, i16 %load, i32 0
+ ret <2 x i16> %build
+}
+
+; GCN-LABEL: {{^}}load_local_lo_v2i16_reglo:
+; GCN: s_waitcnt
+; GFX9-NEXT: ds_read_u16_d16 v0, v0
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: s_setpc_b64
+
+; VI: ds_read_u16
+define <2 x i16> @load_local_lo_v2i16_reglo(i16 addrspace(3)* %in, i16 %reg) #0 {
+entry:
+ %load = load i16, i16 addrspace(3)* %in
+ %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
+ %build1 = insertelement <2 x i16> %build0, i16 %load, i32 0
+ ret <2 x i16> %build1
+}
+
+; Show that we get reasonable regalloc without physreg constraints.
+; GCN-LABEL: {{^}}load_local_lo_v2i16_reglo_vreg:
+; GCN: s_waitcnt
+; GFX9-NEXT: ds_read_u16_d16 v0, v0
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: global_store_dword v[0:1], v0, off{{$}}
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: s_setpc_b64
+
+; VI: ds_read_u16
+define void @load_local_lo_v2i16_reglo_vreg(i16 addrspace(3)* %in, i16 %reg) #0 {
+entry:
+ %load = load i16, i16 addrspace(3)* %in
+ %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
+ %build1 = insertelement <2 x i16> %build0, i16 %load, i32 0
+ store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}load_local_lo_v2i16_zerolo:
+; GCN: s_waitcnt
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: ds_read_u16_d16 v1, v0
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: v_mov_b32_e32 v0, v1
+; GFX9-NEXT: s_setpc_b64
+
+; VI: ds_read_u16 v
+define <2 x i16> @load_local_lo_v2i16_zerolo(i16 addrspace(3)* %in) #0 {
+entry:
+ %load = load i16, i16 addrspace(3)* %in
+ %build = insertelement <2 x i16> zeroinitializer, i16 %load, i32 0
+ ret <2 x i16> %build
+}
+
+; GCN-LABEL: {{^}}load_local_lo_v2f16_fpimm:
+; GCN: s_waitcnt
+; GFX9-NEXT: v_mov_b32_e32 v1, 2.0
+; GFX9-NEXT: ds_read_u16_d16 v1, v0
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: v_mov_b32_e32 v0, v1
+; GFX9-NEXT: s_setpc_b64
+
+; VI: ds_read_u16 v
+define <2 x half> @load_local_lo_v2f16_fpimm(half addrspace(3)* %in) #0 {
+entry:
+ %load = load half, half addrspace(3)* %in
+ %build = insertelement <2 x half> <half 0.0, half 2.0>, half %load, i32 0
+ ret <2 x half> %build
+}
+
+; GCN-LABEL: {{^}}load_local_lo_v2f16_reghi_vreg:
+; GCN: s_waitcnt
+; GFX9-NEXT: ds_read_u16_d16 v1, v0
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: global_store_dword v[0:1], v1, off{{$}}
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: s_setpc_b64
+
+; VI: ds_read_u16 v
+define void @load_local_lo_v2f16_reghi_vreg(half addrspace(3)* %in, i32 %reg) #0 {
+entry:
+ %reg.bc = bitcast i32 %reg to <2 x half>
+ %load = load half, half addrspace(3)* %in
+ %build1 = insertelement <2 x half> %reg.bc, half %load, i32 0
+ store <2 x half> %build1, <2 x half> addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}load_local_lo_v2f16_reglo_vreg:
+
+; GFX9: ds_read_u16 v
+; GFX9: v_and_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}}
+; GFX9: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16, v{{[0-9]+}}
+; GFX9: global_store_dword
+
+; VI: ds_read_u16 v
+define void @load_local_lo_v2f16_reglo_vreg(half addrspace(3)* %in, half %reg) #0 {
+entry:
+ %load = load half, half addrspace(3)* %in
+ %build0 = insertelement <2 x half> undef, half %reg, i32 1
+ %build1 = insertelement <2 x half> %build0, half %load, i32 0
+ store <2 x half> %build1, <2 x half> addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}load_local_lo_v2i16_reghi_vreg_zexti8:
+; GCN: s_waitcnt
+; GFX9-NEXT: ds_read_u8_d16 v1, v0
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: global_store_dword v[0:1], v1, off{{$}}
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: s_setpc_b64
+
+; VI: ds_read_u8 v
+define void @load_local_lo_v2i16_reghi_vreg_zexti8(i8 addrspace(3)* %in, i32 %reg) #0 {
+entry:
+ %reg.bc = bitcast i32 %reg to <2 x i16>
+ %load = load i8, i8 addrspace(3)* %in
+ %ext = zext i8 %load to i16
+ %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
+ store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}load_local_lo_v2i16_reglo_vreg_zexti8:
+; GCN: s_waitcnt
+; GFX9: ds_read_u8 v
+; GFX9: global_store_dword
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: s_setpc_b64
+
+; VI: ds_read_u8 v
+define void @load_local_lo_v2i16_reglo_vreg_zexti8(i8 addrspace(3)* %in, i16 %reg) #0 {
+entry:
+ %load = load i8, i8 addrspace(3)* %in
+ %ext = zext i8 %load to i16
+ %build0 = insertelement <2 x i16> undef, i16 %reg, i32 1
+ %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 0
+ store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}load_local_lo_v2i16_reghi_vreg_sexti8:
+; GCN: s_waitcnt
+; GFX9-NEXT: ds_read_i8_d16 v1, v0
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: global_store_dword v[0:1], v1, off{{$}}
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: s_setpc_b64
+
+; VI: ds_read_i8 v
+define void @load_local_lo_v2i16_reghi_vreg_sexti8(i8 addrspace(3)* %in, i32 %reg) #0 {
+entry:
+ %reg.bc = bitcast i32 %reg to <2 x i16>
+ %load = load i8, i8 addrspace(3)* %in
+ %ext = sext i8 %load to i16
+ %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
+ store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}load_local_lo_v2i16_reglo_vreg_sexti8:
+; GCN: s_waitcnt
+; GFX9: ds_read_i8 v
+; GFX9: v_and_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}}
+; GFX9: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16, v{{[0-9]+}}
+
+; VI: ds_read_i8 v
+define void @load_local_lo_v2i16_reglo_vreg_sexti8(i8 addrspace(3)* %in, i16 %reg) #0 {
+entry:
+ %load = load i8, i8 addrspace(3)* %in
+ %ext = sext i8 %load to i16
+ %build0 = insertelement <2 x i16> undef, i16 %reg, i32 1
+ %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 0
+ store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}load_global_lo_v2i16_reglo_vreg:
+; GCN: s_waitcnt
+; GFX9-NEXT: global_load_short_d16 v2, v[0:1], off offset:-4094
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: global_store_dword
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: s_setpc_b64
+define void @load_global_lo_v2i16_reglo_vreg(i16 addrspace(1)* %in, i32 %reg) #0 {
+entry:
+ %reg.bc = bitcast i32 %reg to <2 x i16>
+ %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 -2047
+ %load = load i16, i16 addrspace(1)* %gep
+ %build1 = insertelement <2 x i16> %reg.bc, i16 %load, i32 0
+ store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}load_global_lo_v2f16_reglo_vreg:
+; GCN: s_waitcnt
+; GFX9-NEXT: global_load_short_d16 v2, v[0:1], off offset:-4094
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: global_store_dword
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: s_setpc_b64
+define void @load_global_lo_v2f16_reglo_vreg(half addrspace(1)* %in, i32 %reg) #0 {
+entry:
+ %reg.bc = bitcast i32 %reg to <2 x half>
+ %gep = getelementptr inbounds half, half addrspace(1)* %in, i64 -2047
+ %load = load half, half addrspace(1)* %gep
+ %build1 = insertelement <2 x half> %reg.bc, half %load, i32 0
+ store <2 x half> %build1, <2 x half> addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}load_global_lo_v2i16_reglo_vreg_zexti8:
+; GCN: s_waitcnt
+; GFX9-NEXT: global_load_ubyte_d16 v2, v[0:1], off offset:-4095
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: global_store_dword
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: s_setpc_b64
+define void @load_global_lo_v2i16_reglo_vreg_zexti8(i8 addrspace(1)* %in, i32 %reg) #0 {
+entry:
+ %reg.bc = bitcast i32 %reg to <2 x i16>
+ %gep = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095
+ %load = load i8, i8 addrspace(1)* %gep
+ %ext = zext i8 %load to i16
+ %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
+ store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}load_global_lo_v2i16_reglo_vreg_sexti8:
+; GCN: s_waitcnt
+; GFX9-NEXT: global_load_sbyte_d16 v2, v[0:1], off offset:-4095
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: global_store_dword
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: s_setpc_b64
+define void @load_global_lo_v2i16_reglo_vreg_sexti8(i8 addrspace(1)* %in, i32 %reg) #0 {
+entry:
+ %reg.bc = bitcast i32 %reg to <2 x i16>
+ %gep = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095
+ %load = load i8, i8 addrspace(1)* %gep
+ %ext = sext i8 %load to i16
+ %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
+ store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}load_flat_lo_v2i16_reghi_vreg:
+; GCN: s_waitcnt
+; GFX9-NEXT: flat_load_short_d16 v2, v[0:1]
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: global_store_dword v[0:1], v2
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: s_setpc_b64
+
+; VI: flat_load_ushort v{{[0-9]+}}
+; VI: v_or_b32_e32
+define void @load_flat_lo_v2i16_reghi_vreg(i16 addrspace(4)* %in, i32 %reg) #0 {
+entry:
+ %reg.bc = bitcast i32 %reg to <2 x i16>
+ %load = load i16, i16 addrspace(4)* %in
+ %build1 = insertelement <2 x i16> %reg.bc, i16 %load, i32 0
+ store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}load_flat_lo_v2f16_reghi_vreg:
+; GCN: s_waitcnt
+; GFX9-NEXT: flat_load_short_d16 v2, v[0:1]
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: global_store_dword v[0:1], v2
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: s_setpc_b64
+
+; VI: flat_load_ushort v{{[0-9]+}}
+; VI: v_or_b32_e32
+define void @load_flat_lo_v2f16_reghi_vreg(half addrspace(4)* %in, i32 %reg) #0 {
+entry:
+ %reg.bc = bitcast i32 %reg to <2 x half>
+ %load = load half, half addrspace(4)* %in
+ %build1 = insertelement <2 x half> %reg.bc, half %load, i32 0
+ store <2 x half> %build1, <2 x half> addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}load_flat_lo_v2i16_reglo_vreg_zexti8:
+; GCN: s_waitcnt
+; GFX9-NEXT: flat_load_ubyte_d16 v2, v[0:1]
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: global_store_dword v[0:1], v2
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: s_setpc_b64
+
+; VI: flat_load_ubyte v{{[0-9]+}}
+; VI: v_or_b32_e32
+define void @load_flat_lo_v2i16_reglo_vreg_zexti8(i8 addrspace(4)* %in, i32 %reg) #0 {
+entry:
+ %reg.bc = bitcast i32 %reg to <2 x i16>
+ %load = load i8, i8 addrspace(4)* %in
+ %ext = zext i8 %load to i16
+ %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
+ store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}load_flat_lo_v2i16_reglo_vreg_sexti8:
+; GCN: s_waitcnt
+; GFX9-NEXT: flat_load_sbyte_d16 v2, v[0:1]
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: global_store_dword v[0:1], v2
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: s_setpc_b64
+
+; VI: flat_load_sbyte v{{[0-9]+}}
+; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+
+define void @load_flat_lo_v2i16_reglo_vreg_sexti8(i8 addrspace(4)* %in, i32 %reg) #0 {
+entry:
+ %reg.bc = bitcast i32 %reg to <2 x i16>
+ %load = load i8, i8 addrspace(4)* %in
+ %ext = sext i8 %load to i16
+ %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
+ store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg:
+; GCN: s_waitcnt
+; GFX9-NEXT: buffer_load_short_d16 v1, v0, s[0:3], s4 offen offset:4094{{$}}
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: s_setpc_b64
+
+; VI: buffer_load_ushort v{{[0-9]+}}, v0, s[0:3], s4 offen offset:4094{{$}}
+define void @load_private_lo_v2i16_reglo_vreg(i16* %in, i32 %reg) #0 {
+entry:
+ %reg.bc = bitcast i32 %reg to <2 x i16>
+ %gep = getelementptr inbounds i16, i16* %in, i64 2047
+ %load = load i16, i16* %gep
+ %build1 = insertelement <2 x i16> %reg.bc, i16 %load, i32 0
+ store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}load_private_lo_v2i16_reghi_vreg:
+; GCN: s_waitcnt
+; GFX9-NEXT: buffer_load_ushort v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen offset:4094{{$}}
+; GFX9-NEXT: s_waitcnt
+; GFX9: v_and_b32
+; GFX9: v_lshl_or_b32
+
+; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: s_setpc_b64
+
+; VI: buffer_load_ushort v{{[0-9]+}}, v0, s[0:3], s4 offen offset:4094{{$}}
+define void @load_private_lo_v2i16_reghi_vreg(i16* %in, i16 %reg) #0 {
+entry:
+ %gep = getelementptr inbounds i16, i16* %in, i64 2047
+ %load = load i16, i16* %gep
+ %build0 = insertelement <2 x i16> undef, i16 %reg, i32 1
+ %build1 = insertelement <2 x i16> %build0, i16 %load, i32 0
+ store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}load_private_lo_v2f16_reglo_vreg:
+; GCN: s_waitcnt
+; GFX9-NEXT: buffer_load_short_d16 v1, v0, s[0:3], s4 offen offset:4094{{$}}
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: s_setpc_b64
+
+; VI: buffer_load_ushort v{{[0-9]+}}, v0, s[0:3], s4 offen offset:4094{{$}}
+define void @load_private_lo_v2f16_reglo_vreg(half* %in, i32 %reg) #0 {
+entry:
+ %reg.bc = bitcast i32 %reg to <2 x half>
+ %gep = getelementptr inbounds half, half* %in, i64 2047
+ %load = load half, half* %gep
+ %build1 = insertelement <2 x half> %reg.bc, half %load, i32 0
+ store <2 x half> %build1, <2 x half> addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg_nooff:
+; GCN: s_waitcnt
+; GFX9-NEXT: buffer_load_short_d16 v1, off, s[0:3], s4 offset:4094{{$}}
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: s_setpc_b64
+
+; VI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s4 offset:4094{{$}}
+define void @load_private_lo_v2i16_reglo_vreg_nooff(i16* %in, i32 %reg) #0 {
+entry:
+ %reg.bc = bitcast i32 %reg to <2 x i16>
+ %load = load volatile i16, i16* inttoptr (i32 4094 to i16*)
+ %build1 = insertelement <2 x i16> %reg.bc, i16 %load, i32 0
+ store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}load_private_lo_v2i16_reghi_vreg_nooff:
+; GCN: s_waitcnt
+; GFX9-NEXT: buffer_load_short_d16 v1, off, s[0:3], s4 offset:4094{{$}}
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: s_setpc_b64
+
+; VI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s4 offset:4094{{$}}
+define void @load_private_lo_v2i16_reghi_vreg_nooff(i16* %in, i32 %reg) #0 {
+entry:
+ %reg.bc = bitcast i32 %reg to <2 x i16>
+ %load = load volatile i16, i16* inttoptr (i32 4094 to i16*)
+ %build1 = insertelement <2 x i16> %reg.bc, i16 %load, i32 0
+ store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}load_private_lo_v2f16_reglo_vreg_nooff:
+; GCN: s_waitcnt
+; GFX9-NEXT: buffer_load_short_d16 v1, off, s[0:3], s4 offset:4094{{$}}
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: s_setpc_b64
+
+; VI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s4 offset:4094{{$}}
+define void @load_private_lo_v2f16_reglo_vreg_nooff(half* %in, i32 %reg) #0 {
+entry:
+ %reg.bc = bitcast i32 %reg to <2 x half>
+ %load = load volatile half, half* inttoptr (i32 4094 to half*)
+ %build1 = insertelement <2 x half> %reg.bc, half %load, i32 0
+ store <2 x half> %build1, <2 x half> addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg_zexti8:
+; GCN: s_waitcnt
+; GFX9-NEXT: buffer_load_ubyte_d16 v1, v0, s[0:3], s4 offen offset:2047{{$}}
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: s_setpc_b64
+
+; VI: buffer_load_ubyte v{{[0-9]+}}, v0, s[0:3], s4 offen offset:2047{{$}}
+define void @load_private_lo_v2i16_reglo_vreg_zexti8(i8* %in, i32 %reg) #0 {
+entry:
+ %reg.bc = bitcast i32 %reg to <2 x i16>
+ %gep = getelementptr inbounds i8, i8* %in, i64 2047
+ %load = load i8, i8* %gep
+ %ext = zext i8 %load to i16
+ %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
+ store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg_sexti8:
+; GCN: s_waitcnt
+; GFX9-NEXT: buffer_load_sbyte_d16 v1, v0, s[0:3], s4 offen offset:2047{{$}}
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: s_setpc_b64
+
+; VI: buffer_load_sbyte v{{[0-9]+}}, v0, s[0:3], s4 offen offset:2047{{$}}
+define void @load_private_lo_v2i16_reglo_vreg_sexti8(i8* %in, i32 %reg) #0 {
+entry:
+ %reg.bc = bitcast i32 %reg to <2 x i16>
+ %gep = getelementptr inbounds i8, i8* %in, i64 2047
+ %load = load i8, i8* %gep
+ %ext = sext i8 %load to i16
+ %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
+ store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg_nooff_zexti8:
+; GCN: s_waitcnt
+; GFX9-NEXT: buffer_load_ubyte_d16 v1, off, s[0:3], s4 offset:4094{{$}}
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: s_setpc_b64
+
+; VI: buffer_load_ubyte v0, off, s[0:3], s4 offset:4094{{$}}
+define void @load_private_lo_v2i16_reglo_vreg_nooff_zexti8(i8* %in, i32 %reg) #0 {
+entry:
+ %reg.bc = bitcast i32 %reg to <2 x i16>
+ %load = load volatile i8, i8* inttoptr (i32 4094 to i8*)
+ %ext = zext i8 %load to i16
+ %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
+ store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg_nooff_sexti8:
+; GCN: s_waitcnt
+; GFX9-NEXT: buffer_load_sbyte_d16 v1, off, s[0:3], s4 offset:4094{{$}}
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: s_setpc_b64
+
+; VI: buffer_load_sbyte v0, off, s[0:3], s4 offset:4094{{$}}
+define void @load_private_lo_v2i16_reglo_vreg_nooff_sexti8(i8* %in, i32 %reg) #0 {
+entry:
+ %reg.bc = bitcast i32 %reg to <2 x i16>
+ %load = load volatile i8, i8* inttoptr (i32 4094 to i8*)
+ %ext = sext i8 %load to i16
+ %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
+ store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}load_private_lo_v2f16_reglo_vreg_nooff_zexti8:
+; GCN: s_waitcnt
+; GFX9-NEXT: buffer_load_ubyte_d16 v1, off, s[0:3], s4 offset:4094{{$}}
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: s_setpc_b64
+
+; VI: buffer_load_ubyte v0, off, s[0:3], s4 offset:4094{{$}}
+define void @load_private_lo_v2f16_reglo_vreg_nooff_zexti8(i8* %in, i32 %reg) #0 {
+entry:
+ %reg.bc = bitcast i32 %reg to <2 x half>
+ %load = load volatile i8, i8* inttoptr (i32 4094 to i8*)
+ %ext = zext i8 %load to i16
+ %bc.ext = bitcast i16 %ext to half
+ %build1 = insertelement <2 x half> %reg.bc, half %bc.ext, i32 0
+ store <2 x half> %build1, <2 x half> addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}load_constant_lo_v2i16_reglo_vreg:
+; GCN: s_waitcnt
+; GFX9-NEXT: global_load_short_d16 v2, v[0:1], off offset:-4094
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: global_store_dword
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: s_setpc_b64
+
+; VI: flat_load_ushort
+define void @load_constant_lo_v2i16_reglo_vreg(i16 addrspace(2)* %in, i32 %reg) #0 {
+entry:
+ %reg.bc = bitcast i32 %reg to <2 x i16>
+ %gep = getelementptr inbounds i16, i16 addrspace(2)* %in, i64 -2047
+ %load = load i16, i16 addrspace(2)* %gep
+ %build1 = insertelement <2 x i16> %reg.bc, i16 %load, i32 0
+ store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: load_constant_lo_v2f16_reglo_vreg
+; GCN: s_waitcnt
+; GFX9-NEXT: global_load_short_d16 v2, v[0:1], off offset:-4094
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: global_store_dword
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: s_setpc_b64
+
+; VI: flat_load_ushort
+define void @load_constant_lo_v2f16_reglo_vreg(half addrspace(2)* %in, i32 %reg) #0 {
+entry:
+ %reg.bc = bitcast i32 %reg to <2 x half>
+ %gep = getelementptr inbounds half, half addrspace(2)* %in, i64 -2047
+ %load = load half, half addrspace(2)* %gep
+ %build1 = insertelement <2 x half> %reg.bc, half %load, i32 0
+ store <2 x half> %build1, <2 x half> addrspace(1)* undef
+ ret void
+}
+
+attributes #0 = { nounwind }