author     Matt Arsenault <Matthew.Arsenault@amd.com>  2017-11-15 21:51:43 +0000
committer  Matt Arsenault <Matthew.Arsenault@amd.com>  2017-11-15 21:51:43 +0000
commit     710e9b3dae457f330793546a29ed113ed2dde23d
tree       fa36ef5aa82fb3664e1786a6836d50672aae4430 /test/CodeGen/AMDGPU
parent     9b78b26b27b8a48a161a215105318f45ae7a91f2
AMDGPU: Replace i64 add/sub lowering
Use VOP3 add/addc like usual.

This has some tradeoffs. Inline immediates fold a little better, but
other constants are worse off. SIShrinkInstructions could be made
smarter to handle these cases.

This allows us to avoid selecting scalar adds where we need to track
the carry in scc and replace its users. This makes it easier to use
the carryless VALU adds.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@318340 91177308-0d34-0410-b5e6-96231b3b80d8
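For illustration only (a sketch, not part of this commit): a minimal i64 add
kernel in the style of the tests below. Per the checks updated in
split-scalar-i64-add.ll, the expected SI lowering is the carry-chained VALU
pair v_add_i32_e32 (writing vcc) followed by v_addc_u32_e32 (consuming vcc),
rather than a scalar add whose carry lands in scc. The kernel name
@example_v_add_i64 is hypothetical.

; Hypothetical example, not from this commit; mirrors the style of the
; tests below. On SI the 64-bit add is expected to select the VALU pair:
;   v_add_i32_e32  v{{[0-9]+}}, vcc, ...
;   v_addc_u32_e32 v{{[0-9]+}}, vcc, ..., ..., vcc
define amdgpu_kernel void @example_v_add_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
  %b_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1
  %a = load volatile i64, i64 addrspace(1)* %in
  %b = load volatile i64, i64 addrspace(1)* %b_ptr
  %result = add i64 %a, %b
  store i64 %result, i64 addrspace(1)* %out
  ret void
}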
Diffstat (limited to 'test/CodeGen/AMDGPU')
-rw-r--r--  test/CodeGen/AMDGPU/add.v2i16.ll                   |  4
-rw-r--r--  test/CodeGen/AMDGPU/clamp.ll                       |  3
-rw-r--r--  test/CodeGen/AMDGPU/ctpop.ll                       |  6
-rw-r--r--  test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll     | 22
-rw-r--r--  test/CodeGen/AMDGPU/lshr.v2i16.ll                  |  2
-rw-r--r--  test/CodeGen/AMDGPU/mad_64_32.ll                   |  3
-rw-r--r--  test/CodeGen/AMDGPU/mul.ll                         | 33
-rw-r--r--  test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll |  2
-rw-r--r--  test/CodeGen/AMDGPU/split-scalar-i64-add.ll        |  5
-rw-r--r--  test/CodeGen/AMDGPU/sub.i16.ll                     |  2
-rw-r--r--  test/CodeGen/AMDGPU/sub.ll                         | 14
-rw-r--r--  test/CodeGen/AMDGPU/sub.v2i16.ll                   |  5
12 files changed, 53 insertions(+), 48 deletions(-)
diff --git a/test/CodeGen/AMDGPU/add.v2i16.ll b/test/CodeGen/AMDGPU/add.v2i16.ll
index f999db3f4e6..a2f647cefd1 100644
--- a/test/CodeGen/AMDGPU/add.v2i16.ll
+++ b/test/CodeGen/AMDGPU/add.v2i16.ll
@@ -163,10 +163,10 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_fp_split(<2 x i16> addrspace(
; GFX9-DAG: v_lshrrev_b32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]]
; GFX9: buffer_store_dwordx2 v{{\[}}[[ELT0]]:[[ELT1]]{{\]}}
-; VI: flat_load_ushort v[[A_HI:[0-9]+]]
; VI: flat_load_ushort v[[A_LO:[0-9]+]]
-; VI: flat_load_ushort v[[B_HI:[0-9]+]]
+; VI: flat_load_ushort v[[A_HI:[0-9]+]]
; VI: flat_load_ushort v[[B_LO:[0-9]+]]
+; VI: flat_load_ushort v[[B_HI:[0-9]+]]
; VI: v_add_u16_e32 v[[ADD_HI:[0-9]+]], v[[A_HI]], v[[B_HI]]
; VI-NOT: and
diff --git a/test/CodeGen/AMDGPU/clamp.ll b/test/CodeGen/AMDGPU/clamp.ll
index 216ecf76345..683235764ca 100644
--- a/test/CodeGen/AMDGPU/clamp.ll
+++ b/test/CodeGen/AMDGPU/clamp.ll
@@ -398,7 +398,8 @@ define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp(float addrspace(1)* %ou
; GCN-LABEL: {{^}}v_clamp_f32_snan_no_dx10clamp_nnan_src:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
-; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], 0, 1.0
+; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, [[A]]
+; GCN: v_med3_f32 v{{[0-9]+}}, [[ADD]], 0, 1.0
define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp_nnan_src(float addrspace(1)* %out, float addrspace(1)* %aptr) #4 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
diff --git a/test/CodeGen/AMDGPU/ctpop.ll b/test/CodeGen/AMDGPU/ctpop.ll
index 5ea39032fcf..2d16d4034d3 100644
--- a/test/CodeGen/AMDGPU/ctpop.ll
+++ b/test/CodeGen/AMDGPU/ctpop.ll
@@ -44,8 +44,8 @@ define amdgpu_kernel void @v_ctpop_i32(i32 addrspace(1)* noalias %out, i32 addrs
; FUNC-LABEL: {{^}}v_ctpop_add_chain_i32:
; SI: buffer_load_dword [[VAL0:v[0-9]+]],
; SI: buffer_load_dword [[VAL1:v[0-9]+]],
-; VI: flat_load_dword [[VAL1:v[0-9]+]],
; VI: flat_load_dword [[VAL0:v[0-9]+]],
+; VI: flat_load_dword [[VAL1:v[0-9]+]],
; GCN: v_bcnt_u32_b32{{(_e64)*}} [[MIDRESULT:v[0-9]+]], [[VAL1]], 0
; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL0]], [[MIDRESULT]]
; VI: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], [[VAL0]], [[MIDRESULT]]
@@ -58,8 +58,8 @@ define amdgpu_kernel void @v_ctpop_add_chain_i32(i32 addrspace(1)* noalias %out,
%tid = call i32 @llvm.r600.read.tidig.x()
%in0.gep = getelementptr i32, i32 addrspace(1)* %in0, i32 %tid
%in1.gep = getelementptr i32, i32 addrspace(1)* %in1, i32 %tid
- %val0 = load i32, i32 addrspace(1)* %in0.gep, align 4
- %val1 = load i32, i32 addrspace(1)* %in1.gep, align 4
+ %val0 = load volatile i32, i32 addrspace(1)* %in0.gep, align 4
+ %val1 = load volatile i32, i32 addrspace(1)* %in1.gep, align 4
%ctpop0 = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone
%ctpop1 = call i32 @llvm.ctpop.i32(i32 %val1) nounwind readnone
%add = add i32 %ctpop0, %ctpop1
diff --git a/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
index adfa43fe018..404ad606835 100644
--- a/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
+++ b/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
@@ -261,13 +261,15 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_inlineimm(<2 x i16> addrspace
; GCN-LABEL: {{^}}v_insertelement_v2i16_1:
; VI: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e70000
; GCN-DAG: {{flat|global}}_load_dword [[VEC:v[0-9]+]]
-; CI: v_or_b32_e32 [[RES:v[0-9]+]], 0x3e70000, [[VEC]]
-; VI: v_or_b32_sdwa [[RES:v[0-9]+]], [[VEC]], [[K]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-DAG: s_movk_i32 [[K:s[0-9]+]], 0x3e7
; GFX9-DAG: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VEC]]
; GFX9: v_lshl_or_b32 [[RES:v[0-9]+]], [[K]], 16, [[ELT0]]
+; CI: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff, [[VEC]]
+; CI: v_or_b32_e32 [[RES:v[0-9]+]], 0x3e70000, [[AND]]
+; VI: v_or_b32_sdwa [[RES:v[0-9]+]], [[VEC]], [[K]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+
; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]]
define amdgpu_kernel void @v_insertelement_v2i16_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
@@ -345,13 +347,16 @@ define amdgpu_kernel void @v_insertelement_v2f16_0_inlineimm(<2 x half> addrspac
; GCN-LABEL: {{^}}v_insertelement_v2f16_1:
; VI: v_mov_b32_e32 [[K:v[0-9]+]], 0x45000000
; GCN-DAG: {{flat|global}}_load_dword [[VEC:v[0-9]+]]
-; CI: v_or_b32_e32 [[RES:v[0-9]+]], 0x45000000, [[VEC]]
-; VI: v_or_b32_sdwa [[RES:v[0-9]+]], [[VEC]], [[K]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-DAG: s_movk_i32 [[K:s[0-9]+]], 0x4500
; GFX9-DAG: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VEC]]
; GFX9: v_lshl_or_b32 [[RES:v[0-9]+]], [[K]], 16, [[ELT0]]
+; CI: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff, [[VEC]]
+; CI: v_or_b32_e32 [[RES:v[0-9]+]], 0x45000000, [[AND]]
+
+; VI: v_or_b32_sdwa [[RES:v[0-9]+]], [[VEC]], [[K]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+
; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]]
define amdgpu_kernel void @v_insertelement_v2f16_1(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
@@ -423,11 +428,9 @@ define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(<2 x i16> addrspac
; GCN-LABEL: {{^}}v_insertelement_v2i16_dynamic_vgpr:
; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}}
+; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7
; GCN: {{flat|global}}_load_dword [[IDX:v[0-9]+]]
; GCN: {{flat|global}}_load_dword [[VEC:v[0-9]+]]
-; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7
-
-; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7
; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 16, [[IDX]]
; GFX89-DAG: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]]
@@ -452,11 +455,10 @@ define amdgpu_kernel void @v_insertelement_v2i16_dynamic_vgpr(<2 x i16> addrspac
; GCN-LABEL: {{^}}v_insertelement_v2f16_dynamic_vgpr:
; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}}
+; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x1234
+
; GCN: {{flat|global}}_load_dword [[IDX:v[0-9]+]]
; GCN: {{flat|global}}_load_dword [[VEC:v[0-9]+]]
-; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x1234
-
-; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x1234
; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 16, [[IDX]]
; GFX89-DAG: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]]
diff --git a/test/CodeGen/AMDGPU/lshr.v2i16.ll b/test/CodeGen/AMDGPU/lshr.v2i16.ll
index 54969768f88..72aac2322a4 100644
--- a/test/CodeGen/AMDGPU/lshr.v2i16.ll
+++ b/test/CodeGen/AMDGPU/lshr.v2i16.ll
@@ -8,7 +8,7 @@
; GFX9: v_mov_b32_e32 [[VLHS:v[0-9]+]], [[LHS]]
; GFX9: v_pk_lshrrev_b16 [[RESULT:v[0-9]+]], [[RHS]], [[VLHS]]
-; VI: v_lshrrev_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-DAG: v_lshrrev_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; CI: v_lshrrev_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; CI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
; CIVI-DAG: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 16
diff --git a/test/CodeGen/AMDGPU/mad_64_32.ll b/test/CodeGen/AMDGPU/mad_64_32.ll
index b4d9d928101..91a53eed7d7 100644
--- a/test/CodeGen/AMDGPU/mad_64_32.ll
+++ b/test/CodeGen/AMDGPU/mad_64_32.ll
@@ -69,8 +69,9 @@ define i64 @mad_u64_u32_zextops_commute(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
; GCN-LABEL: {{^}}mad_i64_i32_sextops_i32_i128:
; CI: v_mad_u64_u32
; CI: v_mad_u64_u32
-; CI: v_mad_u64_u32
; CI: v_mad_i64_i32
+; CI: v_mad_u64_u32
+
; SI-NOT: v_mad_
define i128 @mad_i64_i32_sextops_i32_i128(i32 %arg0, i32 %arg1, i128 %arg2) #0 {
diff --git a/test/CodeGen/AMDGPU/mul.ll b/test/CodeGen/AMDGPU/mul.ll
index 555c65a6ffe..b35eefaaa07 100644
--- a/test/CodeGen/AMDGPU/mul.ll
+++ b/test/CodeGen/AMDGPU/mul.ll
@@ -211,6 +211,7 @@ endif:
; SI: s_mul_i32
; SI: v_mul_hi_u32
; SI: s_mul_i32
+; SI: s_mul_i32
; SI-DAG: s_mul_i32
; SI-DAG: v_mul_hi_u32
@@ -219,22 +220,16 @@ endif:
; SI-DAG: s_mul_i32
; SI-DAG: v_mul_hi_u32
-; SI: s_mul_i32
-; SI: s_mul_i32
-; SI: s_mul_i32
-; SI: s_mul_i32
-; SI: s_mul_i32
-
-
; VI: s_mul_i32
-; VI: v_mul_hi_u32
; VI: v_mad_u64_u32
; VI: s_mul_i32
; VI: v_mul_hi_u32
; VI: v_mad_u64_u32
+; VI: v_mul_hi_u32
; VI: v_mad_u64_u32
+
; GCN: buffer_store_dwordx4
define amdgpu_kernel void @s_mul_i128(i128 addrspace(1)* %out, i128 %a, i128 %b) nounwind #0 {
%mul = mul i128 %a, %b
@@ -246,15 +241,15 @@ define amdgpu_kernel void @s_mul_i128(i128 addrspace(1)* %out, i128 %a, i128 %b)
; GCN: {{buffer|flat}}_load_dwordx4
; GCN: {{buffer|flat}}_load_dwordx4
-; GCN-DAG: v_mul_lo_i32
-; GCN-DAG: v_mul_hi_u32
-; GCN-DAG: v_mul_hi_u32
-; GCN-DAG: v_mul_lo_i32
-; GCN-DAG: v_mul_hi_u32
-; GCN-DAG: v_mul_hi_u32
-; GCN-DAG: v_mul_lo_i32
-; GCN-DAG: v_mul_lo_i32
-; GCN-DAG: v_add_i32_e32
+; SI-DAG: v_mul_lo_i32
+; SI-DAG: v_mul_hi_u32
+; SI-DAG: v_mul_hi_u32
+; SI-DAG: v_mul_lo_i32
+; SI-DAG: v_mul_hi_u32
+; SI-DAG: v_mul_hi_u32
+; SI-DAG: v_mul_lo_i32
+; SI-DAG: v_mul_lo_i32
+; SI-DAG: v_add_i32_e32
; SI-DAG: v_mul_hi_u32
; SI-DAG: v_mul_lo_i32
@@ -265,7 +260,9 @@ define amdgpu_kernel void @s_mul_i128(i128 addrspace(1)* %out, i128 %a, i128 %b)
; SI-DAG: v_mul_lo_i32
; SI-DAG: v_mul_lo_i32
-; VI-DAG: v_mad_u64_u32
+; VI-DAG: v_mul_lo_i32
+; VI-DAG: v_mul_hi_u32
+; VI: v_mad_u64_u32
; VI: v_mad_u64_u32
; VI: v_mad_u64_u32
diff --git a/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll b/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll
index cae713be2ea..2072cf5e887 100644
--- a/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll
+++ b/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll
@@ -7,7 +7,7 @@
; SI: NumVgprs: {{[1-9]$}}
; stores may alias loads
-; VI: NumSgprs: {{[1-5][0-9]$}}
+; VI: NumSgprs: {{[0-9]$}}
; VI: NumVgprs: {{[1-3][0-9]$}}
define amdgpu_kernel void @load_fma_store(float addrspace(3)* nocapture readonly %in_arg, float addrspace(1)* nocapture %out_arg) {
diff --git a/test/CodeGen/AMDGPU/split-scalar-i64-add.ll b/test/CodeGen/AMDGPU/split-scalar-i64-add.ll
index 5d7d29db3a2..59ddd6177d7 100644
--- a/test/CodeGen/AMDGPU/split-scalar-i64-add.ll
+++ b/test/CodeGen/AMDGPU/split-scalar-i64-add.ll
@@ -7,8 +7,11 @@ declare i32 @llvm.amdgcn.workitem.id.x() readnone
; set in vcc, which is undefined since the low scalar half add sets
; scc instead.
+; FIXME: SIShrinkInstructions should force immediate fold.
+
; FUNC-LABEL: {{^}}imp_def_vcc_split_i64_add_0:
-; SI: v_add_i32_e32 v{{[0-9]+}}, vcc, 0x18f, v{{[0-9]+}}
+; SI: s_movk_i32 [[K:s[0-9]+]], 0x18f
+; SI: v_add_i32_e32 v{{[0-9]+}}, vcc, [[K]], v{{[0-9]+}}
; SI: v_addc_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc
define amdgpu_kernel void @imp_def_vcc_split_i64_add_0(i64 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %s.val) {
%v.val = load volatile i32, i32 addrspace(1)* %in
diff --git a/test/CodeGen/AMDGPU/sub.i16.ll b/test/CodeGen/AMDGPU/sub.i16.ll
index fd70f2b6108..3ad091a91d6 100644
--- a/test/CodeGen/AMDGPU/sub.i16.ll
+++ b/test/CodeGen/AMDGPU/sub.i16.ll
@@ -85,9 +85,9 @@ define amdgpu_kernel void @v_test_sub_i16_zext_to_i32(i32 addrspace(1)* %out, i1
; FIXME: Need to handle non-uniform case for function below (load without gep).
; GCN-LABEL: {{^}}v_test_sub_i16_zext_to_i64:
-; VI: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0
; VI: flat_load_ushort [[A:v[0-9]+]]
; VI: flat_load_ushort [[B:v[0-9]+]]
+; VI: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0
; VI-DAG: v_sub_u16_e32 v[[ADD:[0-9]+]], [[A]], [[B]]
; VI: buffer_store_dwordx2 v{{\[}}[[ADD]]:[[VZERO]]{{\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0{{$}}
define amdgpu_kernel void @v_test_sub_i16_zext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
diff --git a/test/CodeGen/AMDGPU/sub.ll b/test/CodeGen/AMDGPU/sub.ll
index 46f1b120f21..4c573acdbab 100644
--- a/test/CodeGen/AMDGPU/sub.ll
+++ b/test/CodeGen/AMDGPU/sub.ll
@@ -57,7 +57,7 @@ define amdgpu_kernel void @test_sub_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32
; VI: v_sub_i16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
define amdgpu_kernel void @test_sub_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
%b_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1
- %a = load i16, i16 addrspace(1)* %in
+ %a = load i16, i16 addrspace(1)* %in
%b = load i16, i16 addrspace(1)* %b_ptr
%result = sub i16 %a, %b
store i16 %result, i16 addrspace(1)* %out
@@ -71,7 +71,7 @@ define amdgpu_kernel void @test_sub_i16(i16 addrspace(1)* %out, i16 addrspace(1)
define amdgpu_kernel void @test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
%b_ptr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %in, i16 1
- %a = load <2 x i16>, <2 x i16> addrspace(1) * %in
+ %a = load <2 x i16>, <2 x i16> addrspace(1) * %in
%b = load <2 x i16>, <2 x i16> addrspace(1) * %b_ptr
%result = sub <2 x i16> %a, %b
store <2 x i16> %result, <2 x i16> addrspace(1)* %out
@@ -87,7 +87,7 @@ define amdgpu_kernel void @test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16
define amdgpu_kernel void @test_sub_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
%b_ptr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %in, i16 1
- %a = load <4 x i16>, <4 x i16> addrspace(1) * %in
+ %a = load <4 x i16>, <4 x i16> addrspace(1) * %in
%b = load <4 x i16>, <4 x i16> addrspace(1) * %b_ptr
%result = sub <4 x i16> %a, %b
store <4 x i16> %result, <4 x i16> addrspace(1)* %out
@@ -146,13 +146,13 @@ define amdgpu_kernel void @v_test_sub_v2i64(<2 x i64> addrspace(1)* %out, <2 x i
}
; FUNC-LABEL: {{^}}v_test_sub_v4i64:
-; SI: v_subrev_i32_e32
+; SI: v_sub_i32_e32
; SI: v_subb_u32_e32
-; SI: v_subrev_i32_e32
+; SI: v_sub_i32_e32
; SI: v_subb_u32_e32
-; SI: v_subrev_i32_e32
+; SI: v_sub_i32_e32
; SI: v_subb_u32_e32
-; SI: v_subrev_i32_e32
+; SI: v_sub_i32_e32
; SI: v_subb_u32_e32
define amdgpu_kernel void @v_test_sub_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* noalias %inA, <4 x i64> addrspace(1)* noalias %inB) {
%tid = call i32 @llvm.r600.read.tidig.x() readnone
diff --git a/test/CodeGen/AMDGPU/sub.v2i16.ll b/test/CodeGen/AMDGPU/sub.v2i16.ll
index 6e76575e3be..620f38e5fba 100644
--- a/test/CodeGen/AMDGPU/sub.v2i16.ll
+++ b/test/CodeGen/AMDGPU/sub.v2i16.ll
@@ -160,10 +160,11 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(<2 x i16> addrspace(
; GFX9-DAG: v_lshrrev_b32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]]
; GFX9: buffer_store_dwordx2 v{{\[}}[[ELT0]]:[[ELT1]]{{\]}}
-; VI: flat_load_ushort v[[A_HI:[0-9]+]]
; VI: flat_load_ushort v[[A_LO:[0-9]+]]
-; VI: flat_load_ushort v[[B_HI:[0-9]+]]
+; VI: flat_load_ushort v[[A_HI:[0-9]+]]
+
; VI: flat_load_ushort v[[B_LO:[0-9]+]]
+; VI: flat_load_ushort v[[B_HI:[0-9]+]]
; VI: v_sub_u16_e32 v[[ADD_HI:[0-9]+]], v[[A_HI]], v[[B_HI]]
; VI-NOT: and