diff options
author | Matt Arsenault <Matthew.Arsenault@amd.com> | 2017-11-15 21:51:43 +0000 |
---|---|---|
committer | Matt Arsenault <Matthew.Arsenault@amd.com> | 2017-11-15 21:51:43 +0000 |
commit | 710e9b3dae457f330793546a29ed113ed2dde23d (patch) | |
tree | fa36ef5aa82fb3664e1786a6836d50672aae4430 /test/CodeGen/AMDGPU | |
parent | 9b78b26b27b8a48a161a215105318f45ae7a91f2 (diff) |
AMDGPU: Replace i64 add/sub lowering

Use VOP3 add/addc like usual.

This has some tradeoffs. Inline immediates fold a little better, but other constants are worse off. SIShrinkInstructions could be made smarter to handle these cases.

This allows us to avoid selecting scalar adds where we need to track the carry in scc and replace its users. This makes it easier to use the carryless VALU adds.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@318340 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'test/CodeGen/AMDGPU')
-rw-r--r-- | test/CodeGen/AMDGPU/add.v2i16.ll | 4 | ||||
-rw-r--r-- | test/CodeGen/AMDGPU/clamp.ll | 3 | ||||
-rw-r--r-- | test/CodeGen/AMDGPU/ctpop.ll | 6 | ||||
-rw-r--r-- | test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll | 22 | ||||
-rw-r--r-- | test/CodeGen/AMDGPU/lshr.v2i16.ll | 2 | ||||
-rw-r--r-- | test/CodeGen/AMDGPU/mad_64_32.ll | 3 | ||||
-rw-r--r-- | test/CodeGen/AMDGPU/mul.ll | 33 | ||||
-rw-r--r-- | test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll | 2 | ||||
-rw-r--r-- | test/CodeGen/AMDGPU/split-scalar-i64-add.ll | 5 | ||||
-rw-r--r-- | test/CodeGen/AMDGPU/sub.i16.ll | 2 | ||||
-rw-r--r-- | test/CodeGen/AMDGPU/sub.ll | 14 | ||||
-rw-r--r-- | test/CodeGen/AMDGPU/sub.v2i16.ll | 5 |
12 files changed, 53 insertions, 48 deletions
diff --git a/test/CodeGen/AMDGPU/add.v2i16.ll b/test/CodeGen/AMDGPU/add.v2i16.ll index f999db3f4e6..a2f647cefd1 100644 --- a/test/CodeGen/AMDGPU/add.v2i16.ll +++ b/test/CodeGen/AMDGPU/add.v2i16.ll @@ -163,10 +163,10 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_fp_split(<2 x i16> addrspace( ; GFX9-DAG: v_lshrrev_b32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]] ; GFX9: buffer_store_dwordx2 v{{\[}}[[ELT0]]:[[ELT1]]{{\]}} -; VI: flat_load_ushort v[[A_HI:[0-9]+]] ; VI: flat_load_ushort v[[A_LO:[0-9]+]] -; VI: flat_load_ushort v[[B_HI:[0-9]+]] +; VI: flat_load_ushort v[[A_HI:[0-9]+]] ; VI: flat_load_ushort v[[B_LO:[0-9]+]] +; VI: flat_load_ushort v[[B_HI:[0-9]+]] ; VI: v_add_u16_e32 v[[ADD_HI:[0-9]+]], v[[A_HI]], v[[B_HI]] ; VI-NOT: and diff --git a/test/CodeGen/AMDGPU/clamp.ll b/test/CodeGen/AMDGPU/clamp.ll index 216ecf76345..683235764ca 100644 --- a/test/CodeGen/AMDGPU/clamp.ll +++ b/test/CodeGen/AMDGPU/clamp.ll @@ -398,7 +398,8 @@ define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp(float addrspace(1)* %ou ; GCN-LABEL: {{^}}v_clamp_f32_snan_no_dx10clamp_nnan_src: ; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] -; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], 0, 1.0 +; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, [[A]] +; GCN: v_med3_f32 v{{[0-9]+}}, [[ADD]], 0, 1.0 define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp_nnan_src(float addrspace(1)* %out, float addrspace(1)* %aptr) #4 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid diff --git a/test/CodeGen/AMDGPU/ctpop.ll b/test/CodeGen/AMDGPU/ctpop.ll index 5ea39032fcf..2d16d4034d3 100644 --- a/test/CodeGen/AMDGPU/ctpop.ll +++ b/test/CodeGen/AMDGPU/ctpop.ll @@ -44,8 +44,8 @@ define amdgpu_kernel void @v_ctpop_i32(i32 addrspace(1)* noalias %out, i32 addrs ; FUNC-LABEL: {{^}}v_ctpop_add_chain_i32: ; SI: buffer_load_dword [[VAL0:v[0-9]+]], ; SI: buffer_load_dword [[VAL1:v[0-9]+]], -; VI: flat_load_dword [[VAL1:v[0-9]+]], ; VI: flat_load_dword [[VAL0:v[0-9]+]], 
+; VI: flat_load_dword [[VAL1:v[0-9]+]], ; GCN: v_bcnt_u32_b32{{(_e64)*}} [[MIDRESULT:v[0-9]+]], [[VAL1]], 0 ; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL0]], [[MIDRESULT]] ; VI: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], [[VAL0]], [[MIDRESULT]] @@ -58,8 +58,8 @@ define amdgpu_kernel void @v_ctpop_add_chain_i32(i32 addrspace(1)* noalias %out, %tid = call i32 @llvm.r600.read.tidig.x() %in0.gep = getelementptr i32, i32 addrspace(1)* %in0, i32 %tid %in1.gep = getelementptr i32, i32 addrspace(1)* %in1, i32 %tid - %val0 = load i32, i32 addrspace(1)* %in0.gep, align 4 - %val1 = load i32, i32 addrspace(1)* %in1.gep, align 4 + %val0 = load volatile i32, i32 addrspace(1)* %in0.gep, align 4 + %val1 = load volatile i32, i32 addrspace(1)* %in1.gep, align 4 %ctpop0 = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone %ctpop1 = call i32 @llvm.ctpop.i32(i32 %val1) nounwind readnone %add = add i32 %ctpop0, %ctpop1 diff --git a/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll index adfa43fe018..404ad606835 100644 --- a/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ b/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -261,13 +261,15 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_inlineimm(<2 x i16> addrspace ; GCN-LABEL: {{^}}v_insertelement_v2i16_1: ; VI: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e70000 ; GCN-DAG: {{flat|global}}_load_dword [[VEC:v[0-9]+]] -; CI: v_or_b32_e32 [[RES:v[0-9]+]], 0x3e70000, [[VEC]] -; VI: v_or_b32_sdwa [[RES:v[0-9]+]], [[VEC]], [[K]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-DAG: s_movk_i32 [[K:s[0-9]+]], 0x3e7 ; GFX9-DAG: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VEC]] ; GFX9: v_lshl_or_b32 [[RES:v[0-9]+]], [[K]], 16, [[ELT0]] +; CI: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff, [[VEC]] +; CI: v_or_b32_e32 [[RES:v[0-9]+]], 0x3e70000, [[AND]] +; VI: v_or_b32_sdwa [[RES:v[0-9]+]], [[VEC]], [[K]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD + ; GCN: 
{{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]] define amdgpu_kernel void @v_insertelement_v2i16_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -345,13 +347,16 @@ define amdgpu_kernel void @v_insertelement_v2f16_0_inlineimm(<2 x half> addrspac ; GCN-LABEL: {{^}}v_insertelement_v2f16_1: ; VI: v_mov_b32_e32 [[K:v[0-9]+]], 0x45000000 ; GCN-DAG: {{flat|global}}_load_dword [[VEC:v[0-9]+]] -; CI: v_or_b32_e32 [[RES:v[0-9]+]], 0x45000000, [[VEC]] -; VI: v_or_b32_sdwa [[RES:v[0-9]+]], [[VEC]], [[K]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-DAG: s_movk_i32 [[K:s[0-9]+]], 0x4500 ; GFX9-DAG: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VEC]] ; GFX9: v_lshl_or_b32 [[RES:v[0-9]+]], [[K]], 16, [[ELT0]] +; CI: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff, [[VEC]] +; CI: v_or_b32_e32 [[RES:v[0-9]+]], 0x45000000, [[AND]] + +; VI: v_or_b32_sdwa [[RES:v[0-9]+]], [[VEC]], [[K]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD + ; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]] define amdgpu_kernel void @v_insertelement_v2f16_1(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -423,11 +428,9 @@ define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(<2 x i16> addrspac ; GCN-LABEL: {{^}}v_insertelement_v2i16_dynamic_vgpr: ; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}} +; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7 ; GCN: {{flat|global}}_load_dword [[IDX:v[0-9]+]] ; GCN: {{flat|global}}_load_dword [[VEC:v[0-9]+]] -; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7 - -; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7 ; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 16, [[IDX]] ; GFX89-DAG: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]] @@ -452,11 +455,10 @@ define amdgpu_kernel void @v_insertelement_v2i16_dynamic_vgpr(<2 x i16> addrspac ; GCN-LABEL: 
{{^}}v_insertelement_v2f16_dynamic_vgpr: ; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}} +; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x1234 + ; GCN: {{flat|global}}_load_dword [[IDX:v[0-9]+]] ; GCN: {{flat|global}}_load_dword [[VEC:v[0-9]+]] -; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x1234 - -; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x1234 ; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 16, [[IDX]] ; GFX89-DAG: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]] diff --git a/test/CodeGen/AMDGPU/lshr.v2i16.ll b/test/CodeGen/AMDGPU/lshr.v2i16.ll index 54969768f88..72aac2322a4 100644 --- a/test/CodeGen/AMDGPU/lshr.v2i16.ll +++ b/test/CodeGen/AMDGPU/lshr.v2i16.ll @@ -8,7 +8,7 @@ ; GFX9: v_mov_b32_e32 [[VLHS:v[0-9]+]], [[LHS]] ; GFX9: v_pk_lshrrev_b16 [[RESULT:v[0-9]+]], [[RHS]], [[VLHS]] -; VI: v_lshrrev_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-DAG: v_lshrrev_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; CI: v_lshrrev_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; CI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} ; CIVI-DAG: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 16 diff --git a/test/CodeGen/AMDGPU/mad_64_32.ll b/test/CodeGen/AMDGPU/mad_64_32.ll index b4d9d928101..91a53eed7d7 100644 --- a/test/CodeGen/AMDGPU/mad_64_32.ll +++ b/test/CodeGen/AMDGPU/mad_64_32.ll @@ -69,8 +69,9 @@ define i64 @mad_u64_u32_zextops_commute(i32 %arg0, i32 %arg1, i64 %arg2) #0 { ; GCN-LABEL: {{^}}mad_i64_i32_sextops_i32_i128: ; CI: v_mad_u64_u32 ; CI: v_mad_u64_u32 -; CI: v_mad_u64_u32 ; CI: v_mad_i64_i32 +; CI: v_mad_u64_u32 + ; SI-NOT: v_mad_ define i128 @mad_i64_i32_sextops_i32_i128(i32 %arg0, i32 %arg1, i128 %arg2) #0 { diff --git a/test/CodeGen/AMDGPU/mul.ll b/test/CodeGen/AMDGPU/mul.ll index 555c65a6ffe..b35eefaaa07 100644 --- a/test/CodeGen/AMDGPU/mul.ll +++ b/test/CodeGen/AMDGPU/mul.ll @@ -211,6 +211,7 @@ 
endif: ; SI: s_mul_i32 ; SI: v_mul_hi_u32 ; SI: s_mul_i32 +; SI: s_mul_i32 ; SI-DAG: s_mul_i32 ; SI-DAG: v_mul_hi_u32 @@ -219,22 +220,16 @@ endif: ; SI-DAG: s_mul_i32 ; SI-DAG: v_mul_hi_u32 -; SI: s_mul_i32 -; SI: s_mul_i32 -; SI: s_mul_i32 -; SI: s_mul_i32 -; SI: s_mul_i32 - - ; VI: s_mul_i32 -; VI: v_mul_hi_u32 ; VI: v_mad_u64_u32 ; VI: s_mul_i32 ; VI: v_mul_hi_u32 ; VI: v_mad_u64_u32 +; VI: v_mul_hi_u32 ; VI: v_mad_u64_u32 + ; GCN: buffer_store_dwordx4 define amdgpu_kernel void @s_mul_i128(i128 addrspace(1)* %out, i128 %a, i128 %b) nounwind #0 { %mul = mul i128 %a, %b @@ -246,15 +241,15 @@ define amdgpu_kernel void @s_mul_i128(i128 addrspace(1)* %out, i128 %a, i128 %b) ; GCN: {{buffer|flat}}_load_dwordx4 ; GCN: {{buffer|flat}}_load_dwordx4 -; GCN-DAG: v_mul_lo_i32 -; GCN-DAG: v_mul_hi_u32 -; GCN-DAG: v_mul_hi_u32 -; GCN-DAG: v_mul_lo_i32 -; GCN-DAG: v_mul_hi_u32 -; GCN-DAG: v_mul_hi_u32 -; GCN-DAG: v_mul_lo_i32 -; GCN-DAG: v_mul_lo_i32 -; GCN-DAG: v_add_i32_e32 +; SI-DAG: v_mul_lo_i32 +; SI-DAG: v_mul_hi_u32 +; SI-DAG: v_mul_hi_u32 +; SI-DAG: v_mul_lo_i32 +; SI-DAG: v_mul_hi_u32 +; SI-DAG: v_mul_hi_u32 +; SI-DAG: v_mul_lo_i32 +; SI-DAG: v_mul_lo_i32 +; SI-DAG: v_add_i32_e32 ; SI-DAG: v_mul_hi_u32 ; SI-DAG: v_mul_lo_i32 @@ -265,7 +260,9 @@ define amdgpu_kernel void @s_mul_i128(i128 addrspace(1)* %out, i128 %a, i128 %b) ; SI-DAG: v_mul_lo_i32 ; SI-DAG: v_mul_lo_i32 -; VI-DAG: v_mad_u64_u32 +; VI-DAG: v_mul_lo_i32 +; VI-DAG: v_mul_hi_u32 +; VI: v_mad_u64_u32 ; VI: v_mad_u64_u32 ; VI: v_mad_u64_u32 diff --git a/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll b/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll index cae713be2ea..2072cf5e887 100644 --- a/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll +++ b/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll @@ -7,7 +7,7 @@ ; SI: NumVgprs: {{[1-9]$}} ; stores may alias loads -; VI: NumSgprs: {{[1-5][0-9]$}} +; VI: NumSgprs: {{[0-9]$}} ; VI: NumVgprs: {{[1-3][0-9]$}} define amdgpu_kernel void 
@load_fma_store(float addrspace(3)* nocapture readonly %in_arg, float addrspace(1)* nocapture %out_arg) { diff --git a/test/CodeGen/AMDGPU/split-scalar-i64-add.ll b/test/CodeGen/AMDGPU/split-scalar-i64-add.ll index 5d7d29db3a2..59ddd6177d7 100644 --- a/test/CodeGen/AMDGPU/split-scalar-i64-add.ll +++ b/test/CodeGen/AMDGPU/split-scalar-i64-add.ll @@ -7,8 +7,11 @@ declare i32 @llvm.amdgcn.workitem.id.x() readnone ; set in vcc, which is undefined since the low scalar half add sets ; scc instead. +; FIXME: SIShrinkInstructions should force immediate fold. + ; FUNC-LABEL: {{^}}imp_def_vcc_split_i64_add_0: -; SI: v_add_i32_e32 v{{[0-9]+}}, vcc, 0x18f, v{{[0-9]+}} +; SI: s_movk_i32 [[K:s[0-9]+]], 0x18f +; SI: v_add_i32_e32 v{{[0-9]+}}, vcc, [[K]], v{{[0-9]+}} ; SI: v_addc_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc define amdgpu_kernel void @imp_def_vcc_split_i64_add_0(i64 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %s.val) { %v.val = load volatile i32, i32 addrspace(1)* %in diff --git a/test/CodeGen/AMDGPU/sub.i16.ll b/test/CodeGen/AMDGPU/sub.i16.ll index fd70f2b6108..3ad091a91d6 100644 --- a/test/CodeGen/AMDGPU/sub.i16.ll +++ b/test/CodeGen/AMDGPU/sub.i16.ll @@ -85,9 +85,9 @@ define amdgpu_kernel void @v_test_sub_i16_zext_to_i32(i32 addrspace(1)* %out, i1 ; FIXME: Need to handle non-uniform case for function below (load without gep). 
; GCN-LABEL: {{^}}v_test_sub_i16_zext_to_i64: -; VI: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0 ; VI: flat_load_ushort [[A:v[0-9]+]] ; VI: flat_load_ushort [[B:v[0-9]+]] +; VI: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0 ; VI-DAG: v_sub_u16_e32 v[[ADD:[0-9]+]], [[A]], [[B]] ; VI: buffer_store_dwordx2 v{{\[}}[[ADD]]:[[VZERO]]{{\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0{{$}} define amdgpu_kernel void @v_test_sub_i16_zext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { diff --git a/test/CodeGen/AMDGPU/sub.ll b/test/CodeGen/AMDGPU/sub.ll index 46f1b120f21..4c573acdbab 100644 --- a/test/CodeGen/AMDGPU/sub.ll +++ b/test/CodeGen/AMDGPU/sub.ll @@ -57,7 +57,7 @@ define amdgpu_kernel void @test_sub_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32 ; VI: v_sub_i16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} define amdgpu_kernel void @test_sub_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { %b_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1 - %a = load i16, i16 addrspace(1)* %in + %a = load i16, i16 addrspace(1)* %in %b = load i16, i16 addrspace(1)* %b_ptr %result = sub i16 %a, %b store i16 %result, i16 addrspace(1)* %out @@ -71,7 +71,7 @@ define amdgpu_kernel void @test_sub_i16(i16 addrspace(1)* %out, i16 addrspace(1) define amdgpu_kernel void @test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) { %b_ptr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %in, i16 1 - %a = load <2 x i16>, <2 x i16> addrspace(1) * %in + %a = load <2 x i16>, <2 x i16> addrspace(1) * %in %b = load <2 x i16>, <2 x i16> addrspace(1) * %b_ptr %result = sub <2 x i16> %a, %b store <2 x i16> %result, <2 x i16> addrspace(1)* %out @@ -87,7 +87,7 @@ define amdgpu_kernel void @test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16 define amdgpu_kernel void @test_sub_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) { %b_ptr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %in, i16 1 - %a = load <4 x i16>, <4 x i16> addrspace(1) * %in + %a = load <4 x i16>, <4 
x i16> addrspace(1) * %in %b = load <4 x i16>, <4 x i16> addrspace(1) * %b_ptr %result = sub <4 x i16> %a, %b store <4 x i16> %result, <4 x i16> addrspace(1)* %out @@ -146,13 +146,13 @@ define amdgpu_kernel void @v_test_sub_v2i64(<2 x i64> addrspace(1)* %out, <2 x i } ; FUNC-LABEL: {{^}}v_test_sub_v4i64: -; SI: v_subrev_i32_e32 +; SI: v_sub_i32_e32 ; SI: v_subb_u32_e32 -; SI: v_subrev_i32_e32 +; SI: v_sub_i32_e32 ; SI: v_subb_u32_e32 -; SI: v_subrev_i32_e32 +; SI: v_sub_i32_e32 ; SI: v_subb_u32_e32 -; SI: v_subrev_i32_e32 +; SI: v_sub_i32_e32 ; SI: v_subb_u32_e32 define amdgpu_kernel void @v_test_sub_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* noalias %inA, <4 x i64> addrspace(1)* noalias %inB) { %tid = call i32 @llvm.r600.read.tidig.x() readnone diff --git a/test/CodeGen/AMDGPU/sub.v2i16.ll b/test/CodeGen/AMDGPU/sub.v2i16.ll index 6e76575e3be..620f38e5fba 100644 --- a/test/CodeGen/AMDGPU/sub.v2i16.ll +++ b/test/CodeGen/AMDGPU/sub.v2i16.ll @@ -160,10 +160,11 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(<2 x i16> addrspace( ; GFX9-DAG: v_lshrrev_b32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]] ; GFX9: buffer_store_dwordx2 v{{\[}}[[ELT0]]:[[ELT1]]{{\]}} -; VI: flat_load_ushort v[[A_HI:[0-9]+]] ; VI: flat_load_ushort v[[A_LO:[0-9]+]] -; VI: flat_load_ushort v[[B_HI:[0-9]+]] +; VI: flat_load_ushort v[[A_HI:[0-9]+]] + ; VI: flat_load_ushort v[[B_LO:[0-9]+]] +; VI: flat_load_ushort v[[B_HI:[0-9]+]] ; VI: v_sub_u16_e32 v[[ADD_HI:[0-9]+]], v[[A_HI]], v[[B_HI]] ; VI-NOT: and |