| author | Craig Topper <craig.topper@gmail.com> | 2017-02-11 22:57:12 +0000 |
|---|---|---|
| committer | Craig Topper <craig.topper@gmail.com> | 2017-02-11 22:57:12 +0000 |
| commit | fe892ae26106c3d8d2b90b1464338b79652b555f | |
| tree | 5ad7bedcebde69bb4e0b56c09eb9ed1d71faf2ee | |
| parent | a1a9b947dcb2da5e1b13cf38d1de76621fffa45a | |
[X86] Move code for using blendi for insert_subvector out to an isel pattern. This gives the DAG combiner more opportunity to optimize without needing to dig through the blend.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@294876 91177308-0d34-0410-b5e6-96231b3b80d8
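
For illustration only (this sketch is not part of the commit; the function name and CHECK-style expectations are hypothetical), here is one way to express in IR the operation these patterns target: replacing the low 128 bits of a 256-bit vector. The expected codegen is a single immediate blend either way; what the commit changes is where that choice is made, in an isel pattern at instruction selection rather than in combineInsertSubvector, so the insert_subvector node stays visible to other DAG combines in the meantime.

```llvm
; Hypothetical input, not taken from the patch: replace the low 128 bits
; of %a with %b. This can reach the DAG as an insertion at subvector
; index 0 of a 256-bit vector, the case the new patterns match.
define <8 x float> @insert_low_128(<8 x float> %a, <4 x float> %b) {
  ; Widen %b to 256 bits; the upper four lanes are undef.
  %wide = shufflevector <4 x float> %b, <4 x float> undef,
                        <8 x i32> <i32 0, i32 1, i32 2, i32 3,
                                   i32 undef, i32 undef, i32 undef, i32 undef>
  ; Lanes 0-3 come from %wide (indices 8-11 of the concatenation),
  ; lanes 4-7 come from %a.
  %r = shufflevector <8 x float> %a, <8 x float> %wide,
                     <8 x i32> <i32 8, i32 9, i32 10, i32 11,
                                i32 4, i32 5, i32 6, i32 7>
  ret <8 x float> %r
}
```

Compiled with something like `llc -mtriple=x86_64-unknown-unknown -mattr=+avx`, the expectation is a `vblendps` with immediate 0xf (or `vblendpd` with 0x3 in the v4f64 case) rather than a `vinsertf128`.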
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | lib/Target/X86/X86ISelLowering.cpp | 27 |
| -rw-r--r-- | lib/Target/X86/X86InstrSSE.td | 53 |
| -rw-r--r-- | test/CodeGen/X86/clear_upper_vector_element_bits.ll | 42 |
| -rw-r--r-- | test/CodeGen/X86/insertelement-zero.ll | 38 |
4 files changed, 91 insertions, 69 deletions
```diff
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index e7417b75ae7..0de7d1c173c 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -34152,33 +34152,6 @@ static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
   MVT OpVT = N->getSimpleValueType(0);
   MVT SubVecVT = SubVec.getSimpleValueType();
 
-  // For insertion into the zero index (low half) of a 256-bit vector, it is
-  // more efficient to generate a blend with immediate instead of an insert*128.
-  // We are still creating an INSERT_SUBVECTOR below with an undef node to
-  // extend the subvector to the size of the result vector. Make sure that
-  // we are not recursing on that node by checking for undef here.
-  if (IdxVal == 0 && OpVT.is256BitVector() && !Vec.isUndef()) {
-    SDValue Vec256 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
-                                 DAG.getUNDEF(OpVT), SubVec, N->getOperand(2));
-
-    // Integers must be cast to 32-bit because there is only vpblendd;
-    // vpblendw can't be used for this because it has a handicapped mask.
-    // If we don't have AVX2, then cast to float. Using a wrong domain blend
-    // is still more efficient than using the wrong domain vinsertf128 that
-    // will be created by InsertSubVector().
-    MVT CastVT = OpVT;
-    if (OpVT.isInteger())
-      CastVT = Subtarget.hasAVX2() ? MVT::v8i32 : MVT::v8f32;
-
-    // The blend instruction, and therefore its mask, depend on the data type.
-    unsigned MaskVal = CastVT.getScalarSizeInBits() == 64 ? 0x03 : 0x0f;
-    SDValue Mask = DAG.getConstant(MaskVal, dl, MVT::i8);
-    Vec = DAG.getBitcast(CastVT, Vec);
-    Vec256 = DAG.getBitcast(CastVT, Vec256);
-    Vec256 = DAG.getNode(X86ISD::BLENDI, dl, CastVT, Vec, Vec256, Mask);
-    return DAG.getBitcast(OpVT, Vec256);
-  }
-
   // If we're inserting into the upper half of a 256-bit vector with a vector
   // that was extracted from the upper half of a 256-bit vector, we should
   // use a blend instead.
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index ddd7b808450..e1191309219 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -6984,6 +6984,19 @@ let Constraints = "$src1 = $dst" in {
                              SSE_DPPD_ITINS>;
 }
 
+// For insertion into the zero index (low half) of a 256-bit vector, it is
+// more efficient to generate a blend with immediate instead of an insert*128.
+let Predicates = [HasAVX] in {
+def : Pat<(insert_subvector (v4f64 VR256:$src1), (v2f64 VR128:$src2), (iPTR 0)),
+          (VBLENDPDYrri VR256:$src1,
+                        (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
+                                       VR128:$src2, sub_xmm), 0x3)>;
+def : Pat<(insert_subvector (v8f32 VR256:$src1), (v4f32 VR128:$src2), (iPTR 0)),
+          (VBLENDPSYrri VR256:$src1,
+                        (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
+                                       VR128:$src2, sub_xmm), 0xf)>;
+}
+
 /// SS41I_quaternary_int_avx - AVX SSE 4.1 with 4 operators
 multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr,
                                     RegisterClass RC, X86MemOperand x86memop,
@@ -8178,6 +8191,46 @@ defm VPBLENDD : AVX2_binop_rmi<0x02, "vpblendd", X86Blendi, v4i32,
 defm VPBLENDDY : AVX2_binop_rmi<0x02, "vpblendd", X86Blendi, v8i32,
                                 VR256, loadv4i64, i256mem>, VEX_L;
 
+// For insertion into the zero index (low half) of a 256-bit vector, it is
+// more efficient to generate a blend with immediate instead of an insert*128.
+let Predicates = [HasAVX2] in {
+def : Pat<(insert_subvector (v8i32 VR256:$src1), (v4i32 VR128:$src2), (iPTR 0)),
+          (VPBLENDDYrri VR256:$src1,
+                        (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
+                                       VR128:$src2, sub_xmm), 0xf)>;
+def : Pat<(insert_subvector (v4i64 VR256:$src1), (v2i64 VR128:$src2), (iPTR 0)),
+          (VPBLENDDYrri VR256:$src1,
+                        (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
+                                       VR128:$src2, sub_xmm), 0xf)>;
+def : Pat<(insert_subvector (v16i16 VR256:$src1), (v8i16 VR128:$src2), (iPTR 0)),
+          (VPBLENDDYrri VR256:$src1,
+                        (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
+                                       VR128:$src2, sub_xmm), 0xf)>;
+def : Pat<(insert_subvector (v32i8 VR256:$src1), (v16i8 VR128:$src2), (iPTR 0)),
+          (VPBLENDDYrri VR256:$src1,
+                        (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
+                                       VR128:$src2, sub_xmm), 0xf)>;
+}
+
+let Predicates = [HasAVX1Only] in {
+def : Pat<(insert_subvector (v8i32 VR256:$src1), (v4i32 VR128:$src2), (iPTR 0)),
+          (VBLENDPSYrri VR256:$src1,
+                        (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
+                                       VR128:$src2, sub_xmm), 0xf)>;
+def : Pat<(insert_subvector (v4i64 VR256:$src1), (v2i64 VR128:$src2), (iPTR 0)),
+          (VBLENDPSYrri VR256:$src1,
+                        (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
+                                       VR128:$src2, sub_xmm), 0xf)>;
+def : Pat<(insert_subvector (v16i16 VR256:$src1), (v8i16 VR128:$src2), (iPTR 0)),
+          (VBLENDPSYrri VR256:$src1,
+                        (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
+                                       VR128:$src2, sub_xmm), 0xf)>;
+def : Pat<(insert_subvector (v32i8 VR256:$src1), (v16i8 VR128:$src2), (iPTR 0)),
+          (VBLENDPSYrri VR256:$src1,
+                        (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
+                                       VR128:$src2, sub_xmm), 0xf)>;
+}
+
 //===----------------------------------------------------------------------===//
 // VPBROADCAST - Load from memory and broadcast to all elements of the
 // destination operand
diff --git a/test/CodeGen/X86/clear_upper_vector_element_bits.ll b/test/CodeGen/X86/clear_upper_vector_element_bits.ll
index 6e8c6755f4e..4ad865e19aa 100644
--- a/test/CodeGen/X86/clear_upper_vector_element_bits.ll
+++ b/test/CodeGen/X86/clear_upper_vector_element_bits.ll
@@ -837,23 +837,21 @@ define <8 x i32> @_clearupper8xi32b(<8 x i32>) nounwind {
 ; AVX1-LABEL: _clearupper8xi32b:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6,7]
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6],xmm1[7]
-; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3]
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4],xmm1[5],xmm2[6],xmm1[7]
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; AVX1-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: _clearupper8xi32b:
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6,7]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6],xmm1[7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4],xmm1[5],xmm2[6],xmm1[7]
-; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm2, %ymm0
 ; AVX2-NEXT:    retq
   %x16 = bitcast <8 x i32> %0 to <16 x i16>
   %r0 = insertelement <16 x i16> %x16, i16 zeroinitializer, i32 1
@@ -901,22 +899,22 @@ define <16 x i16> @_clearupper16xi16b(<16 x i16>) nounwind {
 ;
 ; AVX1-LABEL: _clearupper16xi16b:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vmovapd {{.*#+}} xmm1 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; AVX1-NEXT:    vandpd %xmm1, %xmm0, %xmm2
-; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3]
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vandpd %xmm1, %xmm2, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vmovaps {{.*#+}} xmm1 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: _clearupper16xi16b:
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX2-NEXT:    vpand %xmm1, %xmm2, %xmm1
-; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm2, %ymm0
 ; AVX2-NEXT:    retq
   %x8 = bitcast <16 x i16> %0 to <32 x i8>
   %r0 = insertelement <32 x i8> %x8, i8 zeroinitializer, i32 1
diff --git a/test/CodeGen/X86/insertelement-zero.ll b/test/CodeGen/X86/insertelement-zero.ll
index 3571fa72738..ea780a2fa68 100644
--- a/test/CodeGen/X86/insertelement-zero.ll
+++ b/test/CodeGen/X86/insertelement-zero.ll
@@ -408,23 +408,21 @@ define <16 x i16> @insert_v16i16_z12345z789ABZDEz(<16 x i16> %a) {
 ; AVX1-LABEL: insert_v16i16_z12345z789ABZDEz:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm1[0],xmm0[1,2,3,4,5,6,7]
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm1[6],xmm2[7]
-; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3]
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5,6],xmm1[7]
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm1[0],xmm0[1,2,3,4,5],xmm1[6],xmm0[7]
+; AVX1-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: insert_v16i16_z12345z789ABZDEz:
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm2 = xmm1[0],xmm0[1,2,3,4,5,6,7]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm1[6],xmm2[7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5,6],xmm1[7]
-; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm2 = xmm1[0],xmm0[1,2,3,4,5],xmm1[6],xmm0[7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm2, %ymm0
 ; AVX2-NEXT:    retq
   %1 = insertelement <16 x i16> %a, i16 0, i32 0
   %2 = insertelement <16 x i16> %1, i16 0, i32 6
@@ -499,11 +497,11 @@ define <32 x i8> @insert_v32i8_z123456789ABCDEzGHIJKLMNOPQRSTzz(<32 x i8> %a) {
 ; AVX1-NEXT:    xorl %eax, %eax
 ; AVX1-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm1
 ; AVX1-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm1
-; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm2[7]
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: insert_v32i8_z123456789ABCDEzGHIJKLMNOPQRSTzz:
@@ -511,11 +509,11 @@ define <32 x i8> @insert_v32i8_z123456789ABCDEzGHIJKLMNOPQRSTzz(<32 x i8> %a) {
 ; AVX2-NEXT:    xorl %eax, %eax
 ; AVX2-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm1
 ; AVX2-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm1
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm2[7]
-; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
   %1 = insertelement <32 x i8> %a, i8 0, i32 0
   %2 = insertelement <32 x i8> %1, i8 0, i32 15
```