summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorTony Jiang <jtony@ca.ibm.com>2017-05-31 13:09:57 +0000
committerTony Jiang <jtony@ca.ibm.com>2017-05-31 13:09:57 +0000
commita688c8eaae7cfc5c70608fdbe7ec64ef86c449e6 (patch)
tree2f1c884d66836996e0c643f25208a8ac97829f1f
parent7913836381e3f3984b711f921a1cf9da5d37e107 (diff)
[PowerPC] Fix a performance bug for PPC::XXPERMDI.
Some VectorShuffle nodes in the SDAG can be selected to the XXPERMDI instruction; this patch recognizes them and performs that selection to improve PPC performance. Differential Revision: https://reviews.llvm.org/D33404 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@304298 91177308-0d34-0410-b5e6-96231b3b80d8
-rw-r--r--lib/Target/PowerPC/PPCISelLowering.cpp106
-rw-r--r--lib/Target/PowerPC/PPCISelLowering.h8
-rw-r--r--lib/Target/PowerPC/PPCInstrInfo.td5
-rw-r--r--lib/Target/PowerPC/PPCInstrVSX.td4
-rw-r--r--test/CodeGen/PowerPC/vec_xxpermdi.ll307
5 files changed, 417 insertions, 13 deletions
diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
index 216efcc4a1e..91356fcec45 100644
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -1112,6 +1112,7 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
case PPCISD::VPERM: return "PPCISD::VPERM";
case PPCISD::XXSPLT: return "PPCISD::XXSPLT";
case PPCISD::XXINSERT: return "PPCISD::XXINSERT";
+ case PPCISD::XXPERMDI: return "PPCISD::XXPERMDI";
case PPCISD::VECSHL: return "PPCISD::VECSHL";
case PPCISD::CMPB: return "PPCISD::CMPB";
case PPCISD::Hi: return "PPCISD::Hi";
@@ -1593,17 +1594,25 @@ bool PPC::isSplatShuffleMask(ShuffleVectorSDNode *N, unsigned EltSize) {
return true;
}
- // Check that the mask is shuffling words
-static bool isWordShuffleMask(ShuffleVectorSDNode *N) {
- for (unsigned i = 0; i < 4; ++i) {
- unsigned B0 = N->getMaskElt(i*4);
- unsigned B1 = N->getMaskElt(i*4+1);
- unsigned B2 = N->getMaskElt(i*4+2);
- unsigned B3 = N->getMaskElt(i*4+3);
- if (B0 % 4)
- return false;
- if (B1 != B0+1 || B2 != B1+1 || B3 != B2+1)
+// Check that the mask is shuffling N byte elements.
+static bool isNByteElemShuffleMask(ShuffleVectorSDNode *N, unsigned Width) {
+ assert((Width == 2 || Width == 4 || Width == 8 || Width == 16) &&
+ "Unexpected element width.");
+
+ unsigned NumOfElem = 16 / Width;
+ unsigned MaskVal[16]; // Width is never greater than 16
+ for (unsigned i = 0; i < NumOfElem; ++i) {
+ MaskVal[0] = N->getMaskElt(i * Width);
+ if (MaskVal[0] % Width) {
return false;
+ }
+
+ for (unsigned int j = 1; j < Width; ++j) {
+ MaskVal[j] = N->getMaskElt(i * Width + j);
+ if (MaskVal[j] != MaskVal[j-1] + 1) {
+ return false;
+ }
+ }
}
return true;
@@ -1611,7 +1620,7 @@ static bool isWordShuffleMask(ShuffleVectorSDNode *N) {
bool PPC::isXXINSERTWMask(ShuffleVectorSDNode *N, unsigned &ShiftElts,
unsigned &InsertAtByte, bool &Swap, bool IsLE) {
- if (!isWordShuffleMask(N))
+ if (!isNByteElemShuffleMask(N, 4))
return false;
// Now we look at mask elements 0,4,8,12
@@ -1688,7 +1697,7 @@ bool PPC::isXXSLDWIShuffleMask(ShuffleVectorSDNode *N, unsigned &ShiftElts,
bool &Swap, bool IsLE) {
assert(N->getValueType(0) == MVT::v16i8 && "Shuffle vector expects v16i8");
// Ensure each byte index of the word is consecutive.
- if (!isWordShuffleMask(N))
+ if (!isNByteElemShuffleMask(N, 4))
return false;
// Now we look at mask elements 0,4,8,12, which are the beginning of words.
@@ -1746,6 +1755,66 @@ bool PPC::isXXSLDWIShuffleMask(ShuffleVectorSDNode *N, unsigned &ShiftElts,
}
}
+/// Decide whether shuffle node \p N can be lowered to a single XXPERMDI.
+///
+/// On success, \p DM receives the immediate for the instruction (0-3) and
+/// \p Swap reports whether the caller must exchange the two shuffle inputs
+/// before building it. \p Swap becomes true only when element 0 of the result
+/// is read from the first input (LE) or from the second input (BE). When the
+/// mask does not match, neither \p DM nor \p Swap is written.
+/// \return true iff the given mask of shuffle node \p N is a XXPERMDI shuffle
+/// mask.
+bool PPC::isXXPERMDIShuffleMask(ShuffleVectorSDNode *N, unsigned &DM,
+                                bool &Swap, bool IsLE) {
+  assert(N->getValueType(0) == MVT::v16i8 && "Shuffle vector expects v16i8");
+
+  // Each half of the result has to be one whole doubleword: eight consecutive
+  // byte indices that start on a multiple of eight.
+  if (!isNByteElemShuffleMask(N, 8))
+    return false;
+
+  // Doubleword chosen for each half of the result. Values 0-1 refer to the
+  // first shuffle input and values 2-3 to the second one.
+  unsigned Sel0 = N->getMaskElt(0) / 8;
+  unsigned Sel1 = N->getMaskElt(8) / 8;
+  assert(((Sel0 | Sel1) < 4) && "A mask element out of bounds?");
+
+  bool NeedSwap = false;
+  if (N->getOperand(1).isUndef()) {
+    // When both shuffle operands are the same vector, the second operand is
+    // undef and the mask may only refer to the first one.
+    if ((Sel0 | Sel1) >= 2)
+      return false;
+  } else {
+    // With two distinct inputs, the halves must come from different inputs.
+    const bool Half0FromFirst = Sel0 < 2;
+    const bool Half1FromFirst = Sel1 < 2;
+    if (Half0FromFirst == Half1FromFirst)
+      return false;
+
+    // The inputs can be used in their given order when element 0 reads the
+    // second input on LE, or the first input on BE. In the opposite
+    // arrangement the caller has to exchange them, so renumber the selectors
+    // as if that exchange had already happened.
+    NeedSwap = (Half0FromFirst == IsLE);
+    if (NeedSwap) {
+      Sel0 = (Sel0 + 2) % 4;
+      Sel1 = (Sel1 + 2) % 4;
+    }
+  }
+
+  Swap = NeedSwap;
+
+  // BE places the selector bit of half 0 in the high bit of the immediate and
+  // that of half 1 in the low bit. LE complements both bits and puts them in
+  // the opposite positions.
+  if (IsLE)
+    DM = (((~Sel1) & 1) << 1) + ((~Sel0) & 1);
+  else
+    DM = (Sel0 << 1) + (Sel1 & 1);
+  return true;
+}
+
/// getVSPLTImmediate - Return the appropriate VSPLT* immediate to splat the
/// specified isSplatShuffleMask VECTOR_SHUFFLE mask.
@@ -7760,6 +7829,19 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Shl);
}
+ if (Subtarget.hasVSX() &&
+ PPC::isXXPERMDIShuffleMask(SVOp, ShiftElts, Swap, isLittleEndian)) {
+ if (Swap)
+ std::swap(V1, V2);
+ SDValue Conv1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1);
+ SDValue Conv2 =
+ DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2.isUndef() ? V1 : V2);
+
+ SDValue PermDI = DAG.getNode(PPCISD::XXPERMDI, dl, MVT::v2i64, Conv1, Conv2,
+ DAG.getConstant(ShiftElts, dl, MVT::i32));
+ return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, PermDI);
+ }
+
if (Subtarget.hasVSX()) {
if (V2.isUndef() && PPC::isSplatShuffleMask(SVOp, 4)) {
int SplatIdx = PPC::getVSPLTImmediate(SVOp, 4, DAG);
diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h
index 2f9eb95f6de..7982a4a9e9f 100644
--- a/lib/Target/PowerPC/PPCISelLowering.h
+++ b/lib/Target/PowerPC/PPCISelLowering.h
@@ -90,6 +90,10 @@ namespace llvm {
///
VECSHL,
+ /// XXPERMDI - The PPC XXPERMDI instruction
+ ///
+ XXPERMDI,
+
/// The CMPB instruction (takes two operands of i32 or i64).
CMPB,
@@ -454,6 +458,10 @@ namespace llvm {
/// for a XXSLDWI instruction.
bool isXXSLDWIShuffleMask(ShuffleVectorSDNode *N, unsigned &ShiftElts,
bool &Swap, bool IsLE);
+    /// isXXPERMDIShuffleMask - Return true if this is a shuffle mask suitable
+    /// for a XXPERMDI instruction; \p DM receives the instruction immediate.
+    bool isXXPERMDIShuffleMask(ShuffleVectorSDNode *N, unsigned &DM,
+                               bool &Swap, bool IsLE);
/// isVSLDOIShuffleMask - If this is a vsldoi shuffle mask, return the
/// shift amount, otherwise return -1.
diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td
index 26b99eced23..8223aa655e3 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/lib/Target/PowerPC/PPCInstrInfo.td
@@ -53,6 +53,10 @@ def SDT_PPCVecInsert : SDTypeProfile<1, 3, [ SDTCisVec<0>,
SDTCisVec<1>, SDTCisVec<2>, SDTCisInt<3>
]>;
+def SDT_PPCxxpermdi: SDTypeProfile<1, 3, [ SDTCisVec<0>,
+ SDTCisVec<1>, SDTCisVec<2>, SDTCisInt<3>
+]>;
+
def SDT_PPCvcmp : SDTypeProfile<1, 3, [
SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>, SDTCisVT<3, i32>
]>;
@@ -170,6 +174,7 @@ def PPCaddiDtprelL : SDNode<"PPCISD::ADDI_DTPREL_L", SDTIntBinOp>;
def PPCvperm : SDNode<"PPCISD::VPERM", SDT_PPCvperm, []>;
def PPCxxsplt : SDNode<"PPCISD::XXSPLT", SDT_PPCVecSplat, []>;
def PPCxxinsert : SDNode<"PPCISD::XXINSERT", SDT_PPCVecInsert, []>;
+def PPCxxpermdi : SDNode<"PPCISD::XXPERMDI", SDT_PPCxxpermdi, []>;
def PPCvecshl : SDNode<"PPCISD::VECSHL", SDT_PPCVecShift, []>;
def PPCqvfperm : SDNode<"PPCISD::QVFPERM", SDT_PPCqvfperm, []>;
diff --git a/lib/Target/PowerPC/PPCInstrVSX.td b/lib/Target/PowerPC/PPCInstrVSX.td
index 1589ab03e50..c4139ca8b7b 100644
--- a/lib/Target/PowerPC/PPCInstrVSX.td
+++ b/lib/Target/PowerPC/PPCInstrVSX.td
@@ -843,7 +843,9 @@ let Uses = [RM] in {
def XXPERMDI : XX3Form_2<60, 10,
(outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB, u2imm:$DM),
- "xxpermdi $XT, $XA, $XB, $DM", IIC_VecPerm, []>;
+ "xxpermdi $XT, $XA, $XB, $DM", IIC_VecPerm,
+ [(set v2i64:$XT, (PPCxxpermdi v2i64:$XA, v2i64:$XB,
+ imm32SExt16:$DM))]>;
let isCodeGenOnly = 1 in
def XXPERMDIs : XX3Form_2s<60, 10, (outs vsrc:$XT), (ins vsfrc:$XA, u2imm:$DM),
"xxpermdi $XT, $XA, $XA, $DM", IIC_VecPerm, []>;
diff --git a/test/CodeGen/PowerPC/vec_xxpermdi.ll b/test/CodeGen/PowerPC/vec_xxpermdi.ll
new file mode 100644
index 00000000000..9be2a1864a0
--- /dev/null
+++ b/test/CodeGen/PowerPC/vec_xxpermdi.ll
@@ -0,0 +1,307 @@
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 < %s | \
+; RUN: FileCheck %s -check-prefix=CHECK-LE
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr8 < %s | \
+; RUN: FileCheck %s -check-prefix=CHECK-BE
+
+; Possible LE ShuffleVector masks (Case 1):
+; ShuffleVector((vector double)a, (vector double)b, 3, 1)
+; ShuffleVector((vector double)a, (vector double)b, 2, 1)
+; ShuffleVector((vector double)a, (vector double)b, 3, 0)
+; ShuffleVector((vector double)a, (vector double)b, 2, 0)
+; which targets at:
+; xxpermdi a, b, 0
+; xxpermdi a, b, 1
+; xxpermdi a, b, 2
+; xxpermdi a, b, 3
+; Possible LE Swap ShuffleVector masks (Case 2):
+; ShuffleVector((vector double)a, (vector double)b, 1, 3)
+; ShuffleVector((vector double)a, (vector double)b, 0, 3)
+; ShuffleVector((vector double)a, (vector double)b, 1, 2)
+; ShuffleVector((vector double)a, (vector double)b, 0, 2)
+; which targets at:
+; xxpermdi b, a, 0
+; xxpermdi b, a, 1
+; xxpermdi b, a, 2
+; xxpermdi b, a, 3
+; Possible LE ShuffleVector masks when a == b, b is undef (Case 3):
+; ShuffleVector((vector double)a, (vector double)a, 1, 1)
+; ShuffleVector((vector double)a, (vector double)a, 0, 1)
+; ShuffleVector((vector double)a, (vector double)a, 1, 0)
+; ShuffleVector((vector double)a, (vector double)a, 0, 0)
+; which targets at:
+; xxpermdi a, a, 0
+; xxpermdi a, a, 1
+; xxpermdi a, a, 2
+; xxpermdi a, a, 3
+
+; Possible BE ShuffleVector masks (Case 4):
+; ShuffleVector((vector double)a, (vector double)b, 0, 2)
+; ShuffleVector((vector double)a, (vector double)b, 0, 3)
+; ShuffleVector((vector double)a, (vector double)b, 1, 2)
+; ShuffleVector((vector double)a, (vector double)b, 1, 3)
+; which targets at:
+; xxpermdi a, b, 0
+; xxpermdi a, b, 1
+; xxpermdi a, b, 2
+; xxpermdi a, b, 3
+; Possible BE Swap ShuffleVector masks (Case 5):
+; ShuffleVector((vector double)a, (vector double)b, 2, 0)
+; ShuffleVector((vector double)a, (vector double)b, 3, 0)
+; ShuffleVector((vector double)a, (vector double)b, 2, 1)
+; ShuffleVector((vector double)a, (vector double)b, 3, 1)
+; which targets at:
+; xxpermdi b, a, 0
+; xxpermdi b, a, 1
+; xxpermdi b, a, 2
+; xxpermdi b, a, 3
+; Possible BE ShuffleVector masks when a == b, b is undef (Case 6):
+; ShuffleVector((vector double)a, (vector double)a, 0, 0)
+; ShuffleVector((vector double)a, (vector double)a, 0, 1)
+; ShuffleVector((vector double)a, (vector double)a, 1, 0)
+; ShuffleVector((vector double)a, (vector double)a, 1, 1)
+; which targets at:
+; xxpermdi a, a, 0
+; xxpermdi a, a, 1
+; xxpermdi a, a, 2
+; xxpermdi a, a, 3
+
+define <2 x double> @test_le_vec_xxpermdi_v2f64_v2f64_0(<2 x double> %VA, <2 x double> %VB) {
+ entry:
+ %0 = shufflevector <2 x double> %VA, <2 x double> %VB,<2 x i32> <i32 3, i32 1>
+ ret <2 x double> %0
+; CHECK-LE-LABEL: @test_le_vec_xxpermdi_v2f64_v2f64_0
+; CHECK-LE: xxmrghd 34, 34, 35
+; CHECK-LE: blr
+}
+
+define <2 x double> @test_le_vec_xxpermdi_v2f64_v2f64_1(<2 x double> %VA, <2 x double> %VB) {
+ entry:
+ %0 = shufflevector <2 x double> %VA, <2 x double> %VB,<2 x i32> <i32 2, i32 1>
+ ret <2 x double> %0
+; CHECK-LE-LABEL: @test_le_vec_xxpermdi_v2f64_v2f64_1
+; CHECK-LE: xxpermdi 34, 34, 35, 1
+; CHECK-LE: blr
+}
+
+define <2 x double> @test_le_vec_xxpermdi_v2f64_v2f64_2(<2 x double> %VA, <2 x double> %VB) {
+ entry:
+ %0 = shufflevector <2 x double> %VA, <2 x double> %VB,<2 x i32> <i32 3, i32 0>
+ ret <2 x double> %0
+; CHECK-LE-LABEL: @test_le_vec_xxpermdi_v2f64_v2f64_2
+; CHECK-LE: xxpermdi 34, 34, 35, 2
+; CHECK-LE: blr
+}
+
+define <2 x double> @test_le_vec_xxpermdi_v2f64_v2f64_3(<2 x double> %VA, <2 x double> %VB) {
+ entry:
+ %0 = shufflevector <2 x double> %VA, <2 x double> %VB,<2 x i32> <i32 2, i32 0>
+ ret <2 x double> %0
+; CHECK-LE-LABEL: @test_le_vec_xxpermdi_v2f64_v2f64_3
+; CHECK-LE: xxmrgld 34, 34, 35
+; CHECK-LE: blr
+}
+
+define <2 x double> @test_le_swap_vec_xxpermdi_v2f64_v2f64_0(<2 x double> %VA, <2 x double> %VB) {
+ entry:
+ %0 = shufflevector <2 x double> %VA, <2 x double> %VB,<2 x i32> <i32 1, i32 3>
+ ret <2 x double> %0
+; CHECK-LE-LABEL: @test_le_swap_vec_xxpermdi_v2f64_v2f64_0
+; CHECK-LE: xxmrghd 34, 35, 34
+; CHECK-LE: blr
+}
+
+define <2 x double> @test_le_swap_vec_xxpermdi_v2f64_v2f64_1(<2 x double> %VA, <2 x double> %VB) {
+ entry:
+ %0 = shufflevector <2 x double> %VA, <2 x double> %VB,<2 x i32> <i32 0, i32 3>
+ ret <2 x double> %0
+; CHECK-LE-LABEL: @test_le_swap_vec_xxpermdi_v2f64_v2f64_1
+; CHECK-LE: xxpermdi 34, 35, 34, 1
+; CHECK-LE: blr
+}
+
+define <2 x double> @test_le_swap_vec_xxpermdi_v2f64_v2f64_2(<2 x double> %VA, <2 x double> %VB) {
+ entry:
+ %0 = shufflevector <2 x double> %VA, <2 x double> %VB,<2 x i32> <i32 1, i32 2>
+ ret <2 x double> %0
+; CHECK-LE-LABEL: @test_le_swap_vec_xxpermdi_v2f64_v2f64_2
+; CHECK-LE: xxpermdi 34, 35, 34, 2
+; CHECK-LE: blr
+}
+
+define <2 x double> @test_le_swap_vec_xxpermdi_v2f64_v2f64_3(<2 x double> %VA, <2 x double> %VB) {
+ entry:
+ %0 = shufflevector <2 x double> %VA, <2 x double> %VB,<2 x i32> <i32 0, i32 2>
+ ret <2 x double> %0
+; CHECK-LE-LABEL: @test_le_swap_vec_xxpermdi_v2f64_v2f64_3
+; CHECK-LE: xxmrgld 34, 35, 34
+; CHECK-LE: blr
+}
+
+define <2 x double> @test_le_vec_xxpermdi_v2f64_undef_0(<2 x double> %VA) {
+ entry:
+ %0 = shufflevector <2 x double> %VA, <2 x double> undef, <2 x i32> <i32 1, i32 1>
+ ret <2 x double> %0
+; CHECK-LE-LABEL: @test_le_vec_xxpermdi_v2f64_undef_0
+; CHECK-LE: xxspltd 34, 34, 0
+; CHECK-LE: blr
+}
+
+define <2 x double> @test_le_vec_xxpermdi_v2f64_undef_1(<2 x double> %VA) {
+ entry:
+ %0 = shufflevector <2 x double> %VA, <2 x double> undef, <2 x i32> <i32 0, i32 1>
+ ret <2 x double> %0
+; CHECK-LE-LABEL: @test_le_vec_xxpermdi_v2f64_undef_1
+; CHECK-LE: blr
+}
+
+define <2 x double> @test_le_vec_xxpermdi_v2f64_undef_2(<2 x double> %VA) {
+  entry:
+    %0 = shufflevector <2 x double> %VA, <2 x double> undef, <2 x i32> <i32 1, i32 0>
+    ret <2 x double> %0
+; CHECK-LE-LABEL: @test_le_vec_xxpermdi_v2f64_undef_2
+; CHECK-LE: xxswapd 34, 34
+}
+
+define <2 x double> @test_le_vec_xxpermdi_v2f64_undef_3(<2 x double> %VA) {
+ entry:
+ %0 = shufflevector <2 x double> %VA, <2 x double> undef, <2 x i32> <i32 0, i32 0>
+ ret <2 x double> %0
+; CHECK-LE-LABEL: @test_le_vec_xxpermdi_v2f64_undef_3
+; CHECK-LE: xxspltd 34, 34, 1
+; CHECK-LE: blr
+}
+
+; Start testing BE
+define <2 x double> @test_be_vec_xxpermdi_v2f64_v2f64_0(<2 x double> %VA, <2 x double> %VB) {
+ entry:
+ %0 = shufflevector <2 x double> %VA, <2 x double> %VB,<2 x i32> <i32 0, i32 2>
+ ret <2 x double> %0
+; CHECK-BE-LABEL: @test_be_vec_xxpermdi_v2f64_v2f64_0
+; CHECK-BE: xxmrghd 34, 34, 35
+; CHECK-BE: blr
+}
+
+define <2 x double> @test_be_vec_xxpermdi_v2f64_v2f64_1(<2 x double> %VA, <2 x double> %VB) {
+ entry:
+ %0 = shufflevector <2 x double> %VA, <2 x double> %VB,<2 x i32> <i32 0, i32 3>
+ ret <2 x double> %0
+; CHECK-BE-LABEL: @test_be_vec_xxpermdi_v2f64_v2f64_1
+; CHECK-BE: xxpermdi 34, 34, 35, 1
+; CHECK-BE: blr
+}
+
+define <2 x double> @test_be_vec_xxpermdi_v2f64_v2f64_2(<2 x double> %VA, <2 x double> %VB) {
+ entry:
+ %0 = shufflevector <2 x double> %VA, <2 x double> %VB,<2 x i32> <i32 1, i32 2>
+ ret <2 x double> %0
+; CHECK-BE-LABEL: @test_be_vec_xxpermdi_v2f64_v2f64_2
+; CHECK-BE: xxpermdi 34, 34, 35, 2
+; CHECK-BE: blr
+}
+
+define <2 x double> @test_be_vec_xxpermdi_v2f64_v2f64_3(<2 x double> %VA, <2 x double> %VB) {
+ entry:
+ %0 = shufflevector <2 x double> %VA, <2 x double> %VB,<2 x i32> <i32 1, i32 3>
+ ret <2 x double> %0
+; CHECK-BE-LABEL: @test_be_vec_xxpermdi_v2f64_v2f64_3
+; CHECK-BE: xxmrgld 34, 34, 35
+; CHECK-BE: blr
+}
+
+define <2 x double> @test_be_swap_vec_xxpermdi_v2f64_v2f64_0(<2 x double> %VA, <2 x double> %VB) {
+ entry:
+ %0 = shufflevector <2 x double> %VA, <2 x double> %VB,<2 x i32> <i32 2, i32 0>
+ ret <2 x double> %0
+; CHECK-BE-LABEL: @test_be_swap_vec_xxpermdi_v2f64_v2f64_0
+; CHECK-BE: xxmrghd 34, 35, 34
+; CHECK-BE: blr
+}
+
+define <2 x double> @test_be_swap_vec_xxpermdi_v2f64_v2f64_1(<2 x double> %VA, <2 x double> %VB) {
+ entry:
+ %0 = shufflevector <2 x double> %VA, <2 x double> %VB,<2 x i32> <i32 2, i32 1>
+ ret <2 x double> %0
+; CHECK-BE-LABEL: @test_be_swap_vec_xxpermdi_v2f64_v2f64_1
+; CHECK-BE: xxpermdi 34, 35, 34, 1
+; CHECK-BE: blr
+}
+
+define <2 x double> @test_be_swap_vec_xxpermdi_v2f64_v2f64_2(<2 x double> %VA, <2 x double> %VB) {
+ entry:
+ %0 = shufflevector <2 x double> %VA, <2 x double> %VB,<2 x i32> <i32 3, i32 0>
+ ret <2 x double> %0
+; CHECK-BE-LABEL: @test_be_swap_vec_xxpermdi_v2f64_v2f64_2
+; CHECK-BE: xxpermdi 34, 35, 34, 2
+; CHECK-BE: blr
+}
+
+define <2 x double> @test_be_swap_vec_xxpermdi_v2f64_v2f64_3(<2 x double> %VA, <2 x double> %VB) {
+ entry:
+ %0 = shufflevector <2 x double> %VA, <2 x double> %VB,<2 x i32> <i32 3, i32 1>
+ ret <2 x double> %0
+; CHECK-BE-LABEL: @test_be_swap_vec_xxpermdi_v2f64_v2f64_3
+; CHECK-BE: xxmrgld 34, 35, 34
+; CHECK-BE: blr
+}
+
+define <2 x double> @test_be_vec_xxpermdi_v2f64_undef_0(<2 x double> %VA) {
+ entry:
+ %0 = shufflevector <2 x double> %VA, <2 x double> undef, <2 x i32> <i32 0, i32 0>
+ ret <2 x double> %0
+; CHECK-BE-LABEL: @test_be_vec_xxpermdi_v2f64_undef_0
+; CHECK-BE: xxspltd 34, 34, 0
+; CHECK-BE: blr
+}
+
+define <2 x double> @test_be_vec_xxpermdi_v2f64_undef_1(<2 x double> %VA) {
+ entry:
+ %0 = shufflevector <2 x double> %VA, <2 x double> undef, <2 x i32> <i32 0, i32 1>
+ ret <2 x double> %0
+; CHECK-BE-LABEL: @test_be_vec_xxpermdi_v2f64_undef_1
+; CHECK-BE: blr
+}
+
+define <2 x double> @test_be_vec_xxpermdi_v2f64_undef_2(<2 x double> %VA) {
+  entry:
+    %0 = shufflevector <2 x double> %VA, <2 x double> undef, <2 x i32> <i32 1, i32 0>
+    ret <2 x double> %0
+; CHECK-BE-LABEL: @test_be_vec_xxpermdi_v2f64_undef_2
+; CHECK-BE: xxswapd 34, 34
+}
+
+define <2 x double> @test_be_vec_xxpermdi_v2f64_undef_3(<2 x double> %VA) {
+ entry:
+ %0 = shufflevector <2 x double> %VA, <2 x double> undef, <2 x i32> <i32 1, i32 1>
+ ret <2 x double> %0
+; CHECK-BE-LABEL: @test_be_vec_xxpermdi_v2f64_undef_3
+; CHECK-BE: xxspltd 34, 34, 1
+; CHECK-BE: blr
+}
+
+; More test cases to test different types of vector inputs
+define <16 x i8> @test_be_vec_xxpermdi_v16i8_v16i8(<16 x i8> %VA, <16 x i8> %VB) {
+ entry:
+ %0 = shufflevector <16 x i8> %VA, <16 x i8> %VB,<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
+ ret <16 x i8> %0
+; CHECK-BE-LABEL: @test_be_vec_xxpermdi_v16i8_v16i8
+; CHECK-BE: xxpermdi 34, 34, 35, 1
+; CHECK-BE: blr
+}
+
+define <8 x i16> @test_le_swap_vec_xxpermdi_v8i16_v8i16(<8 x i16> %VA, <8 x i16> %VB) {
+ entry:
+ %0 = shufflevector <8 x i16> %VA, <8 x i16> %VB,<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+ ret <8 x i16> %0
+; CHECK-LE-LABEL: @test_le_swap_vec_xxpermdi_v8i16_v8i16
+; CHECK-LE: xxpermdi 34, 35, 34, 1
+; CHECK-LE: blr
+}
+
+define <4 x i32> @test_le_swap_vec_xxpermdi_v4i32_v4i32(<4 x i32> %VA, <4 x i32> %VB) {
+ entry:
+ %0 = shufflevector <4 x i32> %VA, <4 x i32> %VB,<4 x i32> <i32 0, i32 1, i32 6, i32 7>
+ ret <4 x i32> %0
+; CHECK-LE-LABEL: @test_le_swap_vec_xxpermdi_v4i32_v4i32
+; CHECK-LE: xxpermdi 34, 35, 34, 1
+; CHECK-LE: blr
+}