author    Hans Wennborg <hans@hanshq.net>    2018-02-14 10:51:00 +0000
committer Hans Wennborg <hans@hanshq.net>    2018-02-14 10:51:00 +0000
commit    a1b3df34e9efd6b17978356eb26b2c8f7286db4d
tree      dd0a621757b601b5fd3e03b0ad61625519ed1468
parent    3af8845fb1af47a669ab753face6fbaaee9a44a1

Revert r319778 (and r319911) due to PR36357

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_60@325112 91177308-0d34-0410-b5e6-96231b3b80d8
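
In effect, the revert restores the llvm.x86.avx512.kunpck.{bw,wd,dq} intrinsics (and their table-driven lowering) while deleting both the AutoUpgrade expansion and the combineBitcastvxi1 pattern that had replaced them. All three intrinsics concatenate the low halves of two mask operands. A minimal scalar model in C++, mirroring the and/shl/or expansion removed from AutoUpgrade.cpp below (the function name is illustrative, not from the patch):

#include <cstdint>

// Scalar model of llvm.x86.avx512.kunpck.{bw,wd,dq}: the low half of A
// fills the low bits of the result, the low half of B the high bits.
template <typename T>
static T kunpck(T A, T B) {
  const unsigned Shift = sizeof(T) * 8 / 2;  // half of the result width
  const T Mask = (T(1) << Shift) - 1;        // selects the low half
  return T((A & Mask) | (B << Shift));
}
// e.g. kunpck<uint16_t>(0x00AB, 0x00CD) == 0xCDAB  (the .bw variant)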
-rw-r--r--  include/llvm/IR/IntrinsicsX86.td                  |   9
-rw-r--r--  lib/IR/AutoUpgrade.cpp                            |   7
-rw-r--r--  lib/Target/X86/X86ISelLowering.cpp                |  47
-rw-r--r--  lib/Target/X86/X86IntrinsicsInfo.h                |   3
-rw-r--r--  test/CodeGen/X86/avx512-intrinsics-fast-isel.ll   |  53
-rw-r--r--  test/CodeGen/X86/avx512-intrinsics-upgrade.ll     |  14
-rw-r--r--  test/CodeGen/X86/avx512-intrinsics.ll             |  15
-rw-r--r--  test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll | 371
-rw-r--r--  test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll   |  40
-rw-r--r--  test/CodeGen/X86/avx512bw-intrinsics.ll           |  49
10 files changed, 206 insertions(+), 402 deletions(-)
diff --git a/include/llvm/IR/IntrinsicsX86.td b/include/llvm/IR/IntrinsicsX86.td
index bd6177c5b3d..7c000e2b1dc 100644
--- a/include/llvm/IR/IntrinsicsX86.td
+++ b/include/llvm/IR/IntrinsicsX86.td
@@ -3738,6 +3738,15 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
def int_x86_avx512_kxnor_w : // TODO: remove this intrinsic
Intrinsic<[llvm_i16_ty], [llvm_i16_ty, llvm_i16_ty],
[IntrNoMem]>;
+ def int_x86_avx512_kunpck_bw : GCCBuiltin<"__builtin_ia32_kunpckhi">,
+ Intrinsic<[llvm_i16_ty], [llvm_i16_ty, llvm_i16_ty],
+ [IntrNoMem]>;
+ def int_x86_avx512_kunpck_wd : GCCBuiltin<"__builtin_ia32_kunpcksi">,
+ Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
+ [IntrNoMem]>;
+ def int_x86_avx512_kunpck_dq : GCCBuiltin<"__builtin_ia32_kunpckdi">,
+ Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty],
+ [IntrNoMem]>;
def int_x86_avx512_kortestz_w : GCCBuiltin<"__builtin_ia32_kortestzhi">,
Intrinsic<[llvm_i32_ty], [llvm_i16_ty, llvm_i16_ty],
[IntrNoMem]>;
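
The restored GCCBuiltin bindings make the intrinsics reachable again from clang's mask-unpack wrappers in <immintrin.h>. A hedged usage sketch (compile with -mavx512f; per the Intel intrinsics guide, _mm512_kunpackb places its first argument in the high byte; the wrapper name below is illustrative):

#include <immintrin.h>

// Build a 16-bit mask with 'hi' in bits 15:8 and 'lo' in bits 7:0.
// _mm512_kunpackb reaches llvm.x86.avx512.kunpck.bw through
// __builtin_ia32_kunpckhi, per the binding above.
static __mmask16 concat_masks(__mmask16 hi, __mmask16 lo) {
  return _mm512_kunpackb(hi, lo);
}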
diff --git a/lib/IR/AutoUpgrade.cpp b/lib/IR/AutoUpgrade.cpp
index c258d1a4e3a..c56a022c670 100644
--- a/lib/IR/AutoUpgrade.cpp
+++ b/lib/IR/AutoUpgrade.cpp
@@ -75,7 +75,6 @@ static bool ShouldUpgradeX86Intrinsic(Function *F, StringRef Name) {
Name=="ssse3.pabs.d.128" || // Added in 6.0
Name.startswith("avx512.mask.shuf.i") || // Added in 6.0
Name.startswith("avx512.mask.shuf.f") || // Added in 6.0
- Name.startswith("avx512.kunpck") || //added in 6.0
Name.startswith("avx2.pabs.") || // Added in 6.0
Name.startswith("avx512.mask.pabs.") || // Added in 6.0
Name.startswith("avx512.broadcastm") || // Added in 6.0
@@ -1063,12 +1062,6 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
Rep = Builder.CreateVectorSplat(NumElts, CI->getArgOperand(0));
Rep = EmitX86Select(Builder, CI->getArgOperand(2), Rep,
CI->getArgOperand(1));
- } else if (IsX86 && (Name.startswith("avx512.kunpck"))) {
- uint64_t Shift = CI->getType()->getScalarSizeInBits() / 2;
- uint64_t And = (1ULL << Shift) - 1;
- Value* LowBits = Builder.CreateAnd(CI->getArgOperand(0), And);
- Value* HighBits = Builder.CreateShl(CI->getArgOperand(1), Shift);
- Rep = Builder.CreateOr(LowBits, HighBits);
} else if (IsX86 && (Name == "sse.add.ss" || Name == "sse2.add.sd")) {
Type *I32Ty = Type::getInt32Ty(C);
Value *Elt0 = Builder.CreateExtractElement(CI->getArgOperand(0),
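
With the upgrade arm gone, bitcode and .ll input calling these intrinsics keeps the calls verbatim and relies on the backend lowering rather than an IR-level rewrite. A minimal IRBuilder sketch of emitting such a call (illustrative; the function name and the i16 LoMask/HiMask values are assumptions, not part of the patch):

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"

static llvm::Value *emitKUnpckBW(llvm::Module *M, llvm::IRBuilder<> &Builder,
                                 llvm::Value *LoMask, llvm::Value *HiMask) {
  // Declares (or reuses) i16 @llvm.x86.avx512.kunpck.bw(i16, i16);
  // LoMask supplies the low byte of the result, HiMask the high byte.
  llvm::Function *KUnpck = llvm::Intrinsic::getDeclaration(
      M, llvm::Intrinsic::x86_avx512_kunpck_bw);
  return Builder.CreateCall(KUnpck, {LoMask, HiMask});
}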
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 6a417b7f682..9237833a2cd 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -30466,53 +30466,6 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast,
SDValue N0 = BitCast.getOperand(0);
EVT VecVT = N0->getValueType(0);
- if (VT.isVector() && VecVT.isScalarInteger() && Subtarget.hasAVX512() &&
- N0->getOpcode() == ISD::OR) {
- SDValue Op0 = N0->getOperand(0);
- SDValue Op1 = N0->getOperand(1);
- MVT TrunckVT;
- MVT BitcastVT;
- switch (VT.getSimpleVT().SimpleTy) {
- default:
- return SDValue();
- case MVT::v16i1:
- TrunckVT = MVT::i8;
- BitcastVT = MVT::v8i1;
- break;
- case MVT::v32i1:
- TrunckVT = MVT::i16;
- BitcastVT = MVT::v16i1;
- break;
- case MVT::v64i1:
- TrunckVT = MVT::i32;
- BitcastVT = MVT::v32i1;
- break;
- }
- bool isArg0UndefRight = Op0->getOpcode() == ISD::SHL;
- bool isArg0UndefLeft =
- Op0->getOpcode() == ISD::ZERO_EXTEND || Op0->getOpcode() == ISD::AND;
- bool isArg1UndefRight = Op1->getOpcode() == ISD::SHL;
- bool isArg1UndefLeft =
- Op1->getOpcode() == ISD::ZERO_EXTEND || Op1->getOpcode() == ISD::AND;
- SDValue OpLeft;
- SDValue OpRight;
- if (isArg0UndefRight && isArg1UndefLeft) {
- OpLeft = Op0;
- OpRight = Op1;
- } else if (isArg1UndefRight && isArg0UndefLeft) {
- OpLeft = Op1;
- OpRight = Op0;
- } else
- return SDValue();
- SDLoc DL(BitCast);
- SDValue Shr = OpLeft->getOperand(0);
- SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, TrunckVT, Shr);
- SDValue Bitcast1 = DAG.getBitcast(BitcastVT, Trunc1);
- SDValue Trunc2 = DAG.getNode(ISD::TRUNCATE, DL, TrunckVT, OpRight);
- SDValue Bitcast2 = DAG.getBitcast(BitcastVT, Trunc2);
- return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Bitcast1, Bitcast2);
- }
-
if (!VT.isScalarInteger() || !VecVT.isSimple())
return SDValue();
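
Worth noting about the deleted combine: it classified the OR operands by opcode alone — any SHL counted as "low bits undef" and any ZERO_EXTEND or AND as "high bits undef" — without verifying the shift amount or proving the other operand's high half is zero, which the CONCAT_VECTORS rewrite assumes. A small runnable illustration of how that assumption can fail, in plain scalar arithmetic (not LLVM code, and not necessarily the exact PR36357 case):

#include <cassert>
#include <cstdint>

// What the combine assumed (or (shl X, 16), Y) means for a 32-bit value:
// the low half of Y in the low bits, the low half of X in the high bits.
static uint32_t assumed_concat(uint32_t X, uint32_t Y) {
  return (Y & 0xFFFFu) | (X << 16);
}

int main() {
  // Y would pass an opcode-only "AND" check yet has bits above the low
  // half, so the real OR no longer matches the assumed concatenation.
  uint32_t X = 0x1, Y = 0x30000;
  uint32_t real = (X << 16) | Y;
  assert(real == 0x30000 && assumed_concat(X, Y) == 0x10000);
  return 0;
}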
diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h
index 7821971b4a2..fae0889950b 100644
--- a/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/lib/Target/X86/X86IntrinsicsInfo.h
@@ -479,6 +479,9 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_exp2_ps, INTR_TYPE_1OP_MASK_RM, X86ISD::EXP2, 0),
X86_INTRINSIC_DATA(avx512_kand_w, MASK_BINOP, ISD::AND, 0),
X86_INTRINSIC_DATA(avx512_kor_w, MASK_BINOP, ISD::OR, 0),
+ X86_INTRINSIC_DATA(avx512_kunpck_bw, KUNPCK, ISD::CONCAT_VECTORS, 0),
+ X86_INTRINSIC_DATA(avx512_kunpck_dq, KUNPCK, ISD::CONCAT_VECTORS, 0),
+ X86_INTRINSIC_DATA(avx512_kunpck_wd, KUNPCK, ISD::CONCAT_VECTORS, 0),
X86_INTRINSIC_DATA(avx512_kxor_w, MASK_BINOP, ISD::XOR, 0),
X86_INTRINSIC_DATA(avx512_mask_add_pd_512, INTR_TYPE_2OP_MASK, ISD::FADD,
X86ISD::FADD_RND),
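
The KUNPCK table entries route the restored intrinsics to a generic ISD::CONCAT_VECTORS lowering: each scalar operand is truncated to its low half, reinterpreted as an i1 mask vector, and the two halves are concatenated. A rough, simplified sketch of that lowering (not the verbatim LowerINTRINSIC_WO_CHAIN code; the function name is illustrative):

static SDValue lowerKUNPCK(SDValue Op, SelectionDAG &DAG, const SDLoc &dl) {
  MVT VT = Op.getSimpleValueType();                  // i16, i32 or i64
  unsigned Half = VT.getSizeInBits() / 2;
  MVT HalfIntVT = MVT::getIntegerVT(Half);
  MVT HalfMaskVT = MVT::getVectorVT(MVT::i1, Half);
  // Keep only the low half of each operand and view it as a mask vector.
  // Operand 0 of Op is the intrinsic ID; operands 1 and 2 are the masks.
  SDValue Lo = DAG.getBitcast(
      HalfMaskVT, DAG.getNode(ISD::TRUNCATE, dl, HalfIntVT, Op.getOperand(1)));
  SDValue Hi = DAG.getBitcast(
      HalfMaskVT, DAG.getNode(ISD::TRUNCATE, dl, HalfIntVT, Op.getOperand(2)));
  // The first mask provides the low lanes, the second the high lanes.
  MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits());
  return DAG.getBitcast(VT,
                        DAG.getNode(ISD::CONCAT_VECTORS, dl, MaskVT, Lo, Hi));
}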
diff --git a/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll b/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
index 50de773af00..80127f66bdf 100644
--- a/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
@@ -5,59 +5,6 @@
; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512f-builtins.c
-define zeroext i16 @test_mm512_kunpackb(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, <8 x i64> %__D, <8 x i64> %__E, <8 x i64> %__F) local_unnamed_addr #0 {
-; X32-LABEL: test_mm512_kunpackb:
-; X32: # %bb.0: # %entry
-; X32-NEXT: pushl %ebp
-; X32-NEXT: .cfi_def_cfa_offset 8
-; X32-NEXT: .cfi_offset %ebp, -8
-; X32-NEXT: movl %esp, %ebp
-; X32-NEXT: .cfi_def_cfa_register %ebp
-; X32-NEXT: andl $-64, %esp
-; X32-NEXT: subl $64, %esp
-; X32-NEXT: vmovdqa64 136(%ebp), %zmm3
-; X32-NEXT: vpcmpneqd %zmm1, %zmm0, %k0
-; X32-NEXT: vpcmpneqd 8(%ebp), %zmm2, %k1
-; X32-NEXT: kunpckbw %k0, %k1, %k1
-; X32-NEXT: vpcmpneqd 72(%ebp), %zmm3, %k0 {%k1}
-; X32-NEXT: kmovw %k0, %eax
-; X32-NEXT: movzwl %ax, %eax
-; X32-NEXT: movl %ebp, %esp
-; X32-NEXT: popl %ebp
-; X32-NEXT: vzeroupper
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm512_kunpackb:
-; X64: # %bb.0: # %entry
-; X64-NEXT: vpcmpneqd %zmm1, %zmm0, %k0
-; X64-NEXT: vpcmpneqd %zmm3, %zmm2, %k1
-; X64-NEXT: kunpckbw %k0, %k1, %k1
-; X64-NEXT: vpcmpneqd %zmm5, %zmm4, %k0 {%k1}
-; X64-NEXT: kmovw %k0, %eax
-; X64-NEXT: movzwl %ax, %eax
-; X64-NEXT: vzeroupper
-; X64-NEXT: retq
-entry:
- %0 = bitcast <8 x i64> %__A to <16 x i32>
- %1 = bitcast <8 x i64> %__B to <16 x i32>
- %2 = icmp ne <16 x i32> %0, %1
- %3 = bitcast <16 x i1> %2 to i16
- %4 = bitcast <8 x i64> %__C to <16 x i32>
- %5 = bitcast <8 x i64> %__D to <16 x i32>
- %6 = icmp ne <16 x i32> %4, %5
- %7 = bitcast <16 x i1> %6 to i16
- %8 = and i16 %7, 255
- %shl.i = shl i16 %3, 8
- %or.i = or i16 %8, %shl.i
- %9 = bitcast <8 x i64> %__E to <16 x i32>
- %10 = bitcast <8 x i64> %__F to <16 x i32>
- %11 = icmp ne <16 x i32> %9, %10
- %12 = bitcast i16 %or.i to <16 x i1>
- %13 = and <16 x i1> %11, %12
- %14 = bitcast <16 x i1> %13 to i16
- ret i16 %14
-}
-
define <16 x float> @test_mm512_shuffle_f32x4(<16 x float> %__A, <16 x float> %__B) {
; X32-LABEL: test_mm512_shuffle_f32x4:
; X32: # %bb.0: # %entry
diff --git a/test/CodeGen/X86/avx512-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
index f3ca0644e46..378dbda2dc0 100644
--- a/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
@@ -1,20 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
-declare i16 @llvm.x86.avx512.kunpck.bw(i16, i16) nounwind readnone
-
-define i16 @unpckbw_test(i16 %a0, i16 %a1) {
-; CHECK-LABEL: unpckbw_test:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: shll $8, %esi
-; CHECK-NEXT: orl %esi, %eax
-; CHECK-NEXT: ## kill: def %ax killed %ax killed %eax
-; CHECK-NEXT: retq
- %res = call i16 @llvm.x86.avx512.kunpck.bw(i16 %a0, i16 %a1)
- ret i16 %res
-}
-
define <16 x i32>@test_int_x86_avx512_mask_pbroadcastd_gpr_512(i32 %x0, <16 x i32> %x1, i16 %mask) {
; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcastd_gpr_512:
; CHECK: ## %bb.0:
diff --git a/test/CodeGen/X86/avx512-intrinsics.ll b/test/CodeGen/X86/avx512-intrinsics.ll
index 5faa202c30f..628199d4ac9 100644
--- a/test/CodeGen/X86/avx512-intrinsics.ll
+++ b/test/CodeGen/X86/avx512-intrinsics.ll
@@ -96,6 +96,21 @@ define i16 @test_kor(i16 %a0, i16 %a1) {
ret i16 %t2
}
+declare i16 @llvm.x86.avx512.kunpck.bw(i16, i16) nounwind readnone
+
+define i16 @unpckbw_test(i16 %a0, i16 %a1) {
+; CHECK-LABEL: unpckbw_test:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovw %edi, %k0
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: kunpckbw %k1, %k0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: ## kill: def %ax killed %ax killed %eax
+; CHECK-NEXT: retq
+ %res = call i16 @llvm.x86.avx512.kunpck.bw(i16 %a0, i16 %a1)
+ ret i16 %res
+}
+
declare i16 @llvm.x86.avx512.kxnor.w(i16, i16) nounwind readnone
; TODO: the two kxnor instructions here a no op and should be elimintaed,
; probably by FoldConstantArithmetic in SelectionDAG.
diff --git a/test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll b/test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll
index 1e754be6fe4..a56111f3453 100644
--- a/test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll
@@ -4,117 +4,6 @@
; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512bw-builtins.c
-define i64 @test_mm512_kunpackd(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, <8 x i64> %__D, <8 x i64> %__E, <8 x i64> %__F) {
-; X32-LABEL: test_mm512_kunpackd:
-; X32: # %bb.0: # %entry
-; X32-NEXT: pushl %ebp
-; X32-NEXT: .cfi_def_cfa_offset 8
-; X32-NEXT: .cfi_offset %ebp, -8
-; X32-NEXT: movl %esp, %ebp
-; X32-NEXT: .cfi_def_cfa_register %ebp
-; X32-NEXT: andl $-64, %esp
-; X32-NEXT: subl $64, %esp
-; X32-NEXT: vmovdqa64 136(%ebp), %zmm3
-; X32-NEXT: vmovdqa64 72(%ebp), %zmm4
-; X32-NEXT: vmovdqa64 8(%ebp), %zmm5
-; X32-NEXT: vpcmpneqb %zmm0, %zmm1, %k0
-; X32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
-; X32-NEXT: vpcmpneqb %zmm5, %zmm2, %k0
-; X32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
-; X32-NEXT: kmovd {{[0-9]+}}(%esp), %k0
-; X32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
-; X32-NEXT: kunpckdq %k0, %k1, %k1
-; X32-NEXT: vpcmpneqb %zmm3, %zmm4, %k0 {%k1}
-; X32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl %ebp, %esp
-; X32-NEXT: popl %ebp
-; X32-NEXT: vzeroupper
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm512_kunpackd:
-; X64: # %bb.0: # %entry
-; X64-NEXT: vpcmpneqb %zmm0, %zmm1, %k0
-; X64-NEXT: vpcmpneqb %zmm3, %zmm2, %k1
-; X64-NEXT: kunpckdq %k0, %k1, %k1
-; X64-NEXT: vpcmpneqb %zmm5, %zmm4, %k0 {%k1}
-; X64-NEXT: kmovq %k0, %rax
-; X64-NEXT: vzeroupper
-; X64-NEXT: retq
-entry:
- %0 = bitcast <8 x i64> %__B to <64 x i8>
- %1 = bitcast <8 x i64> %__A to <64 x i8>
- %2 = icmp ne <64 x i8> %0, %1
- %3 = bitcast <64 x i1> %2 to i64
- %4 = bitcast <8 x i64> %__C to <64 x i8>
- %5 = bitcast <8 x i64> %__D to <64 x i8>
- %6 = icmp ne <64 x i8> %4, %5
- %7 = bitcast <64 x i1> %6 to i64
- %and.i = and i64 %7, 4294967295
- %shl.i = shl i64 %3, 32
- %or.i = or i64 %and.i, %shl.i
- %8 = bitcast <8 x i64> %__E to <64 x i8>
- %9 = bitcast <8 x i64> %__F to <64 x i8>
- %10 = icmp ne <64 x i8> %8, %9
- %11 = bitcast i64 %or.i to <64 x i1>
- %12 = and <64 x i1> %10, %11
- %13 = bitcast <64 x i1> %12 to i64
- ret i64 %13
-}
-
-define i32 @test_mm512_kunpackw(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, <8 x i64> %__D, <8 x i64> %__E, <8 x i64> %__F) {
-; X32-LABEL: test_mm512_kunpackw:
-; X32: # %bb.0: # %entry
-; X32-NEXT: pushl %ebp
-; X32-NEXT: .cfi_def_cfa_offset 8
-; X32-NEXT: .cfi_offset %ebp, -8
-; X32-NEXT: movl %esp, %ebp
-; X32-NEXT: .cfi_def_cfa_register %ebp
-; X32-NEXT: andl $-64, %esp
-; X32-NEXT: subl $64, %esp
-; X32-NEXT: vmovdqa64 136(%ebp), %zmm3
-; X32-NEXT: vpcmpneqw %zmm0, %zmm1, %k0
-; X32-NEXT: vpcmpneqw 8(%ebp), %zmm2, %k1
-; X32-NEXT: kunpckwd %k0, %k1, %k1
-; X32-NEXT: vpcmpneqw 72(%ebp), %zmm3, %k0 {%k1}
-; X32-NEXT: kmovd %k0, %eax
-; X32-NEXT: movl %ebp, %esp
-; X32-NEXT: popl %ebp
-; X32-NEXT: vzeroupper
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm512_kunpackw:
-; X64: # %bb.0: # %entry
-; X64-NEXT: vpcmpneqw %zmm0, %zmm1, %k0
-; X64-NEXT: vpcmpneqw %zmm3, %zmm2, %k1
-; X64-NEXT: kunpckwd %k0, %k1, %k1
-; X64-NEXT: vpcmpneqw %zmm5, %zmm4, %k0 {%k1}
-; X64-NEXT: kmovd %k0, %eax
-; X64-NEXT: vzeroupper
-; X64-NEXT: retq
-entry:
- %0 = bitcast <8 x i64> %__B to <32 x i16>
- %1 = bitcast <8 x i64> %__A to <32 x i16>
- %2 = icmp ne <32 x i16> %0, %1
- %3 = bitcast <32 x i1> %2 to i32
- %4 = bitcast <8 x i64> %__C to <32 x i16>
- %5 = bitcast <8 x i64> %__D to <32 x i16>
- %6 = icmp ne <32 x i16> %4, %5
- %7 = bitcast <32 x i1> %6 to i32
- %and.i = and i32 %7, 65535
- %shl.i = shl i32 %3, 16
- %or.i = or i32 %and.i, %shl.i
- %8 = bitcast <8 x i64> %__E to <32 x i16>
- %9 = bitcast <8 x i64> %__F to <32 x i16>
- %10 = icmp ne <32 x i16> %8, %9
- %11 = bitcast i32 %or.i to <32 x i1>
- %12 = and <32 x i1> %10, %11
- %13 = bitcast <32 x i1> %12 to i32
- ret i32 %13
-}
-
-
define <8 x i64> @test_mm512_mask_set1_epi8(<8 x i64> %__O, i64 %__M, i8 signext %__A) {
; X32-LABEL: test_mm512_mask_set1_epi8:
; X32: # %bb.0: # %entry
@@ -189,46 +78,19 @@ define <8 x i64> @test_mm512_mask_set1_epi8(<8 x i64> %__O, i64 %__M, i8 signext
; X32-NEXT: movb %ch, %al
; X32-NEXT: kmovd %eax, %k2
; X32-NEXT: kxorq %k2, %k1, %k1
-; X32-NEXT: kshiftlq $63, %k1, %k1
-; X32-NEXT: kshiftrq $55, %k1, %k1
-; X32-NEXT: kxorq %k0, %k1, %k0
-; X32-NEXT: kshiftrq $9, %k0, %k1
; X32-NEXT: andb $2, %al
; X32-NEXT: shrb %al
; X32-NEXT: kmovd %eax, %k2
-; X32-NEXT: kxorq %k2, %k1, %k1
-; X32-NEXT: kshiftlq $63, %k1, %k1
-; X32-NEXT: kshiftrq $54, %k1, %k1
-; X32-NEXT: kxorq %k0, %k1, %k0
-; X32-NEXT: kshiftrq $10, %k0, %k1
; X32-NEXT: movb %ch, %al
; X32-NEXT: andb $15, %al
; X32-NEXT: movl %eax, %edx
; X32-NEXT: shrb $2, %dl
-; X32-NEXT: kmovd %edx, %k2
-; X32-NEXT: kxorq %k2, %k1, %k1
-; X32-NEXT: kshiftlq $63, %k1, %k1
-; X32-NEXT: kshiftrq $53, %k1, %k1
-; X32-NEXT: kxorq %k0, %k1, %k0
-; X32-NEXT: kshiftrq $11, %k0, %k1
+; X32-NEXT: kmovd %edx, %k3
; X32-NEXT: shrb $3, %al
-; X32-NEXT: kmovd %eax, %k2
-; X32-NEXT: kxorq %k2, %k1, %k1
-; X32-NEXT: movl %ecx, %eax
-; X32-NEXT: shrl $12, %eax
-; X32-NEXT: andl $15, %eax
-; X32-NEXT: kmovd %eax, %k2
+; X32-NEXT: kmovd %eax, %k4
; X32-NEXT: movl %ecx, %eax
; X32-NEXT: shrl $13, %eax
; X32-NEXT: andb $1, %al
-; X32-NEXT: kmovd %eax, %k3
-; X32-NEXT: movl %ecx, %eax
-; X32-NEXT: shrl $14, %eax
-; X32-NEXT: andl $3, %eax
-; X32-NEXT: kmovd %eax, %k4
-; X32-NEXT: movl %ecx, %eax
-; X32-NEXT: shrl $15, %eax
-; X32-NEXT: andl $1, %eax
; X32-NEXT: kmovd %eax, %k5
; X32-NEXT: movl %ecx, %edx
; X32-NEXT: shrl $16, %edx
@@ -243,25 +105,52 @@ define <8 x i64> @test_mm512_mask_set1_epi8(<8 x i64> %__O, i64 %__M, i8 signext
; X32-NEXT: kmovd %eax, %k7
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $55, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $9, %k0, %k1
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $54, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $10, %k0, %k1
+; X32-NEXT: kxorq %k3, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $53, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $11, %k0, %k1
+; X32-NEXT: kxorq %k4, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $52, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $12, %k0, %k1
+; X32-NEXT: movl %ecx, %esi
+; X32-NEXT: shrl $12, %esi
+; X32-NEXT: andl $15, %esi
+; X32-NEXT: kmovd %esi, %k2
; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $51, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $13, %k0, %k1
-; X32-NEXT: kxorq %k3, %k1, %k1
+; X32-NEXT: kxorq %k5, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $50, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $14, %k0, %k1
-; X32-NEXT: kxorq %k4, %k1, %k1
+; X32-NEXT: movl %ecx, %esi
+; X32-NEXT: shrl $14, %esi
+; X32-NEXT: andl $3, %esi
+; X32-NEXT: kmovd %esi, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $49, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $15, %k0, %k1
-; X32-NEXT: kxorq %k5, %k1, %k1
+; X32-NEXT: movl %ecx, %esi
+; X32-NEXT: shrl $15, %esi
+; X32-NEXT: andl $1, %esi
+; X32-NEXT: kmovd %esi, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $48, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
@@ -494,22 +383,14 @@ define <8 x i64> @test_mm512_mask_set1_epi8(<8 x i64> %__O, i64 %__M, i8 signext
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $43, %k0, %k1
; X32-NEXT: kxorq %k4, %k1, %k1
-; X32-NEXT: movl %eax, %esi
-; X32-NEXT: shrl $12, %esi
-; X32-NEXT: andl $15, %esi
-; X32-NEXT: kmovd %esi, %k2
-; X32-NEXT: movl %eax, %esi
-; X32-NEXT: shrl $14, %esi
-; X32-NEXT: andl $3, %esi
-; X32-NEXT: kmovd %esi, %k3
-; X32-NEXT: movl %eax, %esi
-; X32-NEXT: shrl $15, %esi
-; X32-NEXT: andl $1, %esi
-; X32-NEXT: kmovd %esi, %k4
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $20, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $44, %k0, %k1
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: shrl $12, %esi
+; X32-NEXT: andl $15, %esi
+; X32-NEXT: kmovd %esi, %k2
; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $19, %k1, %k1
@@ -520,12 +401,20 @@ define <8 x i64> @test_mm512_mask_set1_epi8(<8 x i64> %__O, i64 %__M, i8 signext
; X32-NEXT: kshiftrq $18, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $46, %k0, %k1
-; X32-NEXT: kxorq %k3, %k1, %k1
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: shrl $14, %esi
+; X32-NEXT: andl $3, %esi
+; X32-NEXT: kmovd %esi, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $17, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $47, %k0, %k1
-; X32-NEXT: kxorq %k4, %k1, %k1
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: shrl $15, %esi
+; X32-NEXT: andl $1, %esi
+; X32-NEXT: kmovd %esi, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $16, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
@@ -551,8 +440,8 @@ define <8 x i64> @test_mm512_mask_set1_epi8(<8 x i64> %__O, i64 %__M, i8 signext
; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $12, %k1, %k1
-; X32-NEXT: kxorq %k0, %k1, %k4
-; X32-NEXT: kshiftrq $52, %k4, %k0
+; X32-NEXT: kxorq %k0, %k1, %k3
+; X32-NEXT: kshiftrq $52, %k3, %k0
; X32-NEXT: movl %ecx, %edx
; X32-NEXT: shrb $4, %dl
; X32-NEXT: kmovd %edx, %k1
@@ -576,19 +465,19 @@ define <8 x i64> @test_mm512_mask_set1_epi8(<8 x i64> %__O, i64 %__M, i8 signext
; X32-NEXT: andb $15, %cl
; X32-NEXT: movl %ecx, %edx
; X32-NEXT: shrb $2, %dl
-; X32-NEXT: kmovd %edx, %k3
+; X32-NEXT: kmovd %edx, %k4
; X32-NEXT: kshiftlq $63, %k5, %k5
; X32-NEXT: kshiftrq $11, %k5, %k5
-; X32-NEXT: kxorq %k4, %k5, %k4
-; X32-NEXT: kshiftrq $53, %k4, %k5
+; X32-NEXT: kxorq %k3, %k5, %k3
+; X32-NEXT: kshiftrq $53, %k3, %k5
; X32-NEXT: kxorq %k6, %k5, %k5
; X32-NEXT: kshiftlq $63, %k5, %k5
; X32-NEXT: kshiftrq $10, %k5, %k5
-; X32-NEXT: kxorq %k4, %k5, %k5
-; X32-NEXT: kshiftrq $54, %k5, %k4
-; X32-NEXT: kxorq %k7, %k4, %k6
+; X32-NEXT: kxorq %k3, %k5, %k5
+; X32-NEXT: kshiftrq $54, %k5, %k3
+; X32-NEXT: kxorq %k7, %k3, %k6
; X32-NEXT: shrb $3, %cl
-; X32-NEXT: kmovd %ecx, %k4
+; X32-NEXT: kmovd %ecx, %k3
; X32-NEXT: movl %eax, %ecx
; X32-NEXT: shrl $29, %ecx
; X32-NEXT: andb $1, %cl
@@ -603,12 +492,6 @@ define <8 x i64> @test_mm512_mask_set1_epi8(<8 x i64> %__O, i64 %__M, i8 signext
; X32-NEXT: kxorq %k5, %k0, %k0
; X32-NEXT: kshiftrq $56, %k0, %k5
; X32-NEXT: kxorq %k1, %k5, %k1
-; X32-NEXT: movl %eax, %ecx
-; X32-NEXT: shrl $28, %ecx
-; X32-NEXT: kmovd %ecx, %k5
-; X32-NEXT: movl %eax, %ecx
-; X32-NEXT: shrl $30, %ecx
-; X32-NEXT: kmovd %ecx, %k6
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $7, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
@@ -618,17 +501,20 @@ define <8 x i64> @test_mm512_mask_set1_epi8(<8 x i64> %__O, i64 %__M, i8 signext
; X32-NEXT: kshiftrq $6, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $58, %k0, %k1
-; X32-NEXT: kxorq %k3, %k1, %k1
+; X32-NEXT: kxorq %k4, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $5, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $59, %k0, %k1
-; X32-NEXT: kxorq %k4, %k1, %k1
+; X32-NEXT: kxorq %k3, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $4, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $60, %k0, %k1
-; X32-NEXT: kxorq %k5, %k1, %k1
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrl $28, %ecx
+; X32-NEXT: kmovd %ecx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $3, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
@@ -638,7 +524,10 @@ define <8 x i64> @test_mm512_mask_set1_epi8(<8 x i64> %__O, i64 %__M, i8 signext
; X32-NEXT: kshiftrq $2, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $62, %k0, %k1
-; X32-NEXT: kxorq %k6, %k1, %k1
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrl $30, %ecx
+; X32-NEXT: kmovd %ecx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: shrl $31, %eax
; X32-NEXT: kmovd %eax, %k2
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
@@ -743,46 +632,19 @@ define <8 x i64> @test_mm512_maskz_set1_epi8(i64 %__M, i8 signext %__A) {
; X32-NEXT: movb %ch, %al
; X32-NEXT: kmovd %eax, %k2
; X32-NEXT: kxorq %k2, %k1, %k1
-; X32-NEXT: kshiftlq $63, %k1, %k1
-; X32-NEXT: kshiftrq $55, %k1, %k1
-; X32-NEXT: kxorq %k0, %k1, %k0
-; X32-NEXT: kshiftrq $9, %k0, %k1
; X32-NEXT: andb $2, %al
; X32-NEXT: shrb %al
; X32-NEXT: kmovd %eax, %k2
-; X32-NEXT: kxorq %k2, %k1, %k1
-; X32-NEXT: kshiftlq $63, %k1, %k1
-; X32-NEXT: kshiftrq $54, %k1, %k1
-; X32-NEXT: kxorq %k0, %k1, %k0
-; X32-NEXT: kshiftrq $10, %k0, %k1
; X32-NEXT: movb %ch, %al
; X32-NEXT: andb $15, %al
; X32-NEXT: movl %eax, %edx
; X32-NEXT: shrb $2, %dl
-; X32-NEXT: kmovd %edx, %k2
-; X32-NEXT: kxorq %k2, %k1, %k1
-; X32-NEXT: kshiftlq $63, %k1, %k1
-; X32-NEXT: kshiftrq $53, %k1, %k1
-; X32-NEXT: kxorq %k0, %k1, %k0
-; X32-NEXT: kshiftrq $11, %k0, %k1
+; X32-NEXT: kmovd %edx, %k3
; X32-NEXT: shrb $3, %al
-; X32-NEXT: kmovd %eax, %k2
-; X32-NEXT: kxorq %k2, %k1, %k1
-; X32-NEXT: movl %ecx, %eax
-; X32-NEXT: shrl $12, %eax
-; X32-NEXT: andl $15, %eax
-; X32-NEXT: kmovd %eax, %k2
+; X32-NEXT: kmovd %eax, %k4
; X32-NEXT: movl %ecx, %eax
; X32-NEXT: shrl $13, %eax
; X32-NEXT: andb $1, %al
-; X32-NEXT: kmovd %eax, %k3
-; X32-NEXT: movl %ecx, %eax
-; X32-NEXT: shrl $14, %eax
-; X32-NEXT: andl $3, %eax
-; X32-NEXT: kmovd %eax, %k4
-; X32-NEXT: movl %ecx, %eax
-; X32-NEXT: shrl $15, %eax
-; X32-NEXT: andl $1, %eax
; X32-NEXT: kmovd %eax, %k5
; X32-NEXT: movl %ecx, %edx
; X32-NEXT: shrl $16, %edx
@@ -797,25 +659,52 @@ define <8 x i64> @test_mm512_maskz_set1_epi8(i64 %__M, i8 signext %__A) {
; X32-NEXT: kmovd %eax, %k7
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $55, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $9, %k0, %k1
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $54, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $10, %k0, %k1
+; X32-NEXT: kxorq %k3, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $53, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $11, %k0, %k1
+; X32-NEXT: kxorq %k4, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $52, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $12, %k0, %k1
+; X32-NEXT: movl %ecx, %esi
+; X32-NEXT: shrl $12, %esi
+; X32-NEXT: andl $15, %esi
+; X32-NEXT: kmovd %esi, %k2
; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $51, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $13, %k0, %k1
-; X32-NEXT: kxorq %k3, %k1, %k1
+; X32-NEXT: kxorq %k5, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $50, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $14, %k0, %k1
-; X32-NEXT: kxorq %k4, %k1, %k1
+; X32-NEXT: movl %ecx, %esi
+; X32-NEXT: shrl $14, %esi
+; X32-NEXT: andl $3, %esi
+; X32-NEXT: kmovd %esi, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $49, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $15, %k0, %k1
-; X32-NEXT: kxorq %k5, %k1, %k1
+; X32-NEXT: movl %ecx, %esi
+; X32-NEXT: shrl $15, %esi
+; X32-NEXT: andl $1, %esi
+; X32-NEXT: kmovd %esi, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $48, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
@@ -1048,22 +937,14 @@ define <8 x i64> @test_mm512_maskz_set1_epi8(i64 %__M, i8 signext %__A) {
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $43, %k0, %k1
; X32-NEXT: kxorq %k4, %k1, %k1
-; X32-NEXT: movl %eax, %esi
-; X32-NEXT: shrl $12, %esi
-; X32-NEXT: andl $15, %esi
-; X32-NEXT: kmovd %esi, %k2
-; X32-NEXT: movl %eax, %esi
-; X32-NEXT: shrl $14, %esi
-; X32-NEXT: andl $3, %esi
-; X32-NEXT: kmovd %esi, %k3
-; X32-NEXT: movl %eax, %esi
-; X32-NEXT: shrl $15, %esi
-; X32-NEXT: andl $1, %esi
-; X32-NEXT: kmovd %esi, %k4
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $20, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $44, %k0, %k1
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: shrl $12, %esi
+; X32-NEXT: andl $15, %esi
+; X32-NEXT: kmovd %esi, %k2
; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $19, %k1, %k1
@@ -1074,12 +955,20 @@ define <8 x i64> @test_mm512_maskz_set1_epi8(i64 %__M, i8 signext %__A) {
; X32-NEXT: kshiftrq $18, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $46, %k0, %k1
-; X32-NEXT: kxorq %k3, %k1, %k1
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: shrl $14, %esi
+; X32-NEXT: andl $3, %esi
+; X32-NEXT: kmovd %esi, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $17, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $47, %k0, %k1
-; X32-NEXT: kxorq %k4, %k1, %k1
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: shrl $15, %esi
+; X32-NEXT: andl $1, %esi
+; X32-NEXT: kmovd %esi, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $16, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
@@ -1105,8 +994,8 @@ define <8 x i64> @test_mm512_maskz_set1_epi8(i64 %__M, i8 signext %__A) {
; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $12, %k1, %k1
-; X32-NEXT: kxorq %k0, %k1, %k4
-; X32-NEXT: kshiftrq $52, %k4, %k0
+; X32-NEXT: kxorq %k0, %k1, %k3
+; X32-NEXT: kshiftrq $52, %k3, %k0
; X32-NEXT: movl %ecx, %edx
; X32-NEXT: shrb $4, %dl
; X32-NEXT: kmovd %edx, %k1
@@ -1130,19 +1019,19 @@ define <8 x i64> @test_mm512_maskz_set1_epi8(i64 %__M, i8 signext %__A) {
; X32-NEXT: andb $15, %cl
; X32-NEXT: movl %ecx, %edx
; X32-NEXT: shrb $2, %dl
-; X32-NEXT: kmovd %edx, %k3
+; X32-NEXT: kmovd %edx, %k4
; X32-NEXT: kshiftlq $63, %k5, %k5
; X32-NEXT: kshiftrq $11, %k5, %k5
-; X32-NEXT: kxorq %k4, %k5, %k4
-; X32-NEXT: kshiftrq $53, %k4, %k5
+; X32-NEXT: kxorq %k3, %k5, %k3
+; X32-NEXT: kshiftrq $53, %k3, %k5
; X32-NEXT: kxorq %k6, %k5, %k5
; X32-NEXT: kshiftlq $63, %k5, %k5
; X32-NEXT: kshiftrq $10, %k5, %k5
-; X32-NEXT: kxorq %k4, %k5, %k5
-; X32-NEXT: kshiftrq $54, %k5, %k4
-; X32-NEXT: kxorq %k7, %k4, %k6
+; X32-NEXT: kxorq %k3, %k5, %k5
+; X32-NEXT: kshiftrq $54, %k5, %k3
+; X32-NEXT: kxorq %k7, %k3, %k6
; X32-NEXT: shrb $3, %cl
-; X32-NEXT: kmovd %ecx, %k4
+; X32-NEXT: kmovd %ecx, %k3
; X32-NEXT: movl %eax, %ecx
; X32-NEXT: shrl $29, %ecx
; X32-NEXT: andb $1, %cl
@@ -1157,12 +1046,6 @@ define <8 x i64> @test_mm512_maskz_set1_epi8(i64 %__M, i8 signext %__A) {
; X32-NEXT: kxorq %k5, %k0, %k0
; X32-NEXT: kshiftrq $56, %k0, %k5
; X32-NEXT: kxorq %k1, %k5, %k1
-; X32-NEXT: movl %eax, %ecx
-; X32-NEXT: shrl $28, %ecx
-; X32-NEXT: kmovd %ecx, %k5
-; X32-NEXT: movl %eax, %ecx
-; X32-NEXT: shrl $30, %ecx
-; X32-NEXT: kmovd %ecx, %k6
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $7, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
@@ -1172,17 +1055,20 @@ define <8 x i64> @test_mm512_maskz_set1_epi8(i64 %__M, i8 signext %__A) {
; X32-NEXT: kshiftrq $6, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $58, %k0, %k1
-; X32-NEXT: kxorq %k3, %k1, %k1
+; X32-NEXT: kxorq %k4, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $5, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $59, %k0, %k1
-; X32-NEXT: kxorq %k4, %k1, %k1
+; X32-NEXT: kxorq %k3, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $4, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $60, %k0, %k1
-; X32-NEXT: kxorq %k5, %k1, %k1
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrl $28, %ecx
+; X32-NEXT: kmovd %ecx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: kshiftlq $63, %k1, %k1
; X32-NEXT: kshiftrq $3, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
@@ -1192,7 +1078,10 @@ define <8 x i64> @test_mm512_maskz_set1_epi8(i64 %__M, i8 signext %__A) {
; X32-NEXT: kshiftrq $2, %k1, %k1
; X32-NEXT: kxorq %k0, %k1, %k0
; X32-NEXT: kshiftrq $62, %k0, %k1
-; X32-NEXT: kxorq %k6, %k1, %k1
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrl $30, %ecx
+; X32-NEXT: kmovd %ecx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
; X32-NEXT: shrl $31, %eax
; X32-NEXT: kmovd %eax, %k2
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
diff --git a/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
index f19e09758f1..13aca464b9e 100644
--- a/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
@@ -2,46 +2,6 @@
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512BW
; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512F-32
-declare i32 @llvm.x86.avx512.kunpck.wd(i32, i32)
-
-define i32@test_int_x86_avx512_kunpck_wd(i32 %x0, i32 %x1) {
-; AVX512BW-LABEL: test_int_x86_avx512_kunpck_wd:
-; AVX512BW: ## %bb.0:
-; AVX512BW-NEXT: movzwl %di, %eax
-; AVX512BW-NEXT: shll $16, %esi
-; AVX512BW-NEXT: orl %esi, %eax
-; AVX512BW-NEXT: retq
-;
-; AVX512F-32-LABEL: test_int_x86_avx512_kunpck_wd:
-; AVX512F-32: # %bb.0:
-; AVX512F-32-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
-; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: shll $16, %eax
-; AVX512F-32-NEXT: orl %ecx, %eax
-; AVX512F-32-NEXT: retl
- %res = call i32 @llvm.x86.avx512.kunpck.wd(i32 %x0, i32 %x1)
- ret i32 %res
-}
-
-declare i64 @llvm.x86.avx512.kunpck.dq(i64, i64)
-
-define i64@test_int_x86_avx512_kunpck_qd(i64 %x0, i64 %x1) {
-; AVX512BW-LABEL: test_int_x86_avx512_kunpck_qd:
-; AVX512BW: ## %bb.0:
-; AVX512BW-NEXT: movl %edi, %eax
-; AVX512BW-NEXT: shlq $32, %rsi
-; AVX512BW-NEXT: orq %rsi, %rax
-; AVX512BW-NEXT: retq
-;
-; AVX512F-32-LABEL: test_int_x86_avx512_kunpck_qd:
-; AVX512F-32: # %bb.0:
-; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; AVX512F-32-NEXT: retl
- %res = call i64 @llvm.x86.avx512.kunpck.dq(i64 %x0, i64 %x1)
- ret i64 %res
-}
-
declare <64 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.512(i8, <64 x i8>, i64)
define <64 x i8>@test_int_x86_avx512_mask_pbroadcast_b_gpr_512(i8 %x0, <64 x i8> %x1, i64 %mask) {
diff --git a/test/CodeGen/X86/avx512bw-intrinsics.ll b/test/CodeGen/X86/avx512bw-intrinsics.ll
index 2fa7c2c5b8a..7b5cc5feff0 100644
--- a/test/CodeGen/X86/avx512bw-intrinsics.ll
+++ b/test/CodeGen/X86/avx512bw-intrinsics.ll
@@ -1455,6 +1455,55 @@ define <8 x i64>@test_int_x86_avx512_mask_psadb_w_512(<64 x i8> %x0, <64 x i8>
ret <8 x i64> %res2
}
+declare i32 @llvm.x86.avx512.kunpck.wd(i32, i32)
+
+define i32@test_int_x86_avx512_kunpck_wd(i32 %x0, i32 %x1) {
+; AVX512BW-LABEL: test_int_x86_avx512_kunpck_wd:
+; AVX512BW: ## %bb.0:
+; AVX512BW-NEXT: kmovd %edi, %k0
+; AVX512BW-NEXT: kmovd %esi, %k1
+; AVX512BW-NEXT: kunpckwd %k1, %k0, %k0
+; AVX512BW-NEXT: kmovd %k0, %eax
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_kunpck_wd:
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: kmovw {{[0-9]+}}(%esp), %k0
+; AVX512F-32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: kunpckwd %k0, %k1, %k0
+; AVX512F-32-NEXT: kmovd %k0, %eax
+; AVX512F-32-NEXT: retl
+ %res = call i32 @llvm.x86.avx512.kunpck.wd(i32 %x0, i32 %x1)
+ ret i32 %res
+}
+
+declare i64 @llvm.x86.avx512.kunpck.dq(i64, i64)
+
+define i64@test_int_x86_avx512_kunpck_qd(i64 %x0, i64 %x1) {
+; AVX512BW-LABEL: test_int_x86_avx512_kunpck_qd:
+; AVX512BW: ## %bb.0:
+; AVX512BW-NEXT: kmovq %rdi, %k0
+; AVX512BW-NEXT: kmovq %rsi, %k1
+; AVX512BW-NEXT: kunpckdq %k1, %k0, %k0
+; AVX512BW-NEXT: kmovq %k0, %rax
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_kunpck_qd:
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: subl $12, %esp
+; AVX512F-32-NEXT: .cfi_def_cfa_offset 16
+; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k0
+; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: kunpckdq %k0, %k1, %k0
+; AVX512F-32-NEXT: kmovq %k0, (%esp)
+; AVX512F-32-NEXT: movl (%esp), %eax
+; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: addl $12, %esp
+; AVX512F-32-NEXT: retl
+ %res = call i64 @llvm.x86.avx512.kunpck.dq(i64 %x0, i64 %x1)
+ ret i64 %res
+}
+
declare i64 @llvm.x86.avx512.cvtb2mask.512(<64 x i8>)
define i64@test_int_x86_avx512_cvtb2mask_512(<64 x i8> %x0) {