summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorHans Wennborg <hans@hanshq.net>2018-02-02 16:24:08 +0000
committerHans Wennborg <hans@hanshq.net>2018-02-02 16:24:08 +0000
commit4b07ed628fe33090d114553618d63f9144c51715 (patch)
tree54c46b32e9553ae4a8cb45e70d857184dd624f2e
parent816adbd1b4a9e874953958d839951279dfbce5b7 (diff)
Merging r323908:
------------------------------------------------------------------------ r323908 | mareko | 2018-01-31 21:18:04 +0100 (Wed, 31 Jan 2018) | 7 lines AMDGPU: Add intrinsics llvm.amdgcn.cvt.{pknorm.i16, pknorm.u16, pk.i16, pk.u16} Reviewers: arsenm, nhaehnle Subscribers: kzhuravl, wdng, yaxunl, dstuttard, tpr, t-tye Differential Revision: https://reviews.llvm.org/D41663 ------------------------------------------------------------------------ git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_60@324103 91177308-0d34-0410-b5e6-96231b3b80d8
-rw-r--r--include/llvm/IR/IntrinsicsAMDGPU.td20
-rw-r--r--lib/Target/AMDGPU/AMDGPUISelLowering.cpp4
-rw-r--r--lib/Target/AMDGPU/AMDGPUISelLowering.h4
-rw-r--r--lib/Target/AMDGPU/AMDGPUInstrInfo.td8
-rw-r--r--lib/Target/AMDGPU/SIISelLowering.cpp50
-rw-r--r--lib/Target/AMDGPU/VOP2Instructions.td8
-rw-r--r--lib/Transforms/InstCombine/InstCombineCalls.cpp12
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.i16.ll84
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.u16.ll84
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.i16.ll164
-rw-r--r--test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.u16.ll164
-rw-r--r--test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll108
12 files changed, 702 insertions, 8 deletions
diff --git a/include/llvm/IR/IntrinsicsAMDGPU.td b/include/llvm/IR/IntrinsicsAMDGPU.td
index 22a3a0fe618..3397fa41db1 100644
--- a/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -238,6 +238,26 @@ def int_amdgcn_cvt_pkrtz : Intrinsic<
[IntrNoMem, IntrSpeculatable]
>;
+def int_amdgcn_cvt_pknorm_i16 : Intrinsic<
+ [llvm_v2i16_ty], [llvm_float_ty, llvm_float_ty],
+ [IntrNoMem, IntrSpeculatable]
+>;
+
+def int_amdgcn_cvt_pknorm_u16 : Intrinsic<
+ [llvm_v2i16_ty], [llvm_float_ty, llvm_float_ty],
+ [IntrNoMem, IntrSpeculatable]
+>;
+
+def int_amdgcn_cvt_pk_i16 : Intrinsic<
+ [llvm_v2i16_ty], [llvm_i32_ty, llvm_i32_ty],
+ [IntrNoMem, IntrSpeculatable]
+>;
+
+def int_amdgcn_cvt_pk_u16 : Intrinsic<
+ [llvm_v2i16_ty], [llvm_i32_ty, llvm_i32_ty],
+ [IntrNoMem, IntrSpeculatable]
+>;
+
def int_amdgcn_class : Intrinsic<
[llvm_i1_ty], [llvm_anyfloat_ty, llvm_i32_ty],
[IntrNoMem, IntrSpeculatable]
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 49929441ef2..21192a2c1cc 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -3957,6 +3957,10 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(CVT_F32_UBYTE2)
NODE_NAME_CASE(CVT_F32_UBYTE3)
NODE_NAME_CASE(CVT_PKRTZ_F16_F32)
+ NODE_NAME_CASE(CVT_PKNORM_I16_F32)
+ NODE_NAME_CASE(CVT_PKNORM_U16_F32)
+ NODE_NAME_CASE(CVT_PK_I16_I32)
+ NODE_NAME_CASE(CVT_PK_U16_U32)
NODE_NAME_CASE(FP_TO_FP16)
NODE_NAME_CASE(FP16_ZEXT)
NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 5c31bddd9b1..039ee174e5b 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -417,6 +417,10 @@ enum NodeType : unsigned {
// Convert two float 32 numbers into a single register holding two packed f16
// with round to zero.
CVT_PKRTZ_F16_F32,
+ CVT_PKNORM_I16_F32,
+ CVT_PKNORM_U16_F32,
+ CVT_PK_I16_I32,
+ CVT_PK_U16_U32,
// Same as the standard node, except the high bits of the resulting integer
// are known 0.
diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/lib/Target/AMDGPU/AMDGPUInstrInfo.td
index c024010f3e9..65c483d85c5 100644
--- a/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -35,6 +35,10 @@ def AMDGPUFPPackOp : SDTypeProfile<1, 2,
[SDTCisFP<1>, SDTCisSameAs<1, 2>]
>;
+def AMDGPUIntPackOp : SDTypeProfile<1, 2,
+ [SDTCisInt<1>, SDTCisSameAs<1, 2>]
+>;
+
def AMDGPUDivScaleOp : SDTypeProfile<2, 3,
[SDTCisFP<0>, SDTCisInt<1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisSameAs<0, 4>]
>;
@@ -142,6 +146,10 @@ def AMDGPUrsq_clamp : SDNode<"AMDGPUISD::RSQ_CLAMP", SDTFPUnaryOp>;
def AMDGPUldexp : SDNode<"AMDGPUISD::LDEXP", AMDGPULdExpOp>;
def AMDGPUpkrtz_f16_f32 : SDNode<"AMDGPUISD::CVT_PKRTZ_F16_F32", AMDGPUFPPackOp>;
+def AMDGPUpknorm_i16_f32 : SDNode<"AMDGPUISD::CVT_PKNORM_I16_F32", AMDGPUFPPackOp>;
+def AMDGPUpknorm_u16_f32 : SDNode<"AMDGPUISD::CVT_PKNORM_U16_F32", AMDGPUFPPackOp>;
+def AMDGPUpk_i16_i32 : SDNode<"AMDGPUISD::CVT_PK_I16_I32", AMDGPUIntPackOp>;
+def AMDGPUpk_u16_u32 : SDNode<"AMDGPUISD::CVT_PK_U16_U32", AMDGPUIntPackOp>;
def AMDGPUfp_to_f16 : SDNode<"AMDGPUISD::FP_TO_FP16" , SDTFPToIntOp>;
def AMDGPUfp16_zext : SDNode<"AMDGPUISD::FP16_ZEXT" , SDTFPToIntOp>;
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index 415d8a512aa..c2d79b9ef5f 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -205,6 +205,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2i16, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f16, Custom);
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
@@ -3517,7 +3518,8 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N,
}
case ISD::INTRINSIC_WO_CHAIN: {
unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
- if (IID == Intrinsic::amdgcn_cvt_pkrtz) {
+ switch (IID) {
+ case Intrinsic::amdgcn_cvt_pkrtz: {
SDValue Src0 = N->getOperand(1);
SDValue Src1 = N->getOperand(2);
SDLoc SL(N);
@@ -3526,6 +3528,29 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N,
Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
return;
}
+ case Intrinsic::amdgcn_cvt_pknorm_i16:
+ case Intrinsic::amdgcn_cvt_pknorm_u16:
+ case Intrinsic::amdgcn_cvt_pk_i16:
+ case Intrinsic::amdgcn_cvt_pk_u16: {
+ SDValue Src0 = N->getOperand(1);
+ SDValue Src1 = N->getOperand(2);
+ SDLoc SL(N);
+ unsigned Opcode;
+
+ if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
+ Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
+ else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
+ Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
+ else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
+ Opcode = AMDGPUISD::CVT_PK_I16_I32;
+ else
+ Opcode = AMDGPUISD::CVT_PK_U16_U32;
+
+ SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
+ Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
+ return;
+ }
+ }
break;
}
case ISD::SELECT: {
@@ -4424,10 +4449,27 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::amdgcn_ubfe:
return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT,
Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
- case Intrinsic::amdgcn_cvt_pkrtz: {
- // FIXME: Stop adding cast if v2f16 legal.
+ case Intrinsic::amdgcn_cvt_pkrtz:
+ case Intrinsic::amdgcn_cvt_pknorm_i16:
+ case Intrinsic::amdgcn_cvt_pknorm_u16:
+ case Intrinsic::amdgcn_cvt_pk_i16:
+ case Intrinsic::amdgcn_cvt_pk_u16: {
+ // FIXME: Stop adding cast if v2f16/v2i16 are legal.
EVT VT = Op.getValueType();
- SDValue Node = DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, DL, MVT::i32,
+ unsigned Opcode;
+
+ if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
+ Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
+ else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
+ Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
+ else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
+ Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
+ else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
+ Opcode = AMDGPUISD::CVT_PK_I16_I32;
+ else
+ Opcode = AMDGPUISD::CVT_PK_U16_U32;
+
+ SDValue Node = DAG.getNode(Opcode, DL, MVT::i32,
Op.getOperand(1), Op.getOperand(2));
return DAG.getNode(ISD::BITCAST, DL, VT, Node);
}
diff --git a/lib/Target/AMDGPU/VOP2Instructions.td b/lib/Target/AMDGPU/VOP2Instructions.td
index ef90b68db1a..56b934f92f6 100644
--- a/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/lib/Target/AMDGPU/VOP2Instructions.td
@@ -407,11 +407,11 @@ defm V_MBCNT_LO_U32_B32 : VOP2Inst <"v_mbcnt_lo_u32_b32", VOP_NO_EXT<VOP_I32_I32
defm V_MBCNT_HI_U32_B32 : VOP2Inst <"v_mbcnt_hi_u32_b32", VOP_NO_EXT<VOP_I32_I32_I32>, int_amdgcn_mbcnt_hi>;
defm V_LDEXP_F32 : VOP2Inst <"v_ldexp_f32", VOP_NO_EXT<VOP_F32_F32_I32>, AMDGPUldexp>;
defm V_CVT_PKACCUM_U8_F32 : VOP2Inst <"v_cvt_pkaccum_u8_f32", VOP_NO_EXT<VOP_I32_F32_I32>>; // TODO: set "Uses = dst"
-defm V_CVT_PKNORM_I16_F32 : VOP2Inst <"v_cvt_pknorm_i16_f32", VOP_NO_EXT<VOP_I32_F32_F32>>;
-defm V_CVT_PKNORM_U16_F32 : VOP2Inst <"v_cvt_pknorm_u16_f32", VOP_NO_EXT<VOP_I32_F32_F32>>;
+defm V_CVT_PKNORM_I16_F32 : VOP2Inst <"v_cvt_pknorm_i16_f32", VOP_NO_EXT<VOP_I32_F32_F32>, AMDGPUpknorm_i16_f32>;
+defm V_CVT_PKNORM_U16_F32 : VOP2Inst <"v_cvt_pknorm_u16_f32", VOP_NO_EXT<VOP_I32_F32_F32>, AMDGPUpknorm_u16_f32>;
defm V_CVT_PKRTZ_F16_F32 : VOP2Inst <"v_cvt_pkrtz_f16_f32", VOP_NO_EXT<VOP_I32_F32_F32>, AMDGPUpkrtz_f16_f32>;
-defm V_CVT_PK_U16_U32 : VOP2Inst <"v_cvt_pk_u16_u32", VOP_NO_EXT<VOP_I32_I32_I32>>;
-defm V_CVT_PK_I16_I32 : VOP2Inst <"v_cvt_pk_i16_i32", VOP_NO_EXT<VOP_I32_I32_I32>>;
+defm V_CVT_PK_U16_U32 : VOP2Inst <"v_cvt_pk_u16_u32", VOP_NO_EXT<VOP_I32_I32_I32>, AMDGPUpk_u16_u32>;
+defm V_CVT_PK_I16_I32 : VOP2Inst <"v_cvt_pk_i16_i32", VOP_NO_EXT<VOP_I32_I32_I32>, AMDGPUpk_i16_i32>;
} // End SubtargetPredicate = isGCN
diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 40e52ee755e..2f2f0696366 100644
--- a/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -3264,6 +3264,18 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
break;
}
+ case Intrinsic::amdgcn_cvt_pknorm_i16:
+ case Intrinsic::amdgcn_cvt_pknorm_u16:
+ case Intrinsic::amdgcn_cvt_pk_i16:
+ case Intrinsic::amdgcn_cvt_pk_u16: {
+ Value *Src0 = II->getArgOperand(0);
+ Value *Src1 = II->getArgOperand(1);
+
+ if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1))
+ return replaceInstUsesWith(*II, UndefValue::get(II->getType()));
+
+ break;
+ }
case Intrinsic::amdgcn_ubfe:
case Intrinsic::amdgcn_sbfe: {
// Decompose simple cases into standard shifts.
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.i16.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.i16.ll
new file mode 100644
index 00000000000..241b708e7ba
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.i16.ll
@@ -0,0 +1,84 @@
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
+
+; GCN-LABEL: {{^}}s_cvt_pk_i16_i32:
+; GCN-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x{{b|2c}}
+; GCN-DAG: s_load_dword [[SY:s[0-9]+]], s[0:1], 0x{{c|30}}
+; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], [[SY]]
+; SI: v_cvt_pk_i16_i32_e32 v{{[0-9]+}}, [[X]], [[VY]]
+; VI: v_cvt_pk_i16_i32 v{{[0-9]+}}, [[X]], [[VY]]
+define amdgpu_kernel void @s_cvt_pk_i16_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 {
+ %result = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 %x, i32 %y)
+ %r = bitcast <2 x i16> %result to i32
+ store i32 %r, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_cvt_pk_i16_samereg_i32:
+; GCN: s_load_dword [[X:s[0-9]+]]
+; GCN: v_cvt_pk_i16_i32{{(_e64)*}} v{{[0-9]+}}, [[X]], [[X]]
+define amdgpu_kernel void @s_cvt_pk_i16_samereg_i32(i32 addrspace(1)* %out, i32 %x) #0 {
+ %result = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 %x, i32 %x)
+ %r = bitcast <2 x i16> %result to i32
+ store i32 %r, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_cvt_pk_i16_i32:
+; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
+; SI: v_cvt_pk_i16_i32_e32 v{{[0-9]+}}, [[A]], [[B]]
+; VI: v_cvt_pk_i16_i32 v{{[0-9]+}}, [[A]], [[B]]
+define amdgpu_kernel void @v_cvt_pk_i16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds i32, i32 addrspace(1)* %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile i32, i32 addrspace(1)* %a.gep
+ %b = load volatile i32, i32 addrspace(1)* %b.gep
+ %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 %a, i32 %b)
+ %r = bitcast <2 x i16> %cvt to i32
+ store i32 %r, i32 addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_cvt_pk_i16_i32_reg_imm:
+; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
+; GCN: v_cvt_pk_i16_i32{{(_e64)*}} v{{[0-9]+}}, [[A]], 1
+define amdgpu_kernel void @v_cvt_pk_i16_i32_reg_imm(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile i32, i32 addrspace(1)* %a.gep
+ %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 %a, i32 1)
+ %r = bitcast <2 x i16> %cvt to i32
+ store i32 %r, i32 addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_cvt_pk_i16_i32_imm_reg:
+; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
+; SI: v_cvt_pk_i16_i32_e32 v{{[0-9]+}}, 1, [[A]]
+; VI: v_cvt_pk_i16_i32 v{{[0-9]+}}, 1, [[A]]
+define amdgpu_kernel void @v_cvt_pk_i16_i32_imm_reg(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile i32, i32 addrspace(1)* %a.gep
+ %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 1, i32 %a)
+ %r = bitcast <2 x i16> %cvt to i32
+ store i32 %r, i32 addrspace(1)* %out.gep
+ ret void
+}
+
+declare <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32, i32) #1
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.u16.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.u16.ll
new file mode 100644
index 00000000000..8d5c9aa9521
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.u16.ll
@@ -0,0 +1,84 @@
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
+
+; GCN-LABEL: {{^}}s_cvt_pk_u16_u32:
+; GCN-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x{{b|2c}}
+; GCN-DAG: s_load_dword [[SY:s[0-9]+]], s[0:1], 0x{{c|30}}
+; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], [[SY]]
+; SI: v_cvt_pk_u16_u32_e32 v{{[0-9]+}}, [[X]], [[VY]]
+; VI: v_cvt_pk_u16_u32 v{{[0-9]+}}, [[X]], [[VY]]
+define amdgpu_kernel void @s_cvt_pk_u16_u32(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 {
+ %result = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 %x, i32 %y)
+ %r = bitcast <2 x i16> %result to i32
+ store i32 %r, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_cvt_pk_u16_samereg_i32:
+; GCN: s_load_dword [[X:s[0-9]+]]
+; GCN: v_cvt_pk_u16_u32{{(_e64)*}} v{{[0-9]+}}, [[X]], [[X]]
+define amdgpu_kernel void @s_cvt_pk_u16_samereg_i32(i32 addrspace(1)* %out, i32 %x) #0 {
+ %result = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 %x, i32 %x)
+ %r = bitcast <2 x i16> %result to i32
+ store i32 %r, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_cvt_pk_u16_u32:
+; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
+; SI: v_cvt_pk_u16_u32_e32 v{{[0-9]+}}, [[A]], [[B]]
+; VI: v_cvt_pk_u16_u32 v{{[0-9]+}}, [[A]], [[B]]
+define amdgpu_kernel void @v_cvt_pk_u16_u32(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds i32, i32 addrspace(1)* %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile i32, i32 addrspace(1)* %a.gep
+ %b = load volatile i32, i32 addrspace(1)* %b.gep
+ %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 %a, i32 %b)
+ %r = bitcast <2 x i16> %cvt to i32
+ store i32 %r, i32 addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_cvt_pk_u16_u32_reg_imm:
+; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
+; GCN: v_cvt_pk_u16_u32{{(_e64)*}} v{{[0-9]+}}, [[A]], 1
+define amdgpu_kernel void @v_cvt_pk_u16_u32_reg_imm(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile i32, i32 addrspace(1)* %a.gep
+ %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 %a, i32 1)
+ %r = bitcast <2 x i16> %cvt to i32
+ store i32 %r, i32 addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_cvt_pk_u16_u32_imm_reg:
+; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
+; SI: v_cvt_pk_u16_u32_e32 v{{[0-9]+}}, 1, [[A]]
+; VI: v_cvt_pk_u16_u32 v{{[0-9]+}}, 1, [[A]]
+define amdgpu_kernel void @v_cvt_pk_u16_u32_imm_reg(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile i32, i32 addrspace(1)* %a.gep
+ %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 1, i32 %a)
+ %r = bitcast <2 x i16> %cvt to i32
+ store i32 %r, i32 addrspace(1)* %out.gep
+ ret void
+}
+
+declare <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32, i32) #1
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.i16.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.i16.ll
new file mode 100644
index 00000000000..822e8c2886b
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.i16.ll
@@ -0,0 +1,164 @@
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
+
+; GCN-LABEL: {{^}}s_cvt_pknorm_i16_f32:
+; GCN-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x{{b|2c}}
+; GCN-DAG: s_load_dword [[SY:s[0-9]+]], s[0:1], 0x{{c|30}}
+; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], [[SY]]
+; SI: v_cvt_pknorm_i16_f32_e32 v{{[0-9]+}}, [[X]], [[VY]]
+; VI: v_cvt_pknorm_i16_f32 v{{[0-9]+}}, [[X]], [[VY]]
+define amdgpu_kernel void @s_cvt_pknorm_i16_f32(i32 addrspace(1)* %out, float %x, float %y) #0 {
+ %result = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %x, float %y)
+ %r = bitcast <2 x i16> %result to i32
+ store i32 %r, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_cvt_pknorm_i16_samereg_f32:
+; GCN: s_load_dword [[X:s[0-9]+]]
+; GCN: v_cvt_pknorm_i16_f32{{(_e64)*}} v{{[0-9]+}}, [[X]], [[X]]
+define amdgpu_kernel void @s_cvt_pknorm_i16_samereg_f32(i32 addrspace(1)* %out, float %x) #0 {
+ %result = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %x, float %x)
+ %r = bitcast <2 x i16> %result to i32
+ store i32 %r, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_cvt_pknorm_i16_f32:
+; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
+; SI: v_cvt_pknorm_i16_f32_e32 v{{[0-9]+}}, [[A]], [[B]]
+; VI: v_cvt_pknorm_i16_f32 v{{[0-9]+}}, [[A]], [[B]]
+define amdgpu_kernel void @v_cvt_pknorm_i16_f32(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %a, float %b)
+ %r = bitcast <2 x i16> %cvt to i32
+ store i32 %r, i32 addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_cvt_pknorm_i16_f32_reg_imm:
+; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
+; GCN: v_cvt_pknorm_i16_f32{{(_e64)*}} v{{[0-9]+}}, [[A]], 1.0
+define amdgpu_kernel void @v_cvt_pknorm_i16_f32_reg_imm(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %a, float 1.0)
+ %r = bitcast <2 x i16> %cvt to i32
+ store i32 %r, i32 addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_cvt_pknorm_i16_f32_imm_reg:
+; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
+; SI: v_cvt_pknorm_i16_f32_e32 v{{[0-9]+}}, 1.0, [[A]]
+; VI: v_cvt_pknorm_i16_f32 v{{[0-9]+}}, 1.0, [[A]]
+define amdgpu_kernel void @v_cvt_pknorm_i16_f32_imm_reg(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float 1.0, float %a)
+ %r = bitcast <2 x i16> %cvt to i32
+ store i32 %r, i32 addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_cvt_pknorm_i16_f32_fneg_lo:
+; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
+; GCN: v_cvt_pknorm_i16_f32{{(_e64)*}} v{{[0-9]+}}, -[[A]], [[B]]
+define amdgpu_kernel void @v_cvt_pknorm_i16_f32_fneg_lo(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %neg.a = fsub float -0.0, %a
+ %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %neg.a, float %b)
+ %r = bitcast <2 x i16> %cvt to i32
+ store i32 %r, i32 addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_cvt_pknorm_i16_f32_fneg_hi:
+; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
+; GCN: v_cvt_pknorm_i16_f32{{(_e64)*}} v{{[0-9]+}}, [[A]], -[[B]]
+define amdgpu_kernel void @v_cvt_pknorm_i16_f32_fneg_hi(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %neg.b = fsub float -0.0, %b
+ %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %a, float %neg.b)
+ %r = bitcast <2 x i16> %cvt to i32
+ store i32 %r, i32 addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_cvt_pknorm_i16_f32_fneg_lo_hi:
+; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
+; GCN: v_cvt_pknorm_i16_f32{{(_e64)*}} v{{[0-9]+}}, -[[A]], -[[B]]
+define amdgpu_kernel void @v_cvt_pknorm_i16_f32_fneg_lo_hi(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %neg.a = fsub float -0.0, %a
+ %neg.b = fsub float -0.0, %b
+ %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %neg.a, float %neg.b)
+ %r = bitcast <2 x i16> %cvt to i32
+ store i32 %r, i32 addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_cvt_pknorm_i16_f32_fneg_fabs_lo_fneg_hi:
+; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
+; GCN: v_cvt_pknorm_i16_f32{{(_e64)*}} v{{[0-9]+}}, -|[[A]]|, -[[B]]
+define amdgpu_kernel void @v_cvt_pknorm_i16_f32_fneg_fabs_lo_fneg_hi(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %fabs.a = call float @llvm.fabs.f32(float %a)
+ %neg.fabs.a = fsub float -0.0, %fabs.a
+ %neg.b = fsub float -0.0, %b
+ %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %neg.fabs.a, float %neg.b)
+ %r = bitcast <2 x i16> %cvt to i32
+ store i32 %r, i32 addrspace(1)* %out.gep
+ ret void
+}
+
+declare <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float, float) #1
+declare float @llvm.fabs.f32(float) #1
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.u16.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.u16.ll
new file mode 100644
index 00000000000..c2b8f3cb28c
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.u16.ll
@@ -0,0 +1,164 @@
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
+
+; GCN-LABEL: {{^}}s_cvt_pknorm_u16_f32:
+; GCN-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x{{b|2c}}
+; GCN-DAG: s_load_dword [[SY:s[0-9]+]], s[0:1], 0x{{c|30}}
+; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], [[SY]]
+; SI: v_cvt_pknorm_u16_f32_e32 v{{[0-9]+}}, [[X]], [[VY]]
+; VI: v_cvt_pknorm_u16_f32 v{{[0-9]+}}, [[X]], [[VY]]
+define amdgpu_kernel void @s_cvt_pknorm_u16_f32(i32 addrspace(1)* %out, float %x, float %y) #0 {
+ %result = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %x, float %y)
+ %r = bitcast <2 x i16> %result to i32
+ store i32 %r, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_cvt_pknorm_u16_samereg_f32:
+; GCN: s_load_dword [[X:s[0-9]+]]
+; GCN: v_cvt_pknorm_u16_f32{{(_e64)*}} v{{[0-9]+}}, [[X]], [[X]]
+define amdgpu_kernel void @s_cvt_pknorm_u16_samereg_f32(i32 addrspace(1)* %out, float %x) #0 {
+ %result = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %x, float %x)
+ %r = bitcast <2 x i16> %result to i32
+ store i32 %r, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_cvt_pknorm_u16_f32:
+; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
+; SI: v_cvt_pknorm_u16_f32_e32 v{{[0-9]+}}, [[A]], [[B]]
+; VI: v_cvt_pknorm_u16_f32 v{{[0-9]+}}, [[A]], [[B]]
+define amdgpu_kernel void @v_cvt_pknorm_u16_f32(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %a, float %b)
+ %r = bitcast <2 x i16> %cvt to i32
+ store i32 %r, i32 addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_cvt_pknorm_u16_f32_reg_imm:
+; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
+; GCN: v_cvt_pknorm_u16_f32{{(_e64)*}} v{{[0-9]+}}, [[A]], 1.0
+define amdgpu_kernel void @v_cvt_pknorm_u16_f32_reg_imm(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %a, float 1.0)
+ %r = bitcast <2 x i16> %cvt to i32
+ store i32 %r, i32 addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_cvt_pknorm_u16_f32_imm_reg:
+; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
+; SI: v_cvt_pknorm_u16_f32_e32 v{{[0-9]+}}, 1.0, [[A]]
+; VI: v_cvt_pknorm_u16_f32 v{{[0-9]+}}, 1.0, [[A]]
+define amdgpu_kernel void @v_cvt_pknorm_u16_f32_imm_reg(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float 1.0, float %a)
+ %r = bitcast <2 x i16> %cvt to i32
+ store i32 %r, i32 addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_cvt_pknorm_u16_f32_fneg_lo:
+; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
+; GCN: v_cvt_pknorm_u16_f32{{(_e64)*}} v{{[0-9]+}}, -[[A]], [[B]]
+define amdgpu_kernel void @v_cvt_pknorm_u16_f32_fneg_lo(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %neg.a = fsub float -0.0, %a
+ %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %neg.a, float %b)
+ %r = bitcast <2 x i16> %cvt to i32
+ store i32 %r, i32 addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_cvt_pknorm_u16_f32_fneg_hi:
+; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
+; GCN: v_cvt_pknorm_u16_f32{{(_e64)*}} v{{[0-9]+}}, [[A]], -[[B]]
+define amdgpu_kernel void @v_cvt_pknorm_u16_f32_fneg_hi(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %neg.b = fsub float -0.0, %b
+ %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %a, float %neg.b)
+ %r = bitcast <2 x i16> %cvt to i32
+ store i32 %r, i32 addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_cvt_pknorm_u16_f32_fneg_lo_hi:
+; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
+; GCN: v_cvt_pknorm_u16_f32{{(_e64)*}} v{{[0-9]+}}, -[[A]], -[[B]]
+define amdgpu_kernel void @v_cvt_pknorm_u16_f32_fneg_lo_hi(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %neg.a = fsub float -0.0, %a
+ %neg.b = fsub float -0.0, %b
+ %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %neg.a, float %neg.b)
+ %r = bitcast <2 x i16> %cvt to i32
+ store i32 %r, i32 addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_cvt_pknorm_u16_f32_fneg_fabs_lo_fneg_hi:
+; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
+; GCN: v_cvt_pknorm_u16_f32{{(_e64)*}} v{{[0-9]+}}, -|[[A]]|, -[[B]]
+define amdgpu_kernel void @v_cvt_pknorm_u16_f32_fneg_fabs_lo_fneg_hi(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+ %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+ %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
+ %a = load volatile float, float addrspace(1)* %a.gep
+ %b = load volatile float, float addrspace(1)* %b.gep
+ %fabs.a = call float @llvm.fabs.f32(float %a)
+ %neg.fabs.a = fsub float -0.0, %fabs.a
+ %neg.b = fsub float -0.0, %b
+ %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %neg.fabs.a, float %neg.b)
+ %r = bitcast <2 x i16> %cvt to i32
+ store i32 %r, i32 addrspace(1)* %out.gep
+ ret void
+}
+
+declare <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float, float) #1
+declare float @llvm.fabs.f32(float) #1
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll b/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
index f82bf81fbbf..c8a05204bf5 100644
--- a/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
+++ b/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
@@ -723,6 +723,114 @@ define <2 x half> @constant_rtz_pkrtz() {
}
; --------------------------------------------------------------------
+; llvm.amdgcn.cvt.pknorm.i16
+; --------------------------------------------------------------------
+
+declare <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float, float) nounwind readnone
+
+; CHECK-LABEL: @undef_lhs_cvt_pknorm_i16(
+; CHECK: %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float undef, float %y)
+define <2 x i16> @undef_lhs_cvt_pknorm_i16(float %y) {
+ %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float undef, float %y)
+ ret <2 x i16> %cvt
+}
+
+; CHECK-LABEL: @undef_rhs_cvt_pknorm_i16(
+; CHECK: %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %x, float undef)
+define <2 x i16> @undef_rhs_cvt_pknorm_i16(float %x) {
+ %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %x, float undef)
+ ret <2 x i16> %cvt
+}
+
+; CHECK-LABEL: @undef_cvt_pknorm_i16(
+; CHECK: ret <2 x i16> undef
+define <2 x i16> @undef_cvt_pknorm_i16() {
+ %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float undef, float undef)
+ ret <2 x i16> %cvt
+}
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.cvt.pknorm.u16
+; --------------------------------------------------------------------
+
+declare <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float, float) nounwind readnone
+
+; CHECK-LABEL: @undef_lhs_cvt_pknorm_u16(
+; CHECK: %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float undef, float %y)
+define <2 x i16> @undef_lhs_cvt_pknorm_u16(float %y) {
+ %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float undef, float %y)
+ ret <2 x i16> %cvt
+}
+
+; CHECK-LABEL: @undef_rhs_cvt_pknorm_u16(
+; CHECK: %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %x, float undef)
+define <2 x i16> @undef_rhs_cvt_pknorm_u16(float %x) {
+ %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %x, float undef)
+ ret <2 x i16> %cvt
+}
+
+; CHECK-LABEL: @undef_cvt_pknorm_u16(
+; CHECK: ret <2 x i16> undef
+define <2 x i16> @undef_cvt_pknorm_u16() {
+ %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float undef, float undef)
+ ret <2 x i16> %cvt
+}
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.cvt.pk.i16
+; --------------------------------------------------------------------
+
+declare <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32, i32) nounwind readnone
+
+; CHECK-LABEL: @undef_lhs_cvt_pk_i16(
+; CHECK: %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 undef, i32 %y)
+define <2 x i16> @undef_lhs_cvt_pk_i16(i32 %y) {
+ %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 undef, i32 %y)
+ ret <2 x i16> %cvt
+}
+
+; CHECK-LABEL: @undef_rhs_cvt_pk_i16(
+; CHECK: %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 %x, i32 undef)
+define <2 x i16> @undef_rhs_cvt_pk_i16(i32 %x) {
+ %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 %x, i32 undef)
+ ret <2 x i16> %cvt
+}
+
+; CHECK-LABEL: @undef_cvt_pk_i16(
+; CHECK: ret <2 x i16> undef
+define <2 x i16> @undef_cvt_pk_i16() {
+ %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 undef, i32 undef)
+ ret <2 x i16> %cvt
+}
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.cvt.pk.u16
+; --------------------------------------------------------------------
+
+declare <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32, i32) nounwind readnone
+
+; CHECK-LABEL: @undef_lhs_cvt_pk_u16(
+; CHECK: %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 undef, i32 %y)
+define <2 x i16> @undef_lhs_cvt_pk_u16(i32 %y) {
+ %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 undef, i32 %y)
+ ret <2 x i16> %cvt
+}
+
+; CHECK-LABEL: @undef_rhs_cvt_pk_u16(
+; CHECK: %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 %x, i32 undef)
+define <2 x i16> @undef_rhs_cvt_pk_u16(i32 %x) {
+ %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 %x, i32 undef)
+ ret <2 x i16> %cvt
+}
+
+; CHECK-LABEL: @undef_cvt_pk_u16(
+; CHECK: ret <2 x i16> undef
+define <2 x i16> @undef_cvt_pk_u16() {
+ %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 undef, i32 undef)
+ ret <2 x i16> %cvt
+}
+
+; --------------------------------------------------------------------
; llvm.amdgcn.ubfe
; --------------------------------------------------------------------