summaryrefslogtreecommitdiff
path: root/lib/Target/AMDGPU/SIISelLowering.cpp
diff options
context:
space:
mode:
author Matt Arsenault <Matthew.Arsenault@amd.com> 2017-08-01 19:54:18 +0000
committer Matt Arsenault <Matthew.Arsenault@amd.com> 2017-08-01 19:54:18 +0000
commit 43950949ad8a5f46c857c8dae7796fc614759750 (patch)
tree 35e91df1743809a44854c0b1baec16f7cc72283c /lib/Target/AMDGPU/SIISelLowering.cpp
parent a68b470abaa80d99a4750311f5255ea3326e29eb (diff)
AMDGPU: Initial implementation of calls
Includes a hack to fix the type selected for the GlobalAddress of the function; this will be fixed by changing the default datalayout to use generic pointers for address space 0.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@309732 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'lib/Target/AMDGPU/SIISelLowering.cpp')
-rw-r--r-- lib/Target/AMDGPU/SIISelLowering.cpp 406
1 files changed, 399 insertions, 7 deletions
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index 247a011df4a..6be94ba5cbf 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1201,9 +1201,13 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM,
if (TM.getOptLevel() == CodeGenOpt::None)
HasStackObjects = true;
+ // For now assume stack access is needed in any callee functions, so we need
+ // the scratch registers to pass in.
+ bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
+
const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
if (ST.isAmdCodeObjectV2(MF)) {
- if (HasStackObjects) {
+ if (RequiresStackAccess) {
// If we have stack objects, we unquestionably need the private buffer
// resource. For the Code Object V2 ABI, this will be the first 4 user
// SGPR inputs. We can reserve those and use them directly.
@@ -1212,9 +1216,23 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM,
MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER);
Info.setScratchRSrcReg(PrivateSegmentBufferReg);
- unsigned PrivateSegmentWaveByteOffsetReg = TRI.getPreloadedValue(
- MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
- Info.setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg);
+ if (MFI.hasCalls()) {
+ // If we have calls, we need to keep the frame register in a register
+ // that won't be clobbered by a call, so ensure it is copied somewhere.
+
+ // This is not a problem for the scratch wave offset, because the same
+ // registers are reserved in all functions.
+
+ // FIXME: Nothing is really ensuring this is a call preserved register,
+ // it's just selected from the end so it happens to be.
+ unsigned ReservedOffsetReg
+ = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
+ Info.setScratchWaveOffsetReg(ReservedOffsetReg);
+ } else {
+ unsigned PrivateSegmentWaveByteOffsetReg = TRI.getPreloadedValue(
+ MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
+ Info.setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg);
+ }
} else {
unsigned ReservedBufferReg
= TRI.reservedPrivateSegmentBufferReg(MF);
@@ -1237,7 +1255,7 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM,
// offset is still in an input SGPR.
Info.setScratchRSrcReg(ReservedBufferReg);
- if (HasStackObjects) {
+ if (HasStackObjects && !MFI.hasCalls()) {
unsigned ScratchWaveOffsetReg = TRI.getPreloadedValue(
MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
Info.setScratchWaveOffsetReg(ScratchWaveOffsetReg);
@@ -1249,6 +1267,50 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM,
}
}
+bool SITargetLowering::supportSplitCSR(MachineFunction *MF) const { // Returns true exactly when MF's SIMachineFunctionInfo reports that it is not an entry function.
+ const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
+ return !Info->isEntryFunction();
+}
+
+void SITargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
+ // Empty body: this hook does no work and does not read Entry.
+}
+
+void SITargetLowering::insertCopiesSplitCSR(
+ MachineBasicBlock *Entry,
+ const SmallVectorImpl<MachineBasicBlock *> &Exits) const { // For each register listed by getCalleeSavedRegsViaCopy: copy it into a fresh virtual register in Entry, then copy it back in every block of Exits.
+ const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
+ // A null register list means there is nothing to copy for this function.
+ const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
+ if (!IStart)
+ return;
+ // The insertion point is captured once, before any copy is added to Entry.
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+ MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
+ MachineBasicBlock::iterator MBBI = Entry->begin();
+ for (const MCPhysReg *I = IStart; *I; ++I) { // The list is terminated by a zero entry.
+ const TargetRegisterClass *RC = nullptr; // Registers in SReg_64 get an SGPR_64 vreg, those in SReg_32 get an SGPR_32 vreg; anything else is treated as unreachable.
+ if (AMDGPU::SReg_64RegClass.contains(*I))
+ RC = &AMDGPU::SGPR_64RegClass;
+ else if (AMDGPU::SReg_32RegClass.contains(*I))
+ RC = &AMDGPU::SGPR_32RegClass;
+ else
+ llvm_unreachable("Unexpected register class in CSRsViaCopy!");
+
+ unsigned NewVR = MRI->createVirtualRegister(RC);
+ // Mark the CSR live-in to Entry, then create a copy from the CSR to the new virtual register.
+ Entry->addLiveIn(*I);
+ BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
+ .addReg(*I);
+
+ // Insert the copy-back instructions right before the first terminator of each exit block.
+ for (auto *Exit : Exits)
+ BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
+ TII->get(TargetOpcode::COPY), *I)
+ .addReg(NewVR);
+ }
+}
+
SDValue SITargetLowering::LowerFormalArguments(
SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
@@ -1589,6 +1651,22 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
}
// FIXME: Does sret work properly?
+ if (!Info->isEntryFunction()) {
+ const SIRegisterInfo *TRI
+ = static_cast<const SISubtarget *>(Subtarget)->getRegisterInfo();
+ const MCPhysReg *I =
+ TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
+ if (I) {
+ for (; *I; ++I) {
+ if (AMDGPU::SReg_64RegClass.contains(*I))
+ RetOps.push_back(DAG.getRegister(*I, MVT::i64));
+ else if (AMDGPU::SReg_32RegClass.contains(*I))
+ RetOps.push_back(DAG.getRegister(*I, MVT::i32));
+ else
+ llvm_unreachable("Unexpected register class in CSRsViaCopy!");
+ }
+ }
+ }
// Update chain and glue.
RetOps[0] = Chain;
@@ -1601,6 +1679,296 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
return DAG.getNode(Opc, DL, MVT::Other, RetOps);
}
+SDValue SITargetLowering::LowerCallResult(
+ SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool IsVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
+ SDValue ThisVal) const { // Copies each call result out of its assigned register, appends it to InVals, and returns the updated chain. IsThisReturn and ThisVal are not read in this body.
+ CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
+
+ // Assign locations to each value returned by this call.
+ SmallVector<CCValAssign, 16> RVLocs;
+ CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
+ *DAG.getContext());
+ CCInfo.AnalyzeCallResult(Ins, RetCC);
+
+ // Copy all of the result registers out of their specified physreg.
+ for (unsigned i = 0; i != RVLocs.size(); ++i) {
+ CCValAssign VA = RVLocs[i];
+ SDValue Val;
+ // Register results are read with a CopyFromReg whose chain and glue outputs feed the next iteration; memory results are a fatal error.
+ if (VA.isRegLoc()) {
+ Val = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag);
+ Chain = Val.getValue(1);
+ InFlag = Val.getValue(2);
+ } else if (VA.isMemLoc()) {
+ report_fatal_error("TODO: return values in memory");
+ } else
+ llvm_unreachable("unknown argument location type");
+ // Convert the copied value from the location type (LocVT) back to the value type (ValVT).
+ switch (VA.getLocInfo()) {
+ case CCValAssign::Full:
+ break;
+ case CCValAssign::BCvt:
+ Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
+ break;
+ case CCValAssign::ZExt: // Wrap in AssertZext for ValVT, then truncate.
+ Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
+ DAG.getValueType(VA.getValVT()));
+ Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
+ break;
+ case CCValAssign::SExt: // Wrap in AssertSext for ValVT, then truncate.
+ Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
+ DAG.getValueType(VA.getValVT()));
+ Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
+ break;
+ case CCValAssign::AExt: // Plain truncate, with no assertion node.
+ Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
+ break;
+ default:
+ llvm_unreachable("Unknown loc info!");
+ }
+
+ InVals.push_back(Val);
+ }
+
+ return Chain;
+}
+
+// The wave scratch offset register is used as the global base pointer.
+SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
+ SmallVectorImpl<SDValue> &InVals) const { // Builds the DAG for a call: CALLSEQ_START, argument copies/stores, an AMDGPUISD::CALL node, CALLSEQ_END, then result copies via LowerCallResult.
+ const AMDGPUTargetMachine &TM =
+ static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
+ if (!TM.enableFunctionCalls()) // Defer to the AMDGPUTargetLowering implementation unless the target machine enables function calls.
+ return AMDGPUTargetLowering::LowerCall(CLI, InVals);
+
+ SelectionDAG &DAG = CLI.DAG;
+ const SDLoc &DL = CLI.DL;
+ SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
+ SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
+ SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
+ SDValue Chain = CLI.Chain;
+ SDValue Callee = CLI.Callee;
+ bool &IsTailCall = CLI.IsTailCall;
+ CallingConv::ID CallConv = CLI.CallConv;
+ bool IsVarArg = CLI.IsVarArg;
+ bool IsSibCall = false; // Never set to true in this function, so the sibling-call paths below are not taken.
+ bool IsThisReturn = false; // Never set to true in this function.
+ MachineFunction &MF = DAG.getMachineFunction();
+
+ // TODO: Implement tail calls.
+ IsTailCall = false; // Written through the reference, so CLI.IsTailCall is cleared as well.
+
+ if (IsVarArg || MF.getTarget().Options.GuaranteedTailCallOpt) {
+ report_fatal_error("varargs and tail calls not implemented");
+ }
+
+ if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Callee)) {
+ // FIXME: Remove this hack for function pointer types.
+ const GlobalValue *GV = GA->getGlobal();
+ assert(Callee.getValueType() == MVT::i32);
+ Callee = DAG.getGlobalAddress(GV, DL, MVT::i64, GA->getOffset(),
+ false, GA->getTargetFlags()); // Rebuilds the callee address as an i64 node with the same global, offset and target flags.
+ }
+
+ const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+
+ // Analyze operands of the call, assigning locations to each operand.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
+ CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
+ CCInfo.AnalyzeCallOperands(Outs, AssignFn);
+
+ // Get a count of how many bytes are to be pushed on the stack.
+ unsigned NumBytes = CCInfo.getNextStackOffset();
+
+ if (IsSibCall) {
+ // Since we're not changing the ABI to make this a tail call, the memory
+ // operands are already available in the caller's incoming argument space.
+ NumBytes = 0;
+ }
+
+ // FPDiff is the byte offset of the call's argument area from the callee's.
+ // Stores to callee stack arguments will be placed in FixedStackSlots offset
+ // by this amount for a tail call. In a sibling call it must be 0 because the
+ // caller will deallocate the entire stack and the callee still expects its
+ // arguments to begin at SP+0. Completely unused for non-tail calls.
+ int FPDiff = 0;
+
+ SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
+
+ // Adjust the stack pointer for the new arguments...
+ // These operations are automatically eliminated by the prolog/epilog pass
+ if (!IsSibCall) {
+ Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, DL);
+
+ unsigned OffsetReg = Info->getScratchWaveOffsetReg();
+
+ // In the HSA case, this should be an identity copy.
+ SDValue ScratchRSrcReg
+ = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
+ RegsToPass.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
+
+ // TODO: Don't hardcode these registers and get from the callee function.
+ SDValue ScratchWaveOffsetReg
+ = DAG.getCopyFromReg(Chain, DL, OffsetReg, MVT::i32);
+ RegsToPass.emplace_back(AMDGPU::SGPR4, ScratchWaveOffsetReg);
+ }
+
+ // Stack pointer relative accesses are done by changing the offset SGPR. This
+ // is just the VGPR offset component.
+ SDValue StackPtr = DAG.getConstant(0, DL, MVT::i32);
+
+ SmallVector<SDValue, 8> MemOpChains;
+ MVT PtrVT = MVT::i32;
+
+ // Walk the register/memloc assignments, inserting copies/loads.
+ for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e;
+ ++i, ++realArgIdx) { // i and realArgIdx advance together, so they are always equal in this loop.
+ CCValAssign &VA = ArgLocs[i];
+ SDValue Arg = OutVals[realArgIdx];
+
+ // Promote the value if needed.
+ switch (VA.getLocInfo()) {
+ case CCValAssign::Full:
+ break;
+ case CCValAssign::BCvt:
+ Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::ZExt:
+ Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::SExt:
+ Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::AExt:
+ Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::FPExt:
+ Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
+ break;
+ default:
+ llvm_unreachable("Unknown loc info!");
+ }
+
+ if (VA.isRegLoc()) {
+ RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+ } else {
+ assert(VA.isMemLoc());
+
+ SDValue DstAddr;
+ MachinePointerInfo DstInfo;
+
+ unsigned LocMemOffset = VA.getLocMemOffset();
+ int32_t Offset = LocMemOffset;
+ SDValue PtrOff = DAG.getConstant(Offset, DL, MVT::i32);
+ PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff); // NOTE(review): this PtrOff is shadowed by the declaration inside the if below and is not read afterwards.
+
+ if (!IsTailCall) { // Always taken here: IsTailCall was forced to false above.
+ SDValue PtrOff = DAG.getTargetConstant(Offset, DL, MVT::i32);
+
+ DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
+ DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
+ }
+
+ if (Outs[i].Flags.isByVal()) { // byval arguments are copied with an always-inline memcpy; all other memory arguments are stored directly.
+ SDValue SizeNode =
+ DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
+ SDValue Cpy = DAG.getMemcpy(
+ Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(),
+ /*isVol = */ false, /*AlwaysInline = */ true,
+ /*isTailCall = */ false,
+ DstInfo, MachinePointerInfo());
+
+ MemOpChains.push_back(Cpy);
+ } else {
+ SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
+ MemOpChains.push_back(Store);
+ }
+ }
+ }
+
+ if (!MemOpChains.empty()) // Join all argument stores/copies into a single chain.
+ Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
+
+ // Build a sequence of copy-to-reg nodes chained together with token chain
+ // and flag operands which copy the outgoing args into the appropriate regs.
+ SDValue InFlag;
+ for (auto &RegToPass : RegsToPass) {
+ Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
+ RegToPass.second, InFlag);
+ InFlag = Chain.getValue(1);
+ }
+
+ // We don't usually want to end the call-sequence here because we would tidy
+ // the frame up *after* the call, however in the ABI-changing tail-call case
+ // we've carefully laid out the parameters so that when sp is reset they'll be
+ // in the correct location.
+ if (IsTailCall && !IsSibCall) { // Not taken while IsTailCall is forced to false above.
+ Chain = DAG.getCALLSEQ_END(Chain,
+ DAG.getTargetConstant(NumBytes, DL, MVT::i32),
+ DAG.getTargetConstant(0, DL, MVT::i32),
+ InFlag, DL);
+ InFlag = Chain.getValue(1);
+ }
+
+ std::vector<SDValue> Ops; // Call operands in order: chain, callee, argument registers, register mask, optional glue.
+ Ops.push_back(Chain);
+ Ops.push_back(Callee);
+
+ if (IsTailCall) {
+ // Each tail call may have to adjust the stack by a different amount, so
+ // this information must travel along with the operation for eventual
+ // consumption by emitEpilogue.
+ Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
+ }
+
+ // Add argument registers to the end of the list so that they are known live
+ // into the call.
+ for (auto &RegToPass : RegsToPass) {
+ Ops.push_back(DAG.getRegister(RegToPass.first,
+ RegToPass.second.getValueType()));
+ }
+
+ // Add a register mask operand representing the call-preserved registers.
+
+ const AMDGPURegisterInfo *TRI = Subtarget->getRegisterInfo();
+ const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
+ assert(Mask && "Missing call preserved mask for calling convention");
+ Ops.push_back(DAG.getRegisterMask(Mask));
+
+ if (InFlag.getNode())
+ Ops.push_back(InFlag);
+
+ SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+
+ // If we're doing a tail call, use a TC_RETURN here rather than an
+ // actual call instruction.
+ if (IsTailCall) {
+ MF.getFrameInfo().setHasTailCall();
+ llvm_unreachable("not implemented");
+ }
+
+ // Returns a chain and a flag for retval copy to use.
+ SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, NodeTys, Ops);
+ Chain = Call.getValue(0);
+ InFlag = Call.getValue(1);
+
+ uint64_t CalleePopBytes = 0;
+ Chain = DAG.getCALLSEQ_END(Chain, DAG.getTargetConstant(NumBytes, DL, MVT::i32),
+ DAG.getTargetConstant(CalleePopBytes, DL, MVT::i32),
+ InFlag, DL);
+ if (!Ins.empty()) // InFlag is only refreshed from CALLSEQ_END when Ins is non-empty.
+ InFlag = Chain.getValue(1);
+
+ // Handle result values, copying them out of physregs into vregs that we
+ // return.
+ return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
+ InVals, IsThisReturn,
+ IsThisReturn ? OutVals[0] : SDValue());
+}
+
unsigned SITargetLowering::getRegisterByName(const char* RegName, EVT VT,
SelectionDAG &DAG) const {
unsigned Reg = StringSwitch<unsigned>(RegName)
@@ -2266,6 +2634,27 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
MI.eraseFromParent();
return BB;
}
+ case AMDGPU::ADJCALLSTACKUP:
+ case AMDGPU::ADJCALLSTACKDOWN: {
+ const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
+ MachineInstrBuilder MIB(*MF, &MI);
+ MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
+ .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit);
+ return BB;
+ }
+ case AMDGPU::SI_CALL: {
+ const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
+ const DebugLoc &DL = MI.getDebugLoc();
+ unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
+ MachineInstrBuilder MIB =
+ BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_SWAPPC_B64), ReturnAddrReg);
+ for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
+ MIB.add(MI.getOperand(I));
+ MIB.setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+
+ MI.eraseFromParent();
+ return BB;
+ }
default:
return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
}
@@ -2931,13 +3320,16 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
SDValue Op,
SelectionDAG &DAG) const {
GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
+ const GlobalValue *GV = GSD->getGlobal();
if (GSD->getAddressSpace() != AMDGPUASI.CONSTANT_ADDRESS &&
- GSD->getAddressSpace() != AMDGPUASI.GLOBAL_ADDRESS)
+ GSD->getAddressSpace() != AMDGPUASI.GLOBAL_ADDRESS &&
+ // FIXME: It isn't correct to rely on the type of the pointer. This should
+ // be removed when address space 0 is 64-bit.
+ !GV->getType()->getElementType()->isFunctionTy())
return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
SDLoc DL(GSD);
- const GlobalValue *GV = GSD->getGlobal();
EVT PtrVT = Op.getValueType();
if (shouldEmitFixup(GV))