GCCBuiltin<"__builtin_amdgcn_dispatch_id">,
Intrinsic<[llvm_i64_ty], [], [IntrNoMem]>;
+def int_amdgcn_implicit_buffer_ptr :
+ GCCBuiltin<"__builtin_amdgcn_implicit_buffer_ptr">,
+ Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 2>], [], [IntrNoMem]>; // No operands; the result is an i8 pointer qualified with address space 2.
+
//===----------------------------------------------------------------------===//
// Instruction Intrinsics
//===----------------------------------------------------------------------===//
void AMDGPUAsmPrinter::EmitFunctionBodyStart() {
const AMDGPUSubtarget &STM = MF->getSubtarget<AMDGPUSubtarget>();
SIProgramInfo KernelInfo;
- if (STM.isAmdCodeObjectV2()) {
+ if (STM.isAmdCodeObjectV2(*MF)) {
getSIProgramInfo(KernelInfo, *MF);
EmitAmdKernelCodeT(*MF, KernelInfo);
}
void AMDGPUAsmPrinter::EmitFunctionEntryLabel() {
const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
const AMDGPUSubtarget &STM = MF->getSubtarget<AMDGPUSubtarget>();
- if (MFI->isKernel() && STM.isAmdCodeObjectV2()) {
+ if (MFI->isKernel() && STM.isAmdCodeObjectV2(*MF)) {
AMDGPUTargetStreamer *TS =
static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer());
SmallString<128> SymbolName;
// FIXME: Should use getKernArgSize
header.kernarg_segment_byte_size =
- STM.getKernArgSegmentSize(MFI->getABIArgOffset());
+ STM.getKernArgSegmentSize(MF, MFI->getABIArgOffset());
header.wavefront_sgpr_count = KernelInfo.NumSGPR;
header.workitem_vgpr_count = KernelInfo.NumVGPR;
header.workitem_private_segment_byte_size = KernelInfo.ScratchSize;
return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
}
-unsigned SISubtarget::getKernArgSegmentSize(unsigned ExplicitArgBytes) const {
- unsigned ImplicitBytes = getImplicitArgNumBytes();
+unsigned SISubtarget::getKernArgSegmentSize(const MachineFunction &MF,
+ unsigned ExplicitArgBytes) const {
+ unsigned ImplicitBytes = getImplicitArgNumBytes(MF);
if (ImplicitBytes == 0)
return ExplicitArgBytes;
return EnableXNACK;
}
- bool isAmdCodeObjectV2() const {
- return isAmdHsaOS() || isMesa3DOS();
+ /// \returns true when the OS is Mesa3D and the calling convention of the
+ /// function behind \p MF is not a shader one (per AMDGPU::isShader).
+ bool isMesaKernel(const MachineFunction &MF) const {
+   if (!isMesa3DOS())
+     return false;
+   return !AMDGPU::isShader(MF.getFunction()->getCallingConv());
+ }
+
+ /// \returns true when the OS is Mesa3D and the calling convention of the
+ /// function behind \p MF is a shader one (per AMDGPU::isShader).
+ /// Covers VS/PS/CS graphics shaders.
+ bool isMesaGfxShader(const MachineFunction &MF) const {
+   if (!isMesa3DOS())
+     return false;
+   return AMDGPU::isShader(MF.getFunction()->getCallingConv());
+ }
+
+ bool isAmdCodeObjectV2(const MachineFunction &MF) const {
+ return isAmdHsaOS() || isMesaKernel(MF); // True on an HSA OS, or for a Mesa3D function that is not a shader.
}
/// \brief Returns the offset in bytes from the start of the input buffer
/// of the first explicit kernel argument.
- unsigned getExplicitKernelArgOffset() const {
- return isAmdCodeObjectV2() ? 0 : 36;
+ unsigned getExplicitKernelArgOffset(const MachineFunction &MF) const {
+ return isAmdCodeObjectV2(MF) ? 0 : 36; // Explicit arguments start at byte 0 when isAmdCodeObjectV2(MF) holds, otherwise at byte 36.
}
unsigned getAlignmentForImplicitArgPtr() const {
return isAmdHsaOS() ? 8 : 4;
}
- unsigned getImplicitArgNumBytes() const {
- if (isMesa3DOS())
+ unsigned getImplicitArgNumBytes(const MachineFunction &MF) const {
+ if (isMesaKernel(MF))
return 16;
if (isAmdHsaOS() && isOpenCLEnv())
return 32;
return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS;
}
- unsigned getKernArgSegmentSize(unsigned ExplictArgBytes) const;
+ unsigned getKernArgSegmentSize(const MachineFunction &MF, unsigned ExplictArgBytes) const;
/// Return the maximum number of waves per SIMD for kernels using \p SGPRs SGPRs
unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;
unsigned ValBase = ArgLocs[In.getOrigArgIndex()].getLocMemOffset();
unsigned PartOffset = VA.getLocMemOffset();
- unsigned Offset = Subtarget->getExplicitKernelArgOffset() + VA.getLocMemOffset();
+ unsigned Offset = Subtarget->getExplicitKernelArgOffset(MF) + VA.getLocMemOffset();
MachinePointerInfo PtrInfo(UndefValue::get(PtrTy), PartOffset - ValBase);
SDValue Arg = DAG.getLoad(
unsigned PreloadedPrivateBufferReg = AMDGPU::NoRegister;
- if (ST.isAmdCodeObjectV2()) {
+ if (ST.isAmdCodeObjectV2(MF) || ST.isMesaGfxShader(MF)) {
PreloadedPrivateBufferReg = TRI->getPreloadedValue(
MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER);
}
}
if (ResourceRegUsed && PreloadedPrivateBufferReg != AMDGPU::NoRegister) {
- assert(ST.isAmdCodeObjectV2());
+ assert(ST.isAmdCodeObjectV2(MF) || ST.isMesaGfxShader(MF));
MRI.addLiveIn(PreloadedPrivateBufferReg);
MBB.addLiveIn(PreloadedPrivateBufferReg);
}
bool CopyBuffer = ResourceRegUsed &&
PreloadedPrivateBufferReg != AMDGPU::NoRegister &&
+ ST.isAmdCodeObjectV2(MF) &&
ScratchRsrcReg != PreloadedPrivateBufferReg;
// This needs to be careful of the copying order to avoid overwriting one of
.addReg(PreloadedPrivateBufferReg, RegState::Kill);
}
- if (ResourceRegUsed && PreloadedPrivateBufferReg == AMDGPU::NoRegister) {
- assert(!ST.isAmdCodeObjectV2());
+ if (ResourceRegUsed && (ST.isMesaGfxShader(MF) || (PreloadedPrivateBufferReg == AMDGPU::NoRegister))) {
+ assert(!ST.isAmdCodeObjectV2(MF));
const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
- unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
- unsigned Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
unsigned Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
unsigned Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);
// Use relocations to get the pointer, and setup the other bits manually.
uint64_t Rsrc23 = TII->getScratchRsrcWords23();
- BuildMI(MBB, I, DL, SMovB32, Rsrc0)
- .addExternalSymbol("SCRATCH_RSRC_DWORD0")
- .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
- BuildMI(MBB, I, DL, SMovB32, Rsrc1)
- .addExternalSymbol("SCRATCH_RSRC_DWORD1")
- .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+ if (MFI->hasPrivateMemoryInputPtr()) {
+ unsigned Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);
+
+ if (AMDGPU::isCompute(MF.getFunction()->getCallingConv())) {
+ const MCInstrDesc &Mov64 = TII->get(AMDGPU::S_MOV_B64);
+
+ BuildMI(MBB, I, DL, Mov64, Rsrc01)
+ .addReg(PreloadedPrivateBufferReg)
+ .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+ } else {
+ const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM);
+
+ PointerType *PtrTy =
+ PointerType::get(Type::getInt64Ty(MF.getFunction()->getContext()),
+ AMDGPUAS::CONSTANT_ADDRESS);
+ MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
+ auto MMO = MF.getMachineMemOperand(PtrInfo,
+ MachineMemOperand::MOLoad |
+ MachineMemOperand::MOInvariant |
+ MachineMemOperand::MODereferenceable,
+ 0, 0);
+ BuildMI(MBB, I, DL, LoadDwordX2, Rsrc01)
+ .addReg(PreloadedPrivateBufferReg)
+ .addImm(0) // offset
+ .addImm(0) // glc
+ .addMemOperand(MMO)
+ .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+ }
+ } else {
+ unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
+ unsigned Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
+
+ BuildMI(MBB, I, DL, SMovB32, Rsrc0)
+ .addExternalSymbol("SCRATCH_RSRC_DWORD0")
+ .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+
+ BuildMI(MBB, I, DL, SMovB32, Rsrc1)
+ .addExternalSymbol("SCRATCH_RSRC_DWORD1")
+ .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+
+ }
BuildMI(MBB, I, DL, SMovB32, Rsrc2)
.addImm(Rsrc23 & 0xffffffff)
if (!AMDGPU::isShader(CallConv)) {
assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
} else {
- assert(!Info->hasPrivateSegmentBuffer() && !Info->hasDispatchPtr() &&
+ assert(!Info->hasDispatchPtr() &&
!Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit() &&
!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
!Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() &&
!Info->hasWorkItemIDZ());
}
+ if (Info->hasPrivateMemoryInputPtr()) {
+ unsigned PrivateMemoryPtrReg = Info->addPrivateMemoryPtr(*TRI);
+ MF.addLiveIn(PrivateMemoryPtrReg, &AMDGPU::SReg_64RegClass);
+ CCInfo.AllocateReg(PrivateMemoryPtrReg);
+ }
+
// FIXME: How should these inputs interact with inreg / custom SGPR inputs?
if (Info->hasPrivateSegmentBuffer()) {
unsigned PrivateSegmentBufferReg = Info->addPrivateSegmentBuffer(*TRI);
if (VA.isMemLoc()) {
VT = Ins[i].VT;
EVT MemVT = VA.getLocVT();
- const unsigned Offset = Subtarget->getExplicitKernelArgOffset() +
+ const unsigned Offset = Subtarget->getExplicitKernelArgOffset(MF) +
VA.getLocMemOffset();
// The first 36 bytes of the input buffer contains information about
// thread group and global sizes.
if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
HasStackObjects = true;
- if (ST.isAmdCodeObjectV2()) {
+ if (ST.isAmdCodeObjectV2(MF)) {
if (HasStackObjects) {
// If we have stack objects, we unquestionably need the private buffer
// resource. For the Code Object V2 ABI, this will be the first 4 user
// TODO: Should this propagate fast-math-flags?
switch (IntrinsicID) {
+ case Intrinsic::amdgcn_implicit_buffer_ptr: {
+ unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER);
+ return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, Reg, VT);
+ }
case Intrinsic::amdgcn_dispatch_ptr:
case Intrinsic::amdgcn_queue_ptr: {
- if (!Subtarget->isAmdCodeObjectV2()) {
+ if (!Subtarget->isAmdCodeObjectV2(MF)) {
DiagnosticInfoUnsupported BadIntrin(
*MF.getFunction(), "unsupported hsa intrinsic without hsa target",
DL.getDebugLoc());
PrivateSegmentWaveByteOffset(false),
WorkItemIDX(false),
WorkItemIDY(false),
- WorkItemIDZ(false) {
+ WorkItemIDZ(false),
+ PrivateMemoryInputPtr(false) {
const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
const Function *F = MF.getFunction();
if (HasStackObjects || MaySpill)
PrivateSegmentWaveByteOffset = true;
- if (ST.isAmdCodeObjectV2()) {
+ if (ST.isAmdCodeObjectV2(MF)) {
if (HasStackObjects || MaySpill)
PrivateSegmentBuffer = true;
if (F->hasFnAttribute("amdgpu-dispatch-id"))
DispatchID = true;
+ } else if (ST.isMesaGfxShader(MF)) {
+ if (HasStackObjects || MaySpill)
+ PrivateMemoryInputPtr = true;
}
// We don't need to worry about accessing spills with flat instructions.
return FlatScratchInitUserSGPR;
}
+/// Reserve the next two user SGPRs for the private memory pointer.
+///
+/// Records the 64-bit super-register whose sub0 is the next free user SGPR,
+/// advances the user SGPR count by two, and returns that register.
+unsigned SIMachineFunctionInfo::addPrivateMemoryPtr(const SIRegisterInfo &TRI) {
+  // The low half (sub0) of the pair is the next unallocated user SGPR.
+  const MCPhysReg LowSGPR = getNextUserSGPR();
+  PrivateMemoryPtrUserSGPR =
+      TRI.getMatchingSuperReg(LowSGPR, AMDGPU::sub0, &AMDGPU::SReg_64RegClass);
+
+  // A 64-bit value occupies two 32-bit SGPRs.
+  NumUserSGPRs += 2;
+  return PrivateMemoryPtrUserSGPR;
+}
+
SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg (
MachineFunction *MF,
unsigned FrameIndex,
unsigned ScratchRSrcReg;
unsigned ScratchWaveOffsetReg;
+ // Input registers for non-HSA ABI
+ unsigned PrivateMemoryPtrUserSGPR;
+
// Input registers setup for the HSA ABI.
// User SGPRs in allocation order.
unsigned PrivateSegmentBufferUserSGPR;
bool WorkItemIDY : 1;
bool WorkItemIDZ : 1;
+ // Private memory buffer pointer input.
+ // Compute shaders receive the buffer address directly in sgpr[0:1].
+ // Other shaders receive in sgpr[0:1] a pointer from which 64 bits are loaded.
+ bool PrivateMemoryInputPtr : 1;
+
MCPhysReg getNextUserSGPR() const {
assert(NumSystemSGPRs == 0 && "System SGPRs must be added after user SGPRs");
return AMDGPU::SGPR0 + NumUserSGPRs;
unsigned addKernargSegmentPtr(const SIRegisterInfo &TRI);
unsigned addDispatchID(const SIRegisterInfo &TRI);
unsigned addFlatScratchInit(const SIRegisterInfo &TRI);
+ unsigned addPrivateMemoryPtr(const SIRegisterInfo &TRI);
// Add system SGPRs.
unsigned addWorkGroupIDX() {
return WorkItemIDZ;
}
+ /// \returns the PrivateMemoryInputPtr flag, i.e. whether this function
+ /// expects the private memory pointer as an input.
+ bool hasPrivateMemoryInputPtr() const { return PrivateMemoryInputPtr; }
+
unsigned getNumUserSGPRs() const {
return NumUserSGPRs;
}
return QueuePtrUserSGPR;
}
+ /// \returns the register recorded for the private memory pointer input.
+ unsigned getPrivateMemoryPtrUserSGPR() const { return PrivateMemoryPtrUserSGPR; }
+
bool hasSpilledSGPRs() const {
return HasSpilledSGPRs;
}
case SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET:
return MFI->PrivateSegmentWaveByteOffsetSystemSGPR;
case SIRegisterInfo::PRIVATE_SEGMENT_BUFFER:
- assert(ST.isAmdCodeObjectV2() &&
- "Non-CodeObjectV2 ABI currently uses relocations");
- assert(MFI->hasPrivateSegmentBuffer());
- return MFI->PrivateSegmentBufferUserSGPR;
+ if (ST.isAmdCodeObjectV2(MF)) {
+ assert(MFI->hasPrivateSegmentBuffer());
+ return MFI->PrivateSegmentBufferUserSGPR;
+ }
+ assert(MFI->hasPrivateMemoryInputPtr());
+ return MFI->PrivateMemoryPtrUserSGPR;
case SIRegisterInfo::KERNARG_SEGMENT_PTR:
assert(MFI->hasKernargSegmentPtr());
return MFI->KernargSegmentPtrUserSGPR;