// Print comments that apply to both callable functions and entry points.
void AMDGPUAsmPrinter::emitCommonFunctionComments(
uint32_t NumVGPR,
+ Optional<uint32_t> NumAGPR,
+ uint32_t TotalNumVGPR,
uint32_t NumSGPR,
uint64_t ScratchSize,
uint64_t CodeSize,
OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false);
OutStreamer->emitRawComment(" NumSgprs: " + Twine(NumSGPR), false);
OutStreamer->emitRawComment(" NumVgprs: " + Twine(NumVGPR), false);
+ if (NumAGPR) {
+ OutStreamer->emitRawComment(" NumAgprs: " + Twine(*NumAGPR), false);
+ OutStreamer->emitRawComment(" TotalNumVgprs: " + Twine(TotalNumVGPR),
+ false);
+ }
OutStreamer->emitRawComment(" ScratchSize: " + Twine(ScratchSize), false);
OutStreamer->emitRawComment(" MemoryBound: " + Twine(MFI->isMemoryBound()),
false);
SIFunctionResourceInfo &Info = CallGraphResourceInfo[&MF.getFunction()];
emitCommonFunctionComments(
Info.NumVGPR,
+ STM.hasMAIInsts() ? Info.NumAGPR : Optional<uint32_t>(),
+ Info.getTotalNumVGPRs(STM),
Info.getTotalNumSGPRs(MF.getSubtarget<GCNSubtarget>()),
Info.PrivateSegmentSize,
getFunctionCodeSize(MF), MFI);
}
OutStreamer->emitRawComment(" Kernel info:", false);
- emitCommonFunctionComments(CurrentProgramInfo.NumVGPR,
+ emitCommonFunctionComments(CurrentProgramInfo.NumArchVGPR,
+ STM.hasMAIInsts()
+ ? CurrentProgramInfo.NumAccVGPR
+ : Optional<uint32_t>(),
+ CurrentProgramInfo.NumVGPR,
CurrentProgramInfo.NumSGPR,
CurrentProgramInfo.ScratchSize,
getFunctionCodeSize(MF), MFI);
UsesVCC, UsesFlatScratch);
}
+int32_t AMDGPUAsmPrinter::SIFunctionResourceInfo::getTotalNumVGPRs(
+ const GCNSubtarget &ST) const {
+ return std::max(NumVGPR, NumAGPR);
+}
+
AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
const MachineFunction &MF) const {
SIFunctionResourceInfo Info;
HighestVGPRReg = Reg;
break;
}
- MCPhysReg AReg = AMDGPU::AGPR0 + TRI.getHWRegIndex(Reg);
- if (MRI.isPhysRegUsed(AReg)) {
- HighestVGPRReg = AReg;
- break;
+ }
+
+ if (ST.hasMAIInsts()) {
+ MCPhysReg HighestAGPRReg = AMDGPU::NoRegister;
+ for (MCPhysReg Reg : reverse(AMDGPU::AGPR_32RegClass.getRegisters())) {
+ if (MRI.isPhysRegUsed(Reg)) {
+ HighestAGPRReg = Reg;
+ break;
+ }
}
+ Info.NumAGPR = HighestAGPRReg == AMDGPU::NoRegister ? 0 :
+ TRI.getHWRegIndex(HighestAGPRReg) + 1;
}
MCPhysReg HighestSGPRReg = AMDGPU::NoRegister;
}
int32_t MaxVGPR = -1;
+ int32_t MaxAGPR = -1;
int32_t MaxSGPR = -1;
uint64_t CalleeFrameSize = 0;
for (const MachineOperand &MO : MI.operands()) {
unsigned Width = 0;
bool IsSGPR = false;
+ bool IsAGPR = false;
if (!MO.isReg())
continue;
Width = 1;
} else if (AMDGPU::AGPR_32RegClass.contains(Reg)) {
IsSGPR = false;
+ IsAGPR = true;
Width = 1;
} else if (AMDGPU::SReg_64RegClass.contains(Reg)) {
assert(!AMDGPU::TTMP_64RegClass.contains(Reg) &&
Width = 2;
} else if (AMDGPU::AReg_64RegClass.contains(Reg)) {
IsSGPR = false;
+ IsAGPR = true;
Width = 2;
} else if (AMDGPU::VReg_96RegClass.contains(Reg)) {
IsSGPR = false;
Width = 4;
} else if (AMDGPU::AReg_128RegClass.contains(Reg)) {
IsSGPR = false;
+ IsAGPR = true;
Width = 4;
} else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
assert(!AMDGPU::TTMP_256RegClass.contains(Reg) &&
Width = 16;
} else if (AMDGPU::AReg_512RegClass.contains(Reg)) {
IsSGPR = false;
+ IsAGPR = true;
Width = 16;
} else if (AMDGPU::SReg_1024RegClass.contains(Reg)) {
IsSGPR = true;
Width = 32;
} else if (AMDGPU::AReg_1024RegClass.contains(Reg)) {
IsSGPR = false;
+ IsAGPR = true;
Width = 32;
} else {
llvm_unreachable("Unknown register class");
int MaxUsed = HWReg + Width - 1;
if (IsSGPR) {
MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR;
+ } else if (IsAGPR) {
+ MaxAGPR = MaxUsed > MaxAGPR ? MaxUsed : MaxAGPR;
} else {
MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR;
}
47 - IsaInfo::getNumExtraSGPRs(&ST, true, ST.hasFlatAddressSpace());
MaxSGPR = std::max(MaxSGPR, MaxSGPRGuess);
MaxVGPR = std::max(MaxVGPR, 23);
+ MaxAGPR = std::max(MaxAGPR, 23);
CalleeFrameSize = std::max(CalleeFrameSize, UINT64_C(16384));
Info.UsesVCC = true;
MaxSGPR = std::max(I->second.NumExplicitSGPR - 1, MaxSGPR);
MaxVGPR = std::max(I->second.NumVGPR - 1, MaxVGPR);
+ MaxAGPR = std::max(I->second.NumAGPR - 1, MaxAGPR);
CalleeFrameSize
= std::max(I->second.PrivateSegmentSize, CalleeFrameSize);
Info.UsesVCC |= I->second.UsesVCC;
Info.NumExplicitSGPR = MaxSGPR + 1;
Info.NumVGPR = MaxVGPR + 1;
+ Info.NumAGPR = MaxAGPR + 1;
Info.PrivateSegmentSize += CalleeFrameSize;
return Info;
void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
const MachineFunction &MF) {
SIFunctionResourceInfo Info = analyzeResourceUsage(MF);
+ const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
- ProgInfo.NumVGPR = Info.NumVGPR;
+ ProgInfo.NumArchVGPR = Info.NumVGPR;
+ ProgInfo.NumAccVGPR = Info.NumAGPR;
+ ProgInfo.NumVGPR = Info.getTotalNumVGPRs(STM);
ProgInfo.NumSGPR = Info.NumExplicitSGPR;
ProgInfo.ScratchSize = Info.PrivateSegmentSize;
ProgInfo.VCCUsed = Info.UsesVCC;
MF.getFunction().getContext().diagnose(DiagStackSize);
}
- const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
// TODO(scott.linder): The calculations related to SGPR/VGPR blocks are
-; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s
-declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32)
+; GCN-LABEL: {{^}}kernel_32_agprs:
+; GCN: .amdhsa_next_free_vgpr 32
+; GCN: NumVgprs: 9
+; GCN: NumAgprs: 32
+; GCN: TotalNumVgprs: 32
+; GCN: VGPRBlocks: 7
+; GCN: NumVGPRsForWavesPerEU: 32
+; GCN: Occupancy: 8
+define amdgpu_kernel void @kernel_32_agprs() {
+bb:
+ call void asm sideeffect "", "~{v8}" ()
+ call void asm sideeffect "", "~{a31}" ()
+ ret void
+}
+
+; GCN-LABEL: {{^}}kernel_0_agprs:
+; GCN: .amdhsa_next_free_vgpr 1
+; GCN: NumVgprs: 1
+; GCN: NumAgprs: 0
+; GCN: TotalNumVgprs: 1
+; GCN: VGPRBlocks: 0
+; GCN: NumVGPRsForWavesPerEU: 1
+; GCN: Occupancy: 10
+define amdgpu_kernel void @kernel_0_agprs() {
+bb:
+ call void asm sideeffect "", "~{v0}" ()
+ ret void
+}
+
+; GCN-LABEL: {{^}}kernel_40_vgprs:
+; GCN: .amdhsa_next_free_vgpr 40
+; GCN: NumVgprs: 40
+; GCN: NumAgprs: 16
+; GCN: TotalNumVgprs: 40
+; GCN: VGPRBlocks: 9
+; GCN: NumVGPRsForWavesPerEU: 40
+; GCN: Occupancy: 6
+define amdgpu_kernel void @kernel_40_vgprs() {
+bb:
+ call void asm sideeffect "", "~{v39}" ()
+ call void asm sideeffect "", "~{a15}" ()
+ ret void
+}
+
+; GCN-LABEL: {{^}}func_32_agprs:
+; GCN: NumVgprs: 9
+; GCN: NumAgprs: 32
+; GCN: TotalNumVgprs: 32
+define void @func_32_agprs() #0 {
+bb:
+ call void asm sideeffect "", "~{v8}" ()
+ call void asm sideeffect "", "~{a31}" ()
+ ret void
+}
+
+; GCN-LABEL: {{^}}func_32_vgprs:
+; GCN: NumVgprs: 32
+; GCN: NumAgprs: 9
+; GCN: TotalNumVgprs: 32
+define void @func_32_vgprs() {
+bb:
+ call void asm sideeffect "", "~{v31}" ()
+ call void asm sideeffect "", "~{a8}" ()
+ ret void
+}
-; GCN-LABEL: {{^}}test_32_agprs:
-; GCN: v_mfma_f32_32x32x1f32 a[0:31], {{v[0-9]+}}, {{v[0-9]+}},
-; GCN-NOT: v28
-; GCN: NumVgprs: 32
-; GCN: VGPRBlocks: 7
-define amdgpu_kernel void @test_32_agprs(<32 x float> addrspace(1)* %arg) {
+; GCN-LABEL: {{^}}func_0_agprs:
+; GCN: NumVgprs: 1
+; GCN: NumAgprs: 0
+; GCN: TotalNumVgprs: 1
+define amdgpu_kernel void @func_0_agprs() {
bb:
- %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> <float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0>, i32 0, i32 0, i32 0)
- store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
+ call void asm sideeffect "", "~{v0}" ()
ret void
}
+
+; GCN-LABEL: {{^}}kernel_max_gprs:
+; GCN: .amdhsa_next_free_vgpr 256
+; GCN: NumVgprs: 256
+; GCN: NumAgprs: 256
+; GCN: TotalNumVgprs: 256
+; GCN: VGPRBlocks: 63
+; GCN: NumVGPRsForWavesPerEU: 256
+; GCN: Occupancy: 1
+define amdgpu_kernel void @kernel_max_gprs() {
+bb:
+ call void asm sideeffect "", "~{v255}" ()
+ call void asm sideeffect "", "~{a255}" ()
+ ret void
+}
+
+; GCN-LABEL: {{^}}kernel_call_func_32_agprs:
+; GCN: .amdhsa_next_free_vgpr 32
+; GCN: NumVgprs: 9
+; GCN: NumAgprs: 32
+; GCN: TotalNumVgprs: 32
+; GCN: VGPRBlocks: 7
+; GCN: NumVGPRsForWavesPerEU: 32
+; GCN: Occupancy: 8
+define amdgpu_kernel void @kernel_call_func_32_agprs() {
+bb:
+ call void @func_32_agprs() #0
+ ret void
+}
+
+; GCN-LABEL: {{^}}func_call_func_32_agprs:
+; GCN: NumVgprs: 9
+; GCN: NumAgprs: 32
+; GCN: TotalNumVgprs: 32
+define void @func_call_func_32_agprs() {
+bb:
+ call void @func_32_agprs() #0
+ ret void
+}
+
+declare void @undef_func()
+
+; GCN-LABEL: {{^}}kernel_call_undef_func:
+; GCN: .amdhsa_next_free_vgpr 24
+; GCN: NumVgprs: 24
+; GCN: NumAgprs: 24
+; GCN: TotalNumVgprs: 24
+; GCN: VGPRBlocks: 5
+; GCN: NumVGPRsForWavesPerEU: 24
+; GCN: Occupancy: 10
+define amdgpu_kernel void @kernel_call_undef_func() {
+bb:
+ call void @undef_func()
+ ret void
+}
+
+attributes #0 = { nounwind noinline }