From 74ac54ec5bb9c1e856fc049330591777835c7f13 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Mon, 17 Apr 2017 19:48:30 +0000 Subject: [PATCH] AMDGPU: Use MachineRegisterInfo to find max used register Avoid looping through program to determine register counts. This avoids needing to look at regmask operands. Also fixes some counting errors with flat_scr when there are no stack objects. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@300482 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 201 +++++++----------- lib/Target/AMDGPU/AMDGPUAsmPrinter.h | 4 +- .../code-object-metadata-from-llvm-ir-full.ll | 6 +- test/CodeGen/AMDGPU/exceed-max-sgprs.ll | 2 +- test/CodeGen/AMDGPU/flat-scratch-reg.ll | 59 +++-- 5 files changed, 128 insertions(+), 144 deletions(-) diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index 0446655830d..8d04d15c7bb 100644 --- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -221,8 +221,8 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { OutStreamer->emitRawComment(" Kernel info:", false); - OutStreamer->emitRawComment(" codeLenInByte = " + Twine(KernelInfo.CodeLen), - false); + OutStreamer->emitRawComment(" codeLenInByte = " + + Twine(getFunctionCodeSize(MF)), false); OutStreamer->emitRawComment(" NumSgprs: " + Twine(KernelInfo.NumSGPR), false); OutStreamer->emitRawComment(" NumVgprs: " + Twine(KernelInfo.NumVGPR), @@ -317,7 +317,7 @@ void AMDGPUAsmPrinter::EmitProgramInfoR600(const MachineFunction &MF) { const MachineOperand &MO = MI.getOperand(op_idx); if (!MO.isReg()) continue; - unsigned HWReg = RI->getEncodingValue(MO.getReg()) & 0xff; + unsigned HWReg = RI->getHWRegIndex(MO.getReg()); // Register with value > 127 aren't GPR if (HWReg > 127) @@ -360,18 +360,12 @@ void AMDGPUAsmPrinter::EmitProgramInfoR600(const MachineFunction &MF) { } } -void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, - const MachineFunction &MF) const { +uint64_t AMDGPUAsmPrinter::getFunctionCodeSize(const MachineFunction &MF) const { const SISubtarget &STM = MF.getSubtarget(); - const SIMachineFunctionInfo *MFI = MF.getInfo(); - uint64_t CodeSize = 0; - unsigned MaxSGPR = 0; - unsigned MaxVGPR = 0; - bool VCCUsed = false; - bool FlatUsed = false; - const SIRegisterInfo *RI = STM.getRegisterInfo(); const SIInstrInfo *TII = STM.getInstrInfo(); + uint64_t CodeSize = 0; + for (const MachineBasicBlock &MBB : MF) { for (const MachineInstr &MI : MBB) { // TODO: CodeSize should account for multiple functions. @@ -380,122 +374,86 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, if (MI.isDebugValue()) continue; - if (isVerbose()) - CodeSize += TII->getInstSizeInBytes(MI); + CodeSize += TII->getInstSizeInBytes(MI); + } + } - unsigned numOperands = MI.getNumOperands(); - for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) { - const MachineOperand &MO = MI.getOperand(op_idx); - unsigned width = 0; - bool isSGPR = false; + return CodeSize; +} - if (!MO.isReg()) - continue; +static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI, + const SIInstrInfo &TII, + unsigned Reg) { + for (const MachineOperand &UseOp : MRI.reg_operands(Reg)) { + if (!UseOp.isImplicit() || !TII.isFLAT(*UseOp.getParent())) + return true; + } - unsigned reg = MO.getReg(); - switch (reg) { - case AMDGPU::EXEC: - case AMDGPU::EXEC_LO: - case AMDGPU::EXEC_HI: - case AMDGPU::SCC: - case AMDGPU::M0: - case AMDGPU::SRC_SHARED_BASE: - case AMDGPU::SRC_SHARED_LIMIT: - case AMDGPU::SRC_PRIVATE_BASE: - case AMDGPU::SRC_PRIVATE_LIMIT: - continue; + return false; +} - case AMDGPU::VCC: - case AMDGPU::VCC_LO: - case AMDGPU::VCC_HI: - VCCUsed = true; - continue; +void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, + const MachineFunction &MF) const { + const SISubtarget &STM = MF.getSubtarget(); + const SIMachineFunctionInfo *MFI = MF.getInfo(); + const MachineRegisterInfo &MRI = MF.getRegInfo(); + const SIInstrInfo *TII = STM.getInstrInfo(); + const SIRegisterInfo *RI = &TII->getRegisterInfo(); - case AMDGPU::FLAT_SCR: - case AMDGPU::FLAT_SCR_LO: - case AMDGPU::FLAT_SCR_HI: - // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat - // instructions aren't used to access the scratch buffer. - if (MFI->hasFlatScratchInit()) - FlatUsed = true; - continue; - case AMDGPU::TBA: - case AMDGPU::TBA_LO: - case AMDGPU::TBA_HI: - case AMDGPU::TMA: - case AMDGPU::TMA_LO: - case AMDGPU::TMA_HI: - llvm_unreachable("trap handler registers should not be used"); - - default: - break; - } - - if (AMDGPU::SReg_32RegClass.contains(reg)) { - assert(!AMDGPU::TTMP_32RegClass.contains(reg) && - "trap handler registers should not be used"); - isSGPR = true; - width = 1; - } else if (AMDGPU::VGPR_32RegClass.contains(reg)) { - isSGPR = false; - width = 1; - } else if (AMDGPU::SReg_64RegClass.contains(reg)) { - assert(!AMDGPU::TTMP_64RegClass.contains(reg) && - "trap handler registers should not be used"); - isSGPR = true; - width = 2; - } else if (AMDGPU::VReg_64RegClass.contains(reg)) { - isSGPR = false; - width = 2; - } else if (AMDGPU::VReg_96RegClass.contains(reg)) { - isSGPR = false; - width = 3; - } else if (AMDGPU::SReg_128RegClass.contains(reg)) { - isSGPR = true; - width = 4; - } else if (AMDGPU::VReg_128RegClass.contains(reg)) { - isSGPR = false; - width = 4; - } else if (AMDGPU::SReg_256RegClass.contains(reg)) { - isSGPR = true; - width = 8; - } else if (AMDGPU::VReg_256RegClass.contains(reg)) { - isSGPR = false; - width = 8; - } else if (AMDGPU::SReg_512RegClass.contains(reg)) { - isSGPR = true; - width = 16; - } else if (AMDGPU::VReg_512RegClass.contains(reg)) { - isSGPR = false; - width = 16; - } else { - llvm_unreachable("Unknown register class"); - } - unsigned hwReg = RI->getEncodingValue(reg) & 0xff; - unsigned maxUsed = hwReg + width - 1; - if (isSGPR) { - MaxSGPR = maxUsed > MaxSGPR ? maxUsed : MaxSGPR; - } else { - MaxVGPR = maxUsed > MaxVGPR ? maxUsed : MaxVGPR; - } - } + MCPhysReg NumVGPRReg = AMDGPU::NoRegister; + for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) { + if (MRI.isPhysRegUsed(Reg)) { + NumVGPRReg = Reg; + break; + } + } + + MCPhysReg NumSGPRReg = AMDGPU::NoRegister; + for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) { + if (MRI.isPhysRegUsed(Reg)) { + NumSGPRReg = Reg; + break; } } + // We found the maximum register index. They start at 0, so add one to get the + // number of registers. + ProgInfo.NumVGPR = NumVGPRReg == AMDGPU::NoRegister ? 0 : + RI->getHWRegIndex(NumVGPRReg) + 1; + ProgInfo.NumSGPR = NumSGPRReg == AMDGPU::NoRegister ? 0 : + RI->getHWRegIndex(NumSGPRReg) + 1; unsigned ExtraSGPRs = 0; - if (VCCUsed) + ProgInfo.VCCUsed = MRI.isPhysRegUsed(AMDGPU::VCC_LO) || + MRI.isPhysRegUsed(AMDGPU::VCC_HI); + if (ProgInfo.VCCUsed) ExtraSGPRs = 2; + ProgInfo.FlatUsed = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) || + MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI); + + // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat + // instructions aren't used to access the scratch buffer. Inline assembly + // may need it though. + // + // If we only have implicit uses of flat_scr on flat instructions, it is not + // really needed. + if (ProgInfo.FlatUsed && !MFI->hasFlatScratchInit() && + (!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR) && + !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_LO) && + !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_HI))) { + ProgInfo.FlatUsed = false; + } + if (STM.getGeneration() < SISubtarget::VOLCANIC_ISLANDS) { - if (FlatUsed) + if (ProgInfo.FlatUsed) ExtraSGPRs = 4; } else { if (STM.isXNACKEnabled()) ExtraSGPRs = 4; - if (FlatUsed) + if (ProgInfo.FlatUsed) ExtraSGPRs = 6; } @@ -505,34 +463,29 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS && !STM.hasSGPRInitBug()) { unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs(); - if (MaxSGPR + 1 > MaxAddressableNumSGPRs) { + if (ProgInfo.NumSGPR > MaxAddressableNumSGPRs) { // This can happen due to a compiler bug or when using inline asm. LLVMContext &Ctx = MF.getFunction()->getContext(); DiagnosticInfoResourceLimit Diag(*MF.getFunction(), "addressable scalar registers", - MaxSGPR + 1, DS_Error, + ProgInfo.NumSGPR, DS_Error, DK_ResourceLimit, MaxAddressableNumSGPRs); Ctx.diagnose(Diag); - MaxSGPR = MaxAddressableNumSGPRs - 1; + ProgInfo.NumSGPR = MaxAddressableNumSGPRs - 1; } } // Account for extra SGPRs and VGPRs reserved for debugger use. - MaxSGPR += ExtraSGPRs; - MaxVGPR += ExtraVGPRs; - - // We found the maximum register index. They start at 0, so add one to get the - // number of registers. - ProgInfo.NumSGPR = MaxSGPR + 1; - ProgInfo.NumVGPR = MaxVGPR + 1; + ProgInfo.NumSGPR += ExtraSGPRs; + ProgInfo.NumVGPR += ExtraVGPRs; // Adjust number of registers used to meet default/requested minimum/maximum // number of waves per execution unit request. ProgInfo.NumSGPRsForWavesPerEU = std::max( - ProgInfo.NumSGPR, STM.getMinNumSGPRs(MFI->getMaxWavesPerEU())); + std::max(ProgInfo.NumSGPR, 1u), STM.getMinNumSGPRs(MFI->getMaxWavesPerEU())); ProgInfo.NumVGPRsForWavesPerEU = std::max( - ProgInfo.NumVGPR, STM.getMinNumVGPRs(MFI->getMaxWavesPerEU())); + std::max(ProgInfo.NumVGPR, 1u), STM.getMinNumVGPRs(MFI->getMaxWavesPerEU())); if (STM.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS || STM.hasSGPRInitBug()) { @@ -584,7 +537,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, ProgInfo.VGPRBlocks = ProgInfo.VGPRBlocks / STM.getVGPREncodingGranule() - 1; // Record first reserved VGPR and number of reserved VGPRs. - ProgInfo.ReservedVGPRFirst = STM.debuggerReserveRegs() ? MaxVGPR + 1 : 0; + ProgInfo.ReservedVGPRFirst = STM.debuggerReserveRegs() ? ProgInfo.NumVGPR : 0; ProgInfo.ReservedVGPRCount = STM.getReservedNumVGPRs(MF); // Update DebuggerWavefrontPrivateSegmentOffsetSGPR and @@ -609,10 +562,6 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, const MachineFrameInfo &FrameInfo = MF.getFrameInfo(); ProgInfo.ScratchSize = FrameInfo.getStackSize(); - ProgInfo.FlatUsed = FlatUsed; - ProgInfo.VCCUsed = VCCUsed; - ProgInfo.CodeLen = CodeSize; - unsigned LDSAlignShift; if (STM.getGeneration() < SISubtarget::SEA_ISLANDS) { // LDS is allocated in 64 dword blocks. diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/lib/Target/AMDGPU/AMDGPUAsmPrinter.h index 13425c8b2a0..8c86dea4b88 100644 --- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.h +++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.h @@ -55,7 +55,7 @@ private: uint32_t NumVGPR = 0; uint32_t NumSGPR = 0; - uint32_t LDSSize; + uint32_t LDSSize = 0; bool FlatUsed = false; // Number of SGPRs that meets number of waves per execution unit request. @@ -85,11 +85,11 @@ private: // Bonus information for debugging. bool VCCUsed = false; - uint64_t CodeLen = 0; SIProgramInfo() = default; }; + uint64_t getFunctionCodeSize(const MachineFunction &MF) const; void getSIProgramInfo(SIProgramInfo &Out, const MachineFunction &MF) const; void getAmdKernelCode(amd_kernel_code_t &Out, const SIProgramInfo &KernelInfo, const MachineFunction &MF) const; diff --git a/test/CodeGen/AMDGPU/code-object-metadata-from-llvm-ir-full.ll b/test/CodeGen/AMDGPU/code-object-metadata-from-llvm-ir-full.ll index 88ba310a92c..a68ddabd956 100644 --- a/test/CodeGen/AMDGPU/code-object-metadata-from-llvm-ir-full.ll +++ b/test/CodeGen/AMDGPU/code-object-metadata-from-llvm-ir-full.ll @@ -1253,8 +1253,8 @@ define amdgpu_kernel void @test_pointee_align(i64 addrspace(1)* %a, ; NOTES-NEXT: Owner Data size Description ; NOTES-NEXT: AMD 0x00000008 Unknown note type: (0x00000001) ; NOTES-NEXT: AMD 0x0000001b Unknown note type: (0x00000003) -; GFX700: AMD 0x00009171 Unknown note type: (0x0000000a) -; GFX800: AMD 0x00009190 Unknown note type: (0x0000000a) -; GFX900: AMD 0x00009171 Unknown note type: (0x0000000a) +; GFX700: AMD 0x00008b06 Unknown note type: (0x0000000a) +; GFX800: AMD 0x00008e6a Unknown note type: (0x0000000a) +; GFX900: AMD 0x00008b06 Unknown note type: (0x0000000a) ; PARSER: AMDGPU Code Object Metadata Parser Test: PASS diff --git a/test/CodeGen/AMDGPU/exceed-max-sgprs.ll b/test/CodeGen/AMDGPU/exceed-max-sgprs.ll index 40d115bfc06..207dfce75f1 100644 --- a/test/CodeGen/AMDGPU/exceed-max-sgprs.ll +++ b/test/CodeGen/AMDGPU/exceed-max-sgprs.ll @@ -38,7 +38,7 @@ define amdgpu_kernel void @use_too_many_sgprs_bonaire() #1 { ret void } -; ERROR: error: scalar registers limit of 104 exceeded (106) in use_too_many_sgprs_bonaire_flat_scr +; ERROR: error: scalar registers limit of 104 exceeded (108) in use_too_many_sgprs_bonaire_flat_scr define amdgpu_kernel void @use_too_many_sgprs_bonaire_flat_scr() #1 { call void asm sideeffect "", "~{SGPR0_SGPR1_SGPR2_SGPR3_SGPR4_SGPR5_SGPR6_SGPR7}" () call void asm sideeffect "", "~{SGPR8_SGPR9_SGPR10_SGPR11_SGPR12_SGPR13_SGPR14_SGPR15}" () diff --git a/test/CodeGen/AMDGPU/flat-scratch-reg.ll b/test/CodeGen/AMDGPU/flat-scratch-reg.ll index 23f40daf3d2..5705cbc9944 100644 --- a/test/CodeGen/AMDGPU/flat-scratch-reg.ll +++ b/test/CodeGen/AMDGPU/flat-scratch-reg.ll @@ -44,12 +44,12 @@ entry: ; HSA-VI-NOXNACK: is_xnack_enabled = 0 ; HSA-VI-XNACK: is_xnack_enabled = 1 -; CI: ; NumSgprs: 8 -; VI-NOXNACK: ; NumSgprs: 8 -; VI-XNACK: ; NumSgprs: 12 -; HSA-CI: ; NumSgprs: 8 -; HSA-VI-NOXNACK: ; NumSgprs: 8 -; HSA-VI-XNACK: ; NumSgprs: 12 +; CI: ; NumSgprs: 12 +; VI-NOXNACK: ; NumSgprs: 14 +; VI-XNACK: ; NumSgprs: 14 +; HSA-CI: ; NumSgprs: 12 +; HSA-VI-NOXNACK: ; NumSgprs: 14 +; HSA-VI-XNACK: ; NumSgprs: 14 define amdgpu_kernel void @no_vcc_flat() { entry: call void asm sideeffect "", "~{SGPR7},~{FLAT_SCR}"() @@ -60,14 +60,49 @@ entry: ; HSA-NOXNACK: is_xnack_enabled = 0 ; HSA-XNACK: is_xnack_enabled = 1 -; CI: ; NumSgprs: 10 -; VI-NOXNACK: ; NumSgprs: 10 -; VI-XNACK: ; NumSgprs: 12 -; HSA-CI: ; NumSgprs: 10 -; HSA-VI-NOXNACK: ; NumSgprs: 10 -; HSA-VI-XNACK: ; NumSgprs: 12 +; CI: ; NumSgprs: 12 +; VI-NOXNACK: ; NumSgprs: 14 +; VI-XNACK: ; NumSgprs: 14 +; HSA-CI: ; NumSgprs: 12 +; HSA-VI-NOXNACK: ; NumSgprs: 14 +; HSA-VI-XNACK: ; NumSgprs: 14 define amdgpu_kernel void @vcc_flat() { entry: call void asm sideeffect "", "~{SGPR7},~{VCC},~{FLAT_SCR}"() ret void } + +; Make sure used SGPR count for flat_scr is correct when there is no +; scratch usage and implicit flat uses. + +; GCN-LABEL: {{^}}use_flat_scr: +; CI: NumSgprs: 4 +; VI-NOXNACK: NumSgprs: 6 +; VI-XNACK: NumSgprs: 6 +define amdgpu_kernel void @use_flat_scr() #0 { +entry: + call void asm sideeffect "; clobber ", "~{FLAT_SCR}"() + ret void +} + +; GCN-LABEL: {{^}}use_flat_scr_lo: +; CI: NumSgprs: 4 +; VI-NOXNACK: NumSgprs: 6 +; VI-XNACK: NumSgprs: 6 +define amdgpu_kernel void @use_flat_scr_lo() #0 { +entry: + call void asm sideeffect "; clobber ", "~{FLAT_SCR_LO}"() + ret void +} + +; GCN-LABEL: {{^}}use_flat_scr_hi: +; CI: NumSgprs: 4 +; VI-NOXNACK: NumSgprs: 6 +; VI-XNACK: NumSgprs: 6 +define amdgpu_kernel void @use_flat_scr_hi() #0 { +entry: + call void asm sideeffect "; clobber ", "~{FLAT_SCR_HI}"() + ret void +} + +attributes #0 = { nounwind } -- 2.40.0