using namespace llvm;
+#define DEBUG_TYPE "frame-info"
+
+
static ArrayRef<MCPhysReg> getAllSGPR128(const GCNSubtarget &ST,
const MachineFunction &MF) {
return makeArrayRef(AMDGPU::SGPR_128RegClass.begin(),
ST.getMaxNumSGPRs(MF));
}
+// Find a scratch register that we can use at the start of the prologue to
+// re-align the stack pointer. We avoid using callee-save registers since they
+// may appear to be free when this is called from canUseAsPrologue (during
+// shrink wrapping), but then no longer be free when this is called from
+// emitPrologue.
+//
+// FIXME: This is a bit conservative, since in the above case we could use one
+// of the callee-save registers as a scratch temp to re-align the stack pointer,
+// but we would then have to make sure that we were in fact saving at least one
+// callee-save register in the prologue, which is additional complexity that
+// doesn't seem worth the benefit.
+static unsigned findScratchNonCalleeSaveRegister(MachineRegisterInfo &MRI,
+ LivePhysRegs &LiveRegs,
+ const TargetRegisterClass &RC,
+ bool Unused = false) {
+ // Mark callee saved registers as used so we will not choose them.
+ const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs();
+ for (unsigned i = 0; CSRegs[i]; ++i)
+ LiveRegs.addReg(CSRegs[i]);
+
+ if (Unused) {
+ // We are looking for a register that can be used throughout the entire
+ // function, so any use is unacceptable.
+ for (unsigned Reg : RC) {
+ if (!MRI.isPhysRegUsed(Reg) && LiveRegs.available(MRI, Reg))
+ return Reg;
+ }
+ } else {
+ for (unsigned Reg : RC) {
+ if (LiveRegs.available(MRI, Reg))
+ return Reg;
+ }
+ }
+
+ // When an unused register is required, the caller can tolerate failure and
+ // has an alternative plan. In other contexts, this must succeed.
+ if (!Unused)
+ report_fatal_error("failed to find free scratch register");
+
+ return AMDGPU::NoRegister;
+}
+
+static MCPhysReg findUnusedSGPRNonCalleeSaved(MachineRegisterInfo &MRI) {
+ LivePhysRegs LiveRegs;
+ LiveRegs.init(*MRI.getTargetRegisterInfo());
+ return findScratchNonCalleeSaveRegister(
+ MRI, LiveRegs, AMDGPU::SReg_32_XM0_XEXECRegClass, true);
+}
+
+// We need to specially emit stack operations here because a different frame
+// register is used than the one getFrameRegister would pick for the rest of
+// the function.
+static void buildPrologSpill(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ const SIInstrInfo *TII, unsigned SpillReg,
+ unsigned ScratchRsrcReg, unsigned SPReg, int FI) {
+ MachineFunction *MF = MBB.getParent();
+ MachineFrameInfo &MFI = MF->getFrameInfo();
+
+ int64_t Offset = MFI.getObjectOffset(FI);
+
+ MachineMemOperand *MMO = MF->getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOStore, 4,
+ MFI.getObjectAlignment(FI));
+
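+ // The MUBUF immediate offset field is only 12 bits, so use the _OFFSET form
+ // when the frame object offset fits, and otherwise materialize the offset in
+ // a scratch VGPR and use the _OFFEN form below.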
+ if (isUInt<12>(Offset)) {
+ BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::BUFFER_STORE_DWORD_OFFSET))
+ .addReg(SpillReg, RegState::Kill)
+ .addReg(ScratchRsrcReg)
+ .addReg(SPReg)
+ .addImm(Offset)
+ .addImm(0) // glc
+ .addImm(0) // slc
+ .addImm(0) // tfe
+ .addImm(0) // dlc
+ .addMemOperand(MMO);
+ return;
+ }
+
+ MCPhysReg OffsetReg = findScratchNonCalleeSaveRegister(
+ MF->getRegInfo(), LiveRegs, AMDGPU::VGPR_32RegClass);
+
+ BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32), OffsetReg)
+ .addImm(Offset);
+
+ BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::BUFFER_STORE_DWORD_OFFEN))
+ .addReg(SpillReg, RegState::Kill)
+ .addReg(OffsetReg, RegState::Kill)
+ .addReg(ScratchRsrcReg)
+ .addReg(SPReg)
+ .addImm(0)
+ .addImm(0) // glc
+ .addImm(0) // slc
+ .addImm(0) // tfe
+ .addImm(0) // dlc
+ .addMemOperand(MMO);
+}
+
+static void buildEpilogReload(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ const SIInstrInfo *TII, unsigned SpillReg,
+ unsigned ScratchRsrcReg, unsigned SPReg, int FI) {
+ MachineFunction *MF = MBB.getParent();
+ MachineFrameInfo &MFI = MF->getFrameInfo();
+ int64_t Offset = MFI.getObjectOffset(FI);
+
+ MachineMemOperand *MMO = MF->getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOLoad, 4,
+ MFI.getObjectAlignment(FI));
+
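+ // Same 12-bit MUBUF immediate offset limit as in buildPrologSpill above.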
+ if (isUInt<12>(Offset)) {
+ BuildMI(MBB, I, DebugLoc(),
+ TII->get(AMDGPU::BUFFER_LOAD_DWORD_OFFSET), SpillReg)
+ .addReg(ScratchRsrcReg)
+ .addReg(SPReg)
+ .addImm(Offset)
+ .addImm(0) // glc
+ .addImm(0) // slc
+ .addImm(0) // tfe
+ .addImm(0) // dlc
+ .addMemOperand(MMO);
+ return;
+ }
+
+ MCPhysReg OffsetReg = findScratchNonCalleeSaveRegister(
+ MF->getRegInfo(), LiveRegs, AMDGPU::VGPR_32RegClass);
+
+ BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32), OffsetReg)
+ .addImm(Offset);
+
+ BuildMI(MBB, I, DebugLoc(),
+ TII->get(AMDGPU::BUFFER_LOAD_DWORD_OFFEN), SpillReg)
+ .addReg(OffsetReg, RegState::Kill)
+ .addReg(ScratchRsrcReg)
+ .addReg(SPReg)
+ .addImm(0)
+ .addImm(0) // glc
+ .addImm(0) // slc
+ .addImm(0) // tfe
+ .addImm(0) // dlc
+ .addMemOperand(MMO);
+}
+
void SIFrameLowering::emitFlatScratchInit(const GCNSubtarget &ST,
MachineFunction &MF,
MachineBasicBlock &MBB) const {
}
}
-// Find a scratch register that we can use at the start of the prologue to
-// re-align the stack pointer. We avoid using callee-save registers since they
-// may appear to be free when this is called from canUseAsPrologue (during
-// shrink wrapping), but then no longer be free when this is called from
-// emitPrologue.
-//
-// FIXME: This is a bit conservative, since in the above case we could use one
-// of the callee-save registers as a scratch temp to re-align the stack pointer,
-// but we would then have to make sure that we were in fact saving at least one
-// callee-save register in the prologue, which is additional complexity that
-// doesn't seem worth the benefit.
-static unsigned findScratchNonCalleeSaveRegister(MachineFunction &MF,
- LivePhysRegs &LiveRegs,
- const TargetRegisterClass &RC) {
- MachineRegisterInfo &MRI = MF.getRegInfo();
-
- // Mark callee saved registers as used so we will not choose them.
- const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs();
- for (unsigned i = 0; CSRegs[i]; ++i)
- LiveRegs.addReg(CSRegs[i]);
-
- for (unsigned Reg : RC) {
- if (LiveRegs.available(MRI, Reg))
- return Reg;
- }
-
- return AMDGPU::NoRegister;
-}
-
bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const {
switch (ID) {
case TargetStackID::Default:
}
const MachineFrameInfo &MFI = MF.getFrameInfo();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
const SIRegisterInfo &TRI = TII->getRegisterInfo();
bool HasFP = false;
uint32_t NumBytes = MFI.getStackSize();
uint32_t RoundedSize = NumBytes;
+ // To avoid clobbering VGPRs in lanes that weren't active on function entry,
+ // turn on all lanes before doing the spill to memory.
+ unsigned ScratchExecCopy = AMDGPU::NoRegister;
+
+ // Emit the copy if we need an FP, and are using a free SGPR to save it.
+ if (FuncInfo->SGPRForFPSaveRestoreCopy != AMDGPU::NoRegister) {
+ BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FuncInfo->SGPRForFPSaveRestoreCopy)
+ .addReg(FramePtrReg)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+
+ for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg
+ : FuncInfo->getSGPRSpillVGPRs()) {
+ if (!Reg.FI.hasValue())
+ continue;
+
+ if (ScratchExecCopy == AMDGPU::NoRegister) {
+ if (LiveRegs.empty()) {
+ LiveRegs.init(TRI);
+ LiveRegs.addLiveIns(MBB);
+ if (FuncInfo->SGPRForFPSaveRestoreCopy)
+ LiveRegs.removeReg(FuncInfo->SGPRForFPSaveRestoreCopy);
+ }
+
+ ScratchExecCopy
+ = findScratchNonCalleeSaveRegister(MRI, LiveRegs,
+ *TRI.getWaveMaskRegClass());
+ assert(FuncInfo->SGPRForFPSaveRestoreCopy != ScratchExecCopy);
+
+ const unsigned OrSaveExec = ST.isWave32() ?
+ AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
+ BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec),
+ ScratchExecCopy)
+ .addImm(-1);
+ }
+
+ buildPrologSpill(LiveRegs, MBB, MBBI, TII, Reg.VGPR,
+ FuncInfo->getScratchRSrcReg(),
+ StackPtrReg,
+ Reg.FI.getValue());
+ }
+
+ if (ScratchExecCopy != AMDGPU::NoRegister) {
+ // FIXME: Split block and make terminator.
+ unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+ unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+ BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec)
+ .addReg(ScratchExecCopy, RegState::Kill);
+ LiveRegs.addReg(ScratchExecCopy);
+ }
+
+
+ if (FuncInfo->FramePointerSaveIndex) {
+ const int FI = FuncInfo->FramePointerSaveIndex.getValue();
+ assert(!MFI.isDeadObjectIndex(FI) &&
+ MFI.getStackID(FI) == TargetStackID::SGPRSpill);
+ ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill
+ = FuncInfo->getSGPRToVGPRSpills(FI);
+ assert(Spill.size() == 1);
+
+ // Save FP before setting it up.
+ // FIXME: This should respect spillSGPRToVGPR.
+ BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
+ Spill[0].VGPR)
+ .addReg(FramePtrReg)
+ .addImm(Spill[0].Lane)
+ .addReg(Spill[0].VGPR, RegState::Undef);
+ }
if (TRI.needsStackRealignment(MF)) {
HasFP = true;
const unsigned Alignment = MFI.getMaxAlignment();
RoundedSize += Alignment;
+ if (LiveRegs.empty()) {
+ LiveRegs.init(TRI);
+ LiveRegs.addLiveIns(MBB);
+ LiveRegs.addReg(FuncInfo->SGPRForFPSaveRestoreCopy);
+ }
- LiveRegs.init(TRI);
- LiveRegs.addLiveIns(MBB);
-
- unsigned ScratchSPReg
- = findScratchNonCalleeSaveRegister(MF, LiveRegs,
- AMDGPU::SReg_32_XM0RegClass);
- assert(ScratchSPReg != AMDGPU::NoRegister);
+ unsigned ScratchSPReg = findScratchNonCalleeSaveRegister(
+ MRI, LiveRegs, AMDGPU::SReg_32_XM0RegClass);
+ assert(ScratchSPReg != AMDGPU::NoRegister &&
+ ScratchSPReg != FuncInfo->SGPRForFPSaveRestoreCopy);
// s_add_u32 tmp_reg, s32, NumBytes
// s_and_b32 s32, tmp_reg, 0b111...0000
.setMIFlag(MachineInstr::FrameSetup);
}
- // To avoid clobbering VGPRs in lanes that weren't active on function entry,
- // turn on all lanes before doing the spill to memory.
- unsigned ScratchExecCopy = AMDGPU::NoRegister;
-
- for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg
- : FuncInfo->getSGPRSpillVGPRs()) {
- if (!Reg.FI.hasValue())
- continue;
-
- if (ScratchExecCopy == AMDGPU::NoRegister) {
- if (LiveRegs.empty()) {
- LiveRegs.init(TRI);
- LiveRegs.addLiveIns(MBB);
- }
-
- ScratchExecCopy
- = findScratchNonCalleeSaveRegister(MF, LiveRegs,
- *TRI.getWaveMaskRegClass());
-
- const unsigned OrSaveExec = ST.isWave32() ?
- AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
- BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec),
- ScratchExecCopy)
- .addImm(-1);
- }
-
- TII->storeRegToStackSlot(MBB, MBBI, Reg.VGPR, true,
- Reg.FI.getValue(), &AMDGPU::VGPR_32RegClass,
- &TII->getRegisterInfo());
- }
+ assert((!HasFP || FuncInfo->SGPRForFPSaveRestoreCopy != AMDGPU::NoRegister ||
+ FuncInfo->FramePointerSaveIndex) &&
+ "Needed to save FP but didn't save it anywhere");
- if (ScratchExecCopy != AMDGPU::NoRegister) {
- // FIXME: Split block and make terminator.
- unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
- unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
- BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec)
- .addReg(ScratchExecCopy);
- }
+ assert((HasFP || (FuncInfo->SGPRForFPSaveRestoreCopy == AMDGPU::NoRegister &&
+ !FuncInfo->FramePointerSaveIndex)) &&
+ "Saved FP but didn't need it");
}
void SIFrameLowering::emitEpilogue(MachineFunction &MF,
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
+ LivePhysRegs LiveRegs;
DebugLoc DL;
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ uint32_t NumBytes = MFI.getStackSize();
+ uint32_t RoundedSize = FuncInfo->isStackRealigned() ?
+ NumBytes + MFI.getMaxAlignment() : NumBytes;
+
+ if (RoundedSize != 0 && hasFP(MF)) {
+ const unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg();
+ BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_SUB_U32), StackPtrReg)
+ .addReg(StackPtrReg)
+ .addImm(RoundedSize * ST.getWavefrontSize())
+ .setMIFlag(MachineInstr::FrameDestroy);
+ }
+
+ if (FuncInfo->SGPRForFPSaveRestoreCopy != AMDGPU::NoRegister) {
+ BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FuncInfo->getFrameOffsetReg())
+ .addReg(FuncInfo->SGPRForFPSaveRestoreCopy)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+
+ if (FuncInfo->FramePointerSaveIndex) {
+ const int FI = FuncInfo->FramePointerSaveIndex.getValue();
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+
+ assert(!MFI.isDeadObjectIndex(FI));
+ assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
+ ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill
+ = FuncInfo->getSGPRToVGPRSpills(FI);
+ assert(Spill.size() == 1);
+ BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
+ FuncInfo->getFrameOffsetReg())
+ .addReg(Spill[0].VGPR)
+ .addImm(Spill[0].Lane);
+ }
+
unsigned ScratchExecCopy = AMDGPU::NoRegister;
for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg
: FuncInfo->getSGPRSpillVGPRs()) {
const SIRegisterInfo &TRI = TII->getRegisterInfo();
if (ScratchExecCopy == AMDGPU::NoRegister) {
// See emitPrologue
- LivePhysRegs LiveRegs(*ST.getRegisterInfo());
- LiveRegs.addLiveOuts(MBB);
- LiveRegs.stepBackward(*MBBI);
+ if (LiveRegs.empty()) {
+ LiveRegs.init(*ST.getRegisterInfo());
+ LiveRegs.addLiveOuts(MBB);
+ LiveRegs.stepBackward(*MBBI);
+ }
- ScratchExecCopy
- = findScratchNonCalleeSaveRegister(MF, LiveRegs,
- *TRI.getWaveMaskRegClass());
+ ScratchExecCopy = findScratchNonCalleeSaveRegister(
+ MRI, LiveRegs, *TRI.getWaveMaskRegClass());
+ LiveRegs.removeReg(ScratchExecCopy);
- const unsigned OrSaveExec = ST.isWave32() ?
- AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
+ const unsigned OrSaveExec =
+ ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec), ScratchExecCopy)
.addImm(-1);
}
- TII->loadRegFromStackSlot(MBB, MBBI, Reg.VGPR,
- Reg.FI.getValue(), &AMDGPU::VGPR_32RegClass,
- &TII->getRegisterInfo());
+ buildEpilogReload(LiveRegs, MBB, MBBI, TII, Reg.VGPR,
+ FuncInfo->getScratchRSrcReg(),
+ FuncInfo->getStackPtrOffsetReg(), Reg.FI.getValue());
}
if (ScratchExecCopy != AMDGPU::NoRegister) {
unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec)
- .addReg(ScratchExecCopy);
- }
-
- const MachineFrameInfo &MFI = MF.getFrameInfo();
- uint32_t NumBytes = MFI.getStackSize();
- uint32_t RoundedSize = FuncInfo->isStackRealigned() ?
- NumBytes + MFI.getMaxAlignment() : NumBytes;
-
- if (RoundedSize != 0 && hasFP(MF)) {
- const unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg();
- BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_SUB_U32), StackPtrReg)
- .addReg(StackPtrReg)
- .addImm(RoundedSize * ST.getWavefrontSize())
- .setMIFlag(MachineInstr::FrameDestroy);
+ .addReg(ScratchExecCopy, RegState::Kill);
}
}
// Note SGPRSpill stack IDs should only be used for SGPR spilling to VGPRs, not
-// memory.
+// memory. They should have been removed by now.
static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) {
for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
I != E; ++I) {
return true;
}
+
+#ifndef NDEBUG
+static bool allSGPRSpillsAreDead(const MachineFrameInfo &MFI,
+ Optional<int> FramePointerSaveIndex) {
+ for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
+ I != E; ++I) {
+ if (!MFI.isDeadObjectIndex(I) &&
+ MFI.getStackID(I) == TargetStackID::SGPRSpill &&
+ (!FramePointerSaveIndex || I != FramePointerSaveIndex)) {
+ return false;
+ }
+ }
+
+ return true;
+}
+#endif
+
int SIFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
unsigned &FrameReg) const {
const SIRegisterInfo *RI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
RegScavenger *RS) const {
MachineFrameInfo &MFI = MF.getFrameInfo();
- if (!MFI.hasStackObjects())
- return;
-
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
- const SIInstrInfo *TII = ST.getInstrInfo();
- const SIRegisterInfo &TRI = TII->getRegisterInfo();
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
- FuncInfo->removeSGPRToVGPRFrameIndices(MFI);
+ assert(allSGPRSpillsAreDead(MFI, FuncInfo->FramePointerSaveIndex) &&
+ "SGPR spill should have been removed in SILowerSGPRSpills");
// FIXME: The other checks should be redundant with allStackObjectsAreDead,
// but currently hasNonSpillStackObjects is set only from source
if (FuncInfo->isEntryFunction()) {
int ScavengeFI = MFI.CreateFixedObject(
- TRI.getSpillSize(AMDGPU::SGPR_32RegClass), 0, false);
+ TRI->getSpillSize(AMDGPU::SGPR_32RegClass), 0, false);
RS->addScavengingFrameIndex(ScavengeFI);
} else {
int ScavengeFI = MFI.CreateStackObject(
- TRI.getSpillSize(AMDGPU::SGPR_32RegClass),
- TRI.getSpillAlignment(AMDGPU::SGPR_32RegClass),
+ TRI->getSpillSize(AMDGPU::SGPR_32RegClass),
+ TRI->getSpillAlignment(AMDGPU::SGPR_32RegClass),
false);
RS->addScavengingFrameIndex(ScavengeFI);
}
// Only report VGPRs to generic code.
void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
- BitVector &SavedRegs,
+ BitVector &SavedVGPRs,
RegScavenger *RS) const {
- TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
+ TargetFrameLowering::determineCalleeSaves(MF, SavedVGPRs, RS);
+
+ SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIRegisterInfo *TRI = ST.getRegisterInfo();
- SavedRegs.clearBitsNotInMask(TRI->getAllVGPRRegMask());
- // VGPRs used for SGPR spilling need to be specially inserted in the prolog.
- const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ // Ignore the SGPRs the default implementation found.
+ SavedVGPRs.clearBitsNotInMask(TRI->getAllVGPRRegMask());
+
+ // hasFP only knows about stack objects that already exist. We're now
+ // determining the stack slots that will be created, so we have to predict
+ // them. Stack objects force FP usage with calls.
+ //
+ // Note a new VGPR CSR may be introduced if one is used for the spill, but we
+ // don't want to report it here.
+ //
+ // FIXME: Is this really hasReservedCallFrame?
+ const bool WillHaveFP =
+ FrameInfo.hasCalls() &&
+ (SavedVGPRs.any() || !allStackObjectsAreDead(FrameInfo));
+
+ // VGPRs used for SGPR spilling need to be specially inserted in the prolog,
+ // so don't allow the default insertion to handle them.
for (auto SSpill : MFI->getSGPRSpillVGPRs())
- SavedRegs.reset(SSpill.VGPR);
+ SavedVGPRs.reset(SSpill.VGPR);
+
+ const bool HasFP = WillHaveFP || hasFP(MF);
+ if (!HasFP)
+ return;
+
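+ // Prefer, in order: a free lane in a VGPR already used for SGPR spills, a
+ // copy into a free non-CSR SGPR, and finally a new SGPR-spill stack object
+ // (which may force spilling an additional CSR VGPR).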
+ if (MFI->haveFreeLanesForSGPRSpill(MF, 1)) {
+ int NewFI = MF.getFrameInfo().CreateStackObject(4, 4, true, nullptr,
+ TargetStackID::SGPRSpill);
+
+ // If there is already a VGPR with free lanes, use it. We may already have
+ // to pay the penalty for spilling a CSR VGPR.
+ if (!MFI->allocateSGPRSpillToVGPR(MF, NewFI))
+ llvm_unreachable("allocate SGPR spill should have worked");
+
+ MFI->FramePointerSaveIndex = NewFI;
+
+ LLVM_DEBUG(
+ auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front();
+ dbgs() << "Spilling FP to " << printReg(Spill.VGPR, TRI)
+ << ':' << Spill.Lane << '\n');
+ return;
+ }
+
+ MFI->SGPRForFPSaveRestoreCopy = findUnusedSGPRNonCalleeSaved(MF.getRegInfo());
+
+ if (!MFI->SGPRForFPSaveRestoreCopy) {
+ // There's no free lane to spill, and no free register to save FP, so we're
+ // forced to spill another VGPR to use for the spill.
+ int NewFI = MF.getFrameInfo().CreateStackObject(4, 4, true, nullptr,
+ TargetStackID::SGPRSpill);
+ if (!MFI->allocateSGPRSpillToVGPR(MF, NewFI))
+ llvm_unreachable("allocate SGPR spill should have worked");
+ MFI->FramePointerSaveIndex = NewFI;
+
+ LLVM_DEBUG(
+ auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front();
+ dbgs() << "FP requires fallback spill to " << printReg(Spill.VGPR, TRI)
+ << ':' << Spill.Lane << '\n';);
+ } else {
+ LLVM_DEBUG(dbgs() << "Saving FP with copy to " <<
+ printReg(MFI->SGPRForFPSaveRestoreCopy, TRI) << '\n');
+ }
}
void SIFrameLowering::determineCalleeSavesSGPR(MachineFunction &MF,
SavedRegs.clearBitsInMask(TRI->getAllVGPRRegMask());
}
+bool SIFrameLowering::assignCalleeSavedSpillSlots(
+ MachineFunction &MF, const TargetRegisterInfo *TRI,
+ std::vector<CalleeSavedInfo> &CSI) const {
+ if (CSI.empty())
+ return true; // Early exit if no callee saved registers are modified!
+
+ const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
+ if (!FuncInfo->SGPRForFPSaveRestoreCopy)
+ return false;
+
+ for (auto &CS : CSI) {
+ if (CS.getReg() == FuncInfo->getFrameOffsetReg()) {
+ if (FuncInfo->SGPRForFPSaveRestoreCopy != AMDGPU::NoRegister)
+ CS.setDstReg(FuncInfo->SGPRForFPSaveRestoreCopy);
+ break;
+ }
+ }
+
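+ // Returning false lets the generic code assign spill slots for the remaining
+ // callee-saved registers; only the FP save is redirected to the SGPR copy.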
+ return false;
+}
+
MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr(
MachineFunction &MF,
MachineBasicBlock &MBB,
if (MFI.hasCalls()) {
// All offsets are unsigned, so need to be addressed in the same direction
// as stack growth.
+
+ // FIXME: This function is pretty broken, since it can be called before the
+ // frame layout is determined or CSR spills are inserted.
if (MFI.getStackSize() != 0)
return true;
RegScavenger *RS = nullptr) const override;
void determineCalleeSavesSGPR(MachineFunction &MF, BitVector &SavedRegs,
RegScavenger *RS = nullptr) const;
+ bool
+ assignCalleeSavedSpillSlots(MachineFunction &MF,
+ const TargetRegisterInfo *TRI,
+ std::vector<CalleeSavedInfo> &CSI) const override;
bool isSupportedStackID(TargetStackID::Value ID) const override;
MachineFrameInfo &MFI = MF.getFrameInfo();
SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
- SDValue CallerSavedFP;
-
// Adjust the stack pointer for the new arguments...
// These operations are automatically eliminated by the prolog/epilog pass
if (!IsSibCall) {
= DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
RegsToPass.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
-
- if (!Info->isEntryFunction()) {
- // Avoid clobbering this function's FP value. In the current convention
- // callee will overwrite this, so do save/restore around the call site.
- CallerSavedFP = DAG.getCopyFromReg(Chain, DL,
- Info->getFrameOffsetReg(), MVT::i32);
- CopyFromChains.push_back(CallerSavedFP.getValue(1));
- }
-
Chain = DAG.getTokenFactor(DL, CopyFromChains);
}
Chain = Call.getValue(0);
InFlag = Call.getValue(1);
- if (CallerSavedFP) {
- SDValue FPReg = DAG.getRegister(Info->getFrameOffsetReg(), MVT::i32);
- Chain = DAG.getCopyToReg(Chain, DL, FPReg, CallerSavedFP, InFlag);
- InFlag = Chain.getValue(1);
- }
-
uint64_t CalleePopBytes = NumBytes;
Chain = DAG.getCALLSEQ_END(Chain, DAG.getTargetConstant(0, DL, MVT::i32),
DAG.getTargetConstant(CalleePopBytes, DL, MVT::i32),
// Add the scratch resource registers as implicit uses because we may end up
// needing them, and need to ensure that the reserved registers are
// correctly handled.
-
- FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
+ if (RI.spillSGPRToVGPR())
+ FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
if (ST.hasScalarStores()) {
// m0 is used for offset to scalar stores if used to spill.
Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine | RegState::Dead);
MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0RegClass);
}
- FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
+ if (RI.spillSGPRToVGPR())
+ FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc, DestReg)
.addFrameIndex(FrameIndex) // addr
.addMemOperand(MMO)
// required for scratch access.
ScratchRSrcReg = AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3;
ScratchWaveOffsetReg = AMDGPU::SGPR33;
- FrameOffsetReg = AMDGPU::SGPR5;
+
+ // TODO: Pick a high register, and shift down, similar to a kernel.
+ FrameOffsetReg = AMDGPU::SGPR34;
StackPtrOffsetReg = AMDGPU::SGPR32;
ArgInfo.PrivateSegmentBuffer =
return false;
}
+/// \returns true if \p NumNeed lanes are available in VGPRs already used for
+/// SGPR spilling.
+//
+// FIXME: This only works after processFunctionBeforeFrameFinalized
+bool SIMachineFunctionInfo::haveFreeLanesForSGPRSpill(const MachineFunction &MF,
+ unsigned NumNeed) const {
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ unsigned WaveSize = ST.getWavefrontSize();
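+ // Each VGPR reserved for SGPR spilling provides one lane per wavefront slot;
+ // check that the lanes used so far plus the requested count still fit.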
+ return NumVGPRSpillLanes + NumNeed <= WaveSize * SpillVGPRs.size();
+}
+
/// Reserve a slice of a VGPR to support spilling for FrameIndex \p FI.
bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF,
int FI) {
}
void SIMachineFunctionInfo::removeSGPRToVGPRFrameIndices(MachineFrameInfo &MFI) {
- for (auto &R : SGPRToVGPRSpills)
- MFI.RemoveStackObject(R.first);
- // All other SPGRs must be allocated on the default stack, so reset
- // the stack ID.
- for (unsigned i = MFI.getObjectIndexBegin(), e = MFI.getObjectIndexEnd();
- i != e; ++i)
- MFI.setStackID(i, 0);
+ // The FP spill hasn't been inserted yet, so keep it around.
+ for (auto &R : SGPRToVGPRSpills) {
+ if (R.first != FramePointerSaveIndex)
+ MFI.RemoveStackObject(R.first);
+ }
+
+ // All other SGPRs must be allocated on the default stack, so reset the stack
+ // ID.
+ for (int i = MFI.getObjectIndexBegin(), e = MFI.getObjectIndexEnd(); i != e;
+ ++i)
+ if (i != FramePointerSaveIndex)
+ MFI.setStackID(i, TargetStackID::Default);
}
MCPhysReg SIMachineFunctionInfo::getNextUserSGPR() const {
unsigned NumVGPRSpillLanes = 0;
SmallVector<SGPRSpillVGPRCSR, 2> SpillVGPRs;
+public: // FIXME
+ /// If non-zero, the SGPR used to save and restore the frame pointer register.
+ unsigned SGPRForFPSaveRestoreCopy = 0;
+ Optional<int> FramePointerSaveIndex;
+
public:
SIMachineFunctionInfo(const MachineFunction &MF);
return Mode;
}
+ bool haveFreeLanesForSGPRSpill(const MachineFunction &MF,
+ unsigned NumNeed) const;
bool allocateSGPRSpillToVGPR(MachineFunction &MF, int FI);
void removeSGPRToVGPRFrameIndices(MachineFrameInfo &MFI);
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: ds_read_b32 v2, v0
-; GCN-NEXT: s_mov_b64 s[6:7], 0
+; GCN-NEXT: s_mov_b64 s[4:5], 0
; GCN-NEXT: BB0_1: ; %atomicrmw.start
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_wbinvl1_vol
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
; GCN-NEXT: v_mov_b32_e32 v2, v1
-; GCN-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GCN-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GCN-NEXT: s_cbranch_execnz BB0_1
; GCN-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN-NEXT: s_or_b64 exec, exec, s[6:7]
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw nand i32 addrspace(3)* %ptr, i32 4 seq_cst
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: global_load_dword v3, v[0:1], off
-; GCN-NEXT: s_mov_b64 s[6:7], 0
+; GCN-NEXT: s_mov_b64 s[4:5], 0
; GCN-NEXT: BB1_1: ; %atomicrmw.start
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_not_b32_e32 v2, v3
; GCN-NEXT: v_or_b32_e32 v2, -5, v2
-; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
-; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-NEXT: buffer_wbinvl1_vol
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GCN-NEXT: v_mov_b32_e32 v3, v2
-; GCN-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GCN-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GCN-NEXT: s_cbranch_execnz BB1_1
; GCN-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN-NEXT: s_or_b64 exec, exec, s[6:7]
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v0, v2
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw nand i32 addrspace(1)* %ptr, i32 4 seq_cst
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: flat_load_dword v3, v[0:1]
-; GCN-NEXT: s_mov_b64 s[6:7], 0
+; GCN-NEXT: s_mov_b64 s[4:5], 0
; GCN-NEXT: BB2_1: ; %atomicrmw.start
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-NEXT: v_not_b32_e32 v2, v3
; GCN-NEXT: v_or_b32_e32 v2, -5, v2
-; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
-; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-NEXT: buffer_wbinvl1_vol
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GCN-NEXT: v_mov_b32_e32 v3, v2
-; GCN-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GCN-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GCN-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GCN-NEXT: s_cbranch_execnz BB2_1
; GCN-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN-NEXT: s_or_b64 exec, exec, s[6:7]
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v0, v2
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw nand i32* %ptr, i32 4 seq_cst
}
; GCN-LABEL: {{^}}void_func_byval_struct_non_leaf:
-; GCN: s_mov_b32 s5, s32
-; GCN: s_add_u32 s32, s32, 0xc00{{$}}
-; GCN-DAG: buffer_store_dword v32
-; GCN-DAG: buffer_store_dword v33
+; GCN: buffer_store_dword v33, off, s[0:3], s32 offset:36
+; GCN-DAG: v_writelane_b32 v33, s34,
+; GCN: s_mov_b32 s34, s32
+; GCN-DAG: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s34{{$}}
+; GCN-DAG: s_add_u32 s32, s32, 0xc00{{$}}
+; GCN-DAG: buffer_store_dword v32, off, s[0:3], s34 offset:32
; GCN-NOT: v_writelane_b32 v{{[0-9]+}}, s32
-; GCN-DAG: v_writelane_b32
-; GCN-DAG: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s5{{$}}
+
; GCN-DAG: v_add_{{[iu]}}32_e32 [[ADD0:v[0-9]+]], vcc, 1, [[LOAD0]]
-; GCN-DAG: buffer_store_dword [[ADD0]], off, s[0:3], s5{{$}}
+; GCN: buffer_store_dword [[ADD0]], off, s[0:3], s34{{$}}
-; GCN-DAG: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s5 offset:16{{$}}
+; GCN-DAG: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s34 offset:16{{$}}
; GCN-DAG: v_add_{{[iu]}}32_e32 [[ADD1:v[0-9]+]], vcc, 2, [[LOAD1]]
; GCN: s_swappc_b64
-; GCN: buffer_store_dword [[ADD1]], off, s[0:3], s5 offset:16{{$}}
+; GCN: buffer_store_dword [[ADD1]], off, s[0:3], s34 offset:16{{$}}
; GCN: v_readlane_b32
; GCN-NOT: v_readlane_b32 s32
-; GCN-DAG: buffer_load_dword v32,
-; GCN-DAG: buffer_load_dword v33,
+; GCN-DAG: buffer_load_dword v32, off, s[0:3], s34 offset:32
; GCN: s_sub_u32 s32, s32, 0xc00{{$}}
+; GCN: v_readlane_b32 s34, v33,
+; GCN-DAG: buffer_load_dword v33, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
; GCN: s_setpc_b64
define void @void_func_byval_struct_non_leaf(%struct.ByValStruct addrspace(5)* byval noalias nocapture align 4 %arg0, %struct.ByValStruct addrspace(5)* byval noalias nocapture align 4 %arg1) #1 {
entry:
}
; GCN-LABEL: {{^}}call_void_func_byval_struct_func:
-; GCN: s_mov_b32 s5, s32
+; GCN: s_mov_b32 s34, s32
; GCN-DAG: s_add_u32 s32, s32, 0xc00{{$}}
; GCN-DAG: v_writelane_b32
; GCN-DAG: v_mov_b32_e32 [[NINE:v[0-9]+]], 9
; GCN-DAG: v_mov_b32_e32 [[THIRTEEN:v[0-9]+]], 13
-; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s5{{$}}
-; GCN-DAG: buffer_store_dword [[THIRTEEN]], off, s[0:3], s5 offset:16
+; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s34{{$}}
+; GCN-DAG: buffer_store_dword [[THIRTEEN]], off, s[0:3], s34 offset:16
-; GCN-DAG: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s5{{$}}
-; GCN-DAG: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s5 offset:4
-; GCN-DAG: buffer_load_dword [[LOAD2:v[0-9]+]], off, s[0:3], s5 offset:8
-; GCN-DAG: buffer_load_dword [[LOAD3:v[0-9]+]], off, s[0:3], s5 offset:12
+; GCN-DAG: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s34{{$}}
+; GCN-DAG: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s34 offset:4
+; GCN-DAG: buffer_load_dword [[LOAD2:v[0-9]+]], off, s[0:3], s34 offset:8
+; GCN-DAG: buffer_load_dword [[LOAD3:v[0-9]+]], off, s[0:3], s34 offset:12
; GCN-NOT: s_add_u32 s32, s32, 0x800
; GCN-DAG: buffer_store_dword [[LOAD2]], off, s[0:3], s32 offset:8
; GCN-DAG: buffer_store_dword [[LOAD3]], off, s[0:3], s32 offset:12
-; GCN: buffer_load_dword [[LOAD4:v[0-9]+]], off, s[0:3], s5 offset:16
-; GCN: buffer_load_dword [[LOAD5:v[0-9]+]], off, s[0:3], s5 offset:20
-; GCN: buffer_load_dword [[LOAD6:v[0-9]+]], off, s[0:3], s5 offset:24
-; GCN: buffer_load_dword [[LOAD7:v[0-9]+]], off, s[0:3], s5 offset:28
+; GCN: buffer_load_dword [[LOAD4:v[0-9]+]], off, s[0:3], s34 offset:16
+; GCN: buffer_load_dword [[LOAD5:v[0-9]+]], off, s[0:3], s34 offset:20
+; GCN: buffer_load_dword [[LOAD6:v[0-9]+]], off, s[0:3], s34 offset:24
+; GCN: buffer_load_dword [[LOAD7:v[0-9]+]], off, s[0:3], s34 offset:28
; GCN-DAG: buffer_store_dword [[LOAD4]], off, s[0:3], s32 offset:16
; GCN-DAG: buffer_store_dword [[LOAD5]], off, s[0:3], s32 offset:20
; GCN-NOT: s_sub_u32 s32, s32, 0x800
; GCN: s_sub_u32 s32, s32, 0xc00{{$}}
-; GCN-NEXT: s_waitcnt
-; GCN-NEXT: s_setpc_b64
+; GCN: v_readlane_b32 s34, v
+; GCN: s_waitcnt
+; GCN: s_setpc_b64
define void @call_void_func_byval_struct_func() #1 {
entry:
%arg0 = alloca %struct.ByValStruct, align 4, addrspace(5)
}
; GCN-LABEL: {{^}}call_void_func_byval_struct_align8_func:
-; GCN: s_mov_b32 s5, s32
+; GCN: s_mov_b32 s34, s32
; GCN-DAG: s_add_u32 s32, s32, 0xc00{{$}}
; GCN-DAG: v_writelane_b32
; GCN-DAG: v_mov_b32_e32 [[NINE:v[0-9]+]], 9
; GCN-DAG: v_mov_b32_e32 [[THIRTEEN:v[0-9]+]], 13
-; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s5{{$}}
-; GCN-DAG: buffer_store_dword [[THIRTEEN]], off, s[0:3], s5 offset:16
+; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s34{{$}}
+; GCN-DAG: buffer_store_dword [[THIRTEEN]], off, s[0:3], s34 offset:16
-; GCN-DAG: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s5{{$}}
-; GCN-DAG: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s5 offset:4
-; GCN-DAG: buffer_load_dword [[LOAD2:v[0-9]+]], off, s[0:3], s5 offset:8
-; GCN-DAG: buffer_load_dword [[LOAD3:v[0-9]+]], off, s[0:3], s5 offset:12
+; GCN-DAG: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s34{{$}}
+; GCN-DAG: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s34 offset:4
+; GCN-DAG: buffer_load_dword [[LOAD2:v[0-9]+]], off, s[0:3], s34 offset:8
+; GCN-DAG: buffer_load_dword [[LOAD3:v[0-9]+]], off, s[0:3], s34 offset:12
; GCN-NOT: s_add_u32 s32, s32, 0x800
; GCN-DAG: buffer_store_dword [[LOAD2]], off, s[0:3], s32 offset:8
; GCN-DAG: buffer_store_dword [[LOAD3]], off, s[0:3], s32 offset:12
-; GCN: buffer_load_dword [[LOAD4:v[0-9]+]], off, s[0:3], s5 offset:16
-; GCN: buffer_load_dword [[LOAD5:v[0-9]+]], off, s[0:3], s5 offset:20
-; GCN: buffer_load_dword [[LOAD6:v[0-9]+]], off, s[0:3], s5 offset:24
-; GCN: buffer_load_dword [[LOAD7:v[0-9]+]], off, s[0:3], s5 offset:28
+; GCN: buffer_load_dword [[LOAD4:v[0-9]+]], off, s[0:3], s34 offset:16
+; GCN: buffer_load_dword [[LOAD5:v[0-9]+]], off, s[0:3], s34 offset:20
+; GCN: buffer_load_dword [[LOAD6:v[0-9]+]], off, s[0:3], s34 offset:24
+; GCN: buffer_load_dword [[LOAD7:v[0-9]+]], off, s[0:3], s34 offset:28
; GCN: s_waitcnt vmcnt(0)
; GCN-DAG: buffer_store_dword [[LOAD4]], off, s[0:3], s32 offset:16
; GCN-DAG: buffer_store_dword [[LOAD5]], off, s[0:3], s32 offset:20
; GCN-NOT: s_sub_u32 s32, s32, 0x800
; GCN: s_sub_u32 s32, s32, 0xc00{{$}}
-; GCN-NEXT: s_waitcnt
+; GCN: v_readlane_b32 s34, v
+; GCN: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @call_void_func_byval_struct_align8_func() #0 {
entry:
ret void
}
-declare void @external_void_func_void() #0
+declare hidden void @external_void_func_void() #0
declare void @llvm.lifetime.start.p5i8(i64, i8 addrspace(5)* nocapture) #3
declare void @llvm.lifetime.end.p5i8(i64, i8 addrspace(5)* nocapture) #3
}
; GCN-LABEL: {{^}}indirect_use_vcc:
-; GCN: v_writelane_b32 v32, s34, 0
-; GCN: v_writelane_b32 v32, s35, 1
-; GCN: v_writelane_b32 v32, s36, 2
+; GCN: v_writelane_b32 v32, s34, 2
+; GCN: v_writelane_b32 v32, s36, 0
+; GCN: v_writelane_b32 v32, s37, 1
; GCN: s_swappc_b64
-; GCN: v_readlane_b32 s36, v32, 2
-; GCN: v_readlane_b32 s35, v32, 1
-; GCN: v_readlane_b32 s34, v32, 0
-; GCN: ; NumSgprs: 39
+; GCN: v_readlane_b32 s37, v32, 1
+; GCN: v_readlane_b32 s36, v32, 0
+; GCN: v_readlane_b32 s34, v32, 2
+; GCN: ; NumSgprs: 40
; GCN: ; NumVgprs: 33
define void @indirect_use_vcc() #1 {
call void @use_vcc()
; GCN-LABEL: {{^}}indirect_2level_use_vcc_kernel:
; GCN: is_dynamic_callstack = 0
-; CI: ; NumSgprs: 41
-; VI-NOBUG: ; NumSgprs: 43
+; CI: ; NumSgprs: 42
+; VI-NOBUG: ; NumSgprs: 44
; VI-BUG: ; NumSgprs: 96
; GCN: ; NumVgprs: 33
define amdgpu_kernel void @indirect_2level_use_vcc_kernel(i32 addrspace(1)* %out) #0 {
}
; GCN-LABEL: {{^}}indirect_use_flat_scratch:
-; CI: ; NumSgprs: 41
-; VI: ; NumSgprs: 43
+; CI: ; NumSgprs: 42
+; VI: ; NumSgprs: 44
; GCN: ; NumVgprs: 33
define void @indirect_use_flat_scratch() #1 {
call void @use_flat_scratch()
; GCN-LABEL: {{^}}indirect_2level_use_flat_scratch_kernel:
; GCN: is_dynamic_callstack = 0
-; CI: ; NumSgprs: 41
-; VI-NOBUG: ; NumSgprs: 43
+; CI: ; NumSgprs: 42
+; VI-NOBUG: ; NumSgprs: 44
; VI-BUG: ; NumSgprs: 96
; GCN: ; NumVgprs: 33
define amdgpu_kernel void @indirect_2level_use_flat_scratch_kernel(i32 addrspace(1)* %out) #0 {
}
; GCN-LABEL: {{^}}test_func_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void:
-; GCN: v_writelane_b32 v32, s34, 0
-; GCN: v_writelane_b32 v32, s35, 1
-; GCN: v_writelane_b32 v32, s36, 2
-; GCN: v_writelane_b32 v32, s37, 3
-; GCN: v_writelane_b32 v32, s38, 4
+; GCN: buffer_store_dword
+; GCN: v_writelane_b32 v32, s34, 4
+; GCN: v_writelane_b32 v32, s36, 0
+; GCN: v_writelane_b32 v32, s37, 1
+; GCN: v_writelane_b32 v32, s38, 2
-; GCN: s_mov_b32 [[COPY_FP:s[0-9]+]], s5
-; GCN-NEXT: s_swappc_b64
-; GCN-NEXT: s_mov_b32 s5, [[COPY_FP]]
-; GCN-NEXT: s_mov_b32 [[COPY_FP]], s5
+; GCN: s_swappc_b64
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: s_swappc_b64
-; GCN-DAG: s_mov_b32 s5, [[COPY_FP]]
-; GCN-DAG: v_readlane_b32 s38, v32, 4
-; GCN: v_readlane_b32 s37, v32, 3
-; GCN: v_readlane_b32 s36, v32, 2
-; GCN: v_readlane_b32 s35, v32, 1
-; GCN: v_readlane_b32 s34, v32, 0
+; GCN-DAG: v_readlane_b32 s39, v32, 3
+; GCN-DAG: v_readlane_b32 s38, v32, 2
+; GCN: v_readlane_b32 s37, v32, 1
+; GCN: v_readlane_b32 s36, v32, 0
+
+; GCN: v_readlane_b32 s34, v32, 4
+; GCN: buffer_load_dword
; GCN: s_setpc_b64
define void @test_func_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void() #0 {
call void @external_void_func_void()
ret void
}
-; FIXME: Avoid extra restore of FP in between calls.
; GCN-LABEL: {{^}}test_func_call_external_void_funcx2:
-; GCN: s_mov_b32 [[COPY_FP:s[0-9]+]], s5
-; GCN-NEXT: s_swappc_b64
-; GCN-NEXT: s_mov_b32 s5, [[COPY_FP]]
-; GCN-NEXT: s_mov_b32 [[COPY_FP]], s5
+; GCN: buffer_store_dword v32
+; GCN: v_writelane_b32 v32, s34, 4
+
+; GCN: s_mov_b32 s34, s32
+; GCN: s_add_u32 s32, s32, 0x400
+; GCN: s_swappc_b64
; GCN-NEXT: s_swappc_b64
-; GCN: s_mov_b32 s5, [[COPY_FP]]
+
+; GCN: v_readlane_b32 s34, v32, 4
+; GCN: buffer_load_dword v32,
define void @test_func_call_external_void_funcx2() #0 {
call void @external_void_func_void()
call void @external_void_func_void()
; GCN-LABEL: tailcall_got_load:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_getpc_b64 s[6:7]
-; GCN-NEXT: s_add_u32 s6, s6, got.func@gotpcrel32@lo+4
-; GCN-NEXT: s_addc_u32 s7, s7, got.func@gotpcrel32@hi+4
-; GCN-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0
+; GCN-NEXT: s_getpc_b64 s[4:5]
+; GCN-NEXT: s_add_u32 s4, s4, got.func@gotpcrel32@lo+4
+; GCN-NEXT: s_addc_u32 s5, s5, got.func@gotpcrel32@hi+4
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_setpc_b64 s[6:7]
+; GCN-NEXT: s_setpc_b64 s[4:5]
tail call void @got.func(i32 0)
ret void
}
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: ds_read_b32 v0, v0
-; GCN-NEXT: s_getpc_b64 s[6:7]
-; GCN-NEXT: s_add_u32 s6, s6, func@rel32@lo+4
-; GCN-NEXT: s_addc_u32 s7, s7, func@rel32@hi+4
-; GCN-NEXT: s_setpc_b64 s[6:7]
+; GCN-NEXT: s_getpc_b64 s[4:5]
+; GCN-NEXT: s_add_u32 s4, s4, func@rel32@lo+4
+; GCN-NEXT: s_addc_u32 s5, s5, func@rel32@hi+4
+; GCN-NEXT: s_setpc_b64 s[4:5]
%vgpr = load volatile i32, i32 addrspace(3)* %ptr
tail call void @func(i32 %vgpr)
ret void
; GCN-LABEL: {{^}}callee_no_stack_no_fp_elim_all:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt
-; GCN-NEXT: s_mov_b32 s5, s32
+; GCN-NEXT: s_mov_b32 s4, s34
+; GCN-NEXT: s_mov_b32 s34, s32
+; GCN-NEXT: s_mov_b32 s34, s4
; GCN-NEXT: s_setpc_b64
define void @callee_no_stack_no_fp_elim_all() #1 {
ret void
ret void
}
+; Can use free call clobbered register to preserve original FP value.
+
; GCN-LABEL: {{^}}callee_with_stack_no_fp_elim_all:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt
-; GCN-NEXT: s_mov_b32 s5, s32
+; GCN-NEXT: s_mov_b32 s4, s34
+; GCN-NEXT: s_mov_b32 s34, s32
; GCN-NEXT: s_add_u32 s32, s32, 0x200
; GCN-NEXT: v_mov_b32_e32 v0, 0{{$}}
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s5 offset:4{{$}}
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s34 offset:4{{$}}
; GCN-NEXT: s_sub_u32 s32, s32, 0x200
+; GCN-NEXT: s_mov_b32 s34, s4
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define void @callee_with_stack_no_fp_elim_all() #1 {
; GCN-LABEL: {{^}}callee_with_stack_and_call:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt
-; GCN: s_mov_b32 s5, s32
+; GCN: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}}
+; GCN-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
+; GCN: v_writelane_b32 [[CSR_VGPR]], s34, 2
+; GCN-DAG: s_mov_b32 s34, s32
; GCN-DAG: s_add_u32 s32, s32, 0x400{{$}}
-; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:4
-
-; GCN-DAG: v_writelane_b32 v32, s34,
-; GCN-DAG: v_writelane_b32 v32, s35,
-; GCN-DAG: v_writelane_b32 v32, s36,
-; GCN-DAG: v_mov_b32_e32 v0, 0{{$}}
-; GCN-DAG: buffer_store_dword v0, off, s[0:3], s5{{$}}
-; GCN-DAG: s_mov_b32 [[COPY_FP:s[0-9]+]], s5
+; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
+; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s36,
+; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s37,
+; GCN-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s34{{$}}
; GCN: s_swappc_b64
-; GCN-DAG: s_mov_b32 s5, [[COPY_FP]]
-; GCN-DAG: v_readlane_b32 s34,
-; GCN-DAG: v_readlane_b32 s35,
-; GCN-DAG: v_readlane_b32 s36,
-; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4
-; GCN: s_waitcnt
+
+; GCN-DAG: v_readlane_b32 s36, [[CSR_VGPR]]
+; GCN-DAG: v_readlane_b32 s37, [[CSR_VGPR]]
+
+; GCN: s_sub_u32 s32, s32, 0x400{{$}}
+; GCN-NEXT: v_readlane_b32 s34, [[CSR_VGPR]], 2
+; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
+; GCN-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+
; GCN-NEXT: s_setpc_b64
define void @callee_with_stack_and_call() #0 {
%alloca = alloca i32, addrspace(5)
; GCN-LABEL: {{^}}callee_no_stack_with_call:
; GCN: s_waitcnt
-; GCN: s_mov_b32 s5, s32
-; GCN: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}}
-; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 ; 4-byte Folded Spill
+; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}}
+; GCN-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
-; GCN-DAG: v_writelane_b32 v32, s34, 0
-; GCN-DAG: v_writelane_b32 v32, s35, 1
-; GCN-DAG: v_writelane_b32 v32, s36, 2
-; GCN-DAG: s_mov_b32 [[COPY_FP:s[0-9]+]], s5
-; GCN: s_swappc_b64
-; GCN: s_mov_b32 s5, [[COPY_FP]]
+; GCN-DAG: s_add_u32 s32, s32, 0x400
+; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s34, [[FP_SPILL_LANE:[0-9]+]]
-; GCN-DAG: v_readlane_b32 s34, v32, 0
-; GCN-DAG: v_readlane_b32 s35, v32, 1
-; GCN-DAG: v_readlane_b32 s36, v32, 2
+; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s36, 0
+; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s37, 1
+; GCN: s_swappc_b64
-; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s5 ; 4-byte Folded Reload
-; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
+; GCN-DAG: v_readlane_b32 s36, v32, 0
+; GCN-DAG: v_readlane_b32 s37, v32, 1
; GCN: s_sub_u32 s32, s32, 0x400
-; GCN: s_setpc_b64
+; GCN-NEXT: v_readlane_b32 s34, [[CSR_VGPR]], [[FP_SPILL_LANE]]
+; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
+; GCN-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 ; 4-byte Folded Reload
+; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_setpc_b64
define void @callee_no_stack_with_call() #0 {
call void @external_void_func_void()
ret void
}
-declare void @external_void_func_void() #0
+declare hidden void @external_void_func_void() #0
-; Make sure if a CSR vgpr is used for SGPR spilling, it is saved and restored
+; Make sure if a CSR vgpr is used for SGPR spilling, it is saved and
+; restored. No FP is required.
+;
; GCN-LABEL: {{^}}callee_func_sgpr_spill_no_calls:
; GCN: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}}
-; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
+; GCN: v_writelane_b32 [[CSR_VGPR]], s
+; GCN: v_writelane_b32 [[CSR_VGPR]], s
-; GCN: v_writelane_b32 v32
; GCN: ;;#ASMSTART
-; GCN: v_readlane_b32 s{{[0-9]+}}, v32
+; GCN: v_readlane_b32 s{{[0-9]+}}, [[CSR_VGPR]]
+; GCN: v_readlane_b32 s{{[0-9]+}}, [[CSR_VGPR]]
; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
-
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @callee_func_sgpr_spill_no_calls(i32 %in) #0 {
ret void
}
+; TODO: Can the SP inc/dec be removed?
+; GCN-LABEL: {{^}}callee_with_stack_no_fp_elim_csr_vgpr:
+; GCN: s_waitcnt
+; GCN-NEXT: s_mov_b32 [[FP_COPY:s[0-9]+]], s34
+; GCN-NEXT: s_mov_b32 s34, s32
+; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
+; GCN-DAG: buffer_store_dword v33, off, s[0:3], s34 ; 4-byte Folded Spill
+; GCN-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s34 offset:8
+
+; GCN: ;;#ASMSTART
+; GCN-NEXT: ; clobber v33
+; GCN-NEXT: ;;#ASMEND
+
+; GCN: buffer_load_dword v33, off, s[0:3], s34 ; 4-byte Folded Reload
+; GCN: s_add_u32 s32, s32, 0x300
+; GCN-NEXT: s_sub_u32 s32, s32, 0x300
+; GCN-NEXT: s_mov_b32 s34, s4
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_setpc_b64
+define void @callee_with_stack_no_fp_elim_csr_vgpr() #1 {
+ %alloca = alloca i32, addrspace(5)
+ store volatile i32 0, i32 addrspace(5)* %alloca
+ call void asm sideeffect "; clobber v33", "~{v33}"()
+ ret void
+}
+
+; Use a copy to a free SGPR instead of introducing a second CSR VGPR.
+; GCN-LABEL: {{^}}last_lane_vgpr_for_fp_csr:
+; GCN: s_waitcnt
+; GCN-NEXT: v_writelane_b32 v1, s34, 63
+; GCN-NEXT: s_mov_b32 s34, s32
+; GCN: buffer_store_dword v33, off, s[0:3], s34 ; 4-byte Folded Spill
+; GCN-COUNT-63: v_writelane_b32 v1
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s34 offset:8
+; GCN: ;;#ASMSTART
+; GCN-COUNT-63: v_readlane_b32 s{{[0-9]+}}, v1
+
+; GCN: s_add_u32 s32, s32, 0x300
+; GCN-NEXT: s_sub_u32 s32, s32, 0x300
+; GCN-NEXT: v_readlane_b32 s34, v1, 63
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_setpc_b64
+define void @last_lane_vgpr_for_fp_csr() #1 {
+ %alloca = alloca i32, addrspace(5)
+ store volatile i32 0, i32 addrspace(5)* %alloca
+ call void asm sideeffect "; clobber v33", "~{v33}"()
+ call void asm sideeffect "",
+ "~{s40},~{s41},~{s42},~{s43},~{s44},~{s45},~{s46},~{s47},~{s48},~{s49}
+ ,~{s50},~{s51},~{s52},~{s53},~{s54},~{s55},~{s56},~{s57},~{s58},~{s59}
+ ,~{s60},~{s61},~{s62},~{s63},~{s64},~{s65},~{s66},~{s67},~{s68},~{s69}
+ ,~{s70},~{s71},~{s72},~{s73},~{s74},~{s75},~{s76},~{s77},~{s78},~{s79}
+ ,~{s80},~{s81},~{s82},~{s83},~{s84},~{s85},~{s86},~{s87},~{s88},~{s89}
+ ,~{s90},~{s91},~{s92},~{s93},~{s94},~{s95},~{s96},~{s97},~{s98},~{s99}
+ ,~{s100},~{s101},~{s102}"() #1
+
+ ret void
+}
+
+; Use a copy to a free SGPR instead of introducing a second CSR VGPR.
+; GCN-LABEL: {{^}}no_new_vgpr_for_fp_csr:
+; GCN: s_waitcnt
+; GCN-NEXT: s_mov_b32 [[FP_COPY:s[0-9]+]], s34
+; GCN-NEXT: s_mov_b32 s34, s32
+; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s34 ; 4-byte Folded Spill
+; GCN-COUNT-64: v_writelane_b32 v1,
+
+; GCN: buffer_store_dword
+; GCN: ;;#ASMSTART
+; GCN-COUNT-64: v_readlane_b32 s{{[0-9]+}}, v1
+
+; GCN: buffer_load_dword v33, off, s[0:3], s34 ; 4-byte Folded Reload
+; GCN: s_add_u32 s32, s32, 0x300
+; GCN-NEXT: s_sub_u32 s32, s32, 0x300
+; GCN-NEXT: s_mov_b32 s34, [[FP_COPY]]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_setpc_b64
+define void @no_new_vgpr_for_fp_csr() #1 {
+ %alloca = alloca i32, addrspace(5)
+ store volatile i32 0, i32 addrspace(5)* %alloca
+ call void asm sideeffect "; clobber v33", "~{v33}"()
+ call void asm sideeffect "",
+ "~{s39},~{s40},~{s41},~{s42},~{s43},~{s44},~{s45},~{s46},~{s47},~{s48},~{s49}
+ ,~{s50},~{s51},~{s52},~{s53},~{s54},~{s55},~{s56},~{s57},~{s58},~{s59}
+ ,~{s60},~{s61},~{s62},~{s63},~{s64},~{s65},~{s66},~{s67},~{s68},~{s69}
+ ,~{s70},~{s71},~{s72},~{s73},~{s74},~{s75},~{s76},~{s77},~{s78},~{s79}
+ ,~{s80},~{s81},~{s82},~{s83},~{s84},~{s85},~{s86},~{s87},~{s88},~{s89}
+ ,~{s90},~{s91},~{s92},~{s93},~{s94},~{s95},~{s96},~{s97},~{s98},~{s99}
+ ,~{s100},~{s101},~{s102}"() #1
+
+ ret void
+}
+
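+; The over-aligned alloca forces stack realignment, so an FP is required even
+; without spills; it is preserved with a copy to a free SGPR.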
+; GCN-LABEL: {{^}}realign_stack_no_fp_elim:
+; GCN: s_waitcnt
+; GCN-NEXT: s_add_u32 [[SCRATCH:s[0-9]+]], s32, 0x7ffc0
+; GCN-NEXT: s_mov_b32 s4, s34
+; GCN-NEXT: s_and_b32 s34, [[SCRATCH]], 0xfff80000
+; GCN-NEXT: s_add_u32 s32, s32, 0x100000
+; GCN-NEXT: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
+; GCN-NEXT: buffer_store_dword [[ZERO]], off, s[0:3], s34
+; GCN-NEXT: s_sub_u32 s32, s32, 0x100000
+; GCN-NEXT: s_mov_b32 s34, s4
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_setpc_b64
+define void @realign_stack_no_fp_elim() #1 {
+ %alloca = alloca i32, align 8192, addrspace(5)
+ store volatile i32 0, i32 addrspace(5)* %alloca
+ ret void
+}
+
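+; All call-clobbered SGPRs are used by the inline asm, so there is no free
+; SGPR to hold the FP copy and it is written to a VGPR lane instead.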
+; GCN-LABEL: {{^}}no_unused_non_csr_sgpr_for_fp:
+; GCN: s_waitcnt
+; GCN-NEXT: v_writelane_b32 v1, s34, 0
+; GCN-NEXT: s_mov_b32 s34, s32
+; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
+; GCN: buffer_store_dword [[ZERO]], off, s[0:3], s34 offset:4
+; GCN: ;;#ASMSTART
+; GCN: s_add_u32 s32, s32, 0x200
+; GCN-NEXT: s_mov_b64 s[30:31], vcc
+; GCN-NEXT: s_sub_u32 s32, s32, 0x200
+; GCN-NEXT: v_readlane_b32 s34, v1, 0
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+define void @no_unused_non_csr_sgpr_for_fp() #1 {
+ %alloca = alloca i32, addrspace(5)
+ store volatile i32 0, i32 addrspace(5)* %alloca
+
+ ; Use all clobberable registers, so FP has to spill to a VGPR.
+ call void asm sideeffect "",
+ "~{s0},~{s1},~{s2},~{s3},~{s4},~{s5},~{s6},~{s7},~{s8},~{s9}
+ ,~{s10},~{s11},~{s12},~{s13},~{s14},~{s15},~{s16},~{s17},~{s18},~{s19}
+ ,~{s20},~{s21},~{s22},~{s23},~{s24},~{s25},~{s26},~{s27},~{s28},~{s29}
+ ,~{s30},~{s31}"() #0
+
+ ret void
+}
+
+; Need a new CSR VGPR to satisfy the FP spill.
+; GCN-LABEL: {{^}}no_unused_non_csr_sgpr_for_fp_no_scratch_vgpr:
+; GCN: s_waitcnt
+; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}}
+; GCN-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
+; GCN-NEXT: v_writelane_b32 v32, s34, 0
+; GCN-NEXT: s_mov_b32 s34, s32
+; GCN: s_add_u32 s32, s32, 0x300{{$}}
+
+; GCN-DAG: s_mov_b64 vcc, s[30:31]
+; GCN-DAG: buffer_store_dword
+
+; GCN: ;;#ASMSTART
+; GCN: s_mov_b64 s[30:31], vcc
+
+; GCN: s_sub_u32 s32, s32, 0x300{{$}}
+; GCN-NEXT: v_readlane_b32 s34, v32, 0
+; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
+; GCN-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_setpc_b64
+define void @no_unused_non_csr_sgpr_for_fp_no_scratch_vgpr() #1 {
+ %alloca = alloca i32, addrspace(5)
+ store volatile i32 0, i32 addrspace(5)* %alloca
+
+ ; Use all clobberable registers, so FP has to spill to a VGPR.
+ call void asm sideeffect "",
+ "~{s0},~{s1},~{s2},~{s3},~{s4},~{s5},~{s6},~{s7},~{s8},~{s9}
+ ,~{s10},~{s11},~{s12},~{s13},~{s14},~{s15},~{s16},~{s17},~{s18},~{s19}
+ ,~{s20},~{s21},~{s22},~{s23},~{s24},~{s25},~{s26},~{s27},~{s28},~{s29}
+ ,~{s30},~{s31}"() #0
+
+ call void asm sideeffect "; clobber nonpreserved VGPRs",
+ "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9}
+ ,~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19}
+ ,~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29}
+ ,~{v30},~{v31}"() #1
+
+ ret void
+}
+
+; The byval argument exceeds the MUBUF constant offset, so a scratch
+; register is needed to access the CSR VGPR slot.
+; GCN-LABEL: {{^}}scratch_reg_needed_mubuf_offset:
+; GCN: s_waitcnt
+; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}}
+; GCN-NEXT: v_mov_b32_e32 [[SCRATCH_VGPR:v[0-9]+]], 0x1008
+; GCN-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], [[SCRATCH_VGPR]], s[0:3], s32 offen ; 4-byte Folded Spill
+; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
+; GCN-NEXT: v_writelane_b32 v32, s34, 0
+; GCN-NEXT: s_mov_b32 s34, s32
+; GCN-DAG: s_add_u32 s32, s32, 0x40300{{$}}
+; GCN-DAG: s_mov_b64 vcc, s[30:31]
+; GCN-DAG: buffer_store_dword
+
+; GCN: ;;#ASMSTART
+; GCN: s_mov_b64 s[30:31], vcc
+
+; GCN: s_sub_u32 s32, s32, 0x40300{{$}}
+; GCN-NEXT: v_readlane_b32 s34, v32, 0
+; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
+; GCN-NEXT: v_mov_b32_e32 [[SCRATCH_VGPR:v[0-9]+]], 0x1008
+; GCN-NEXT: buffer_load_dword [[CSR_VGPR]], [[SCRATCH_VGPR]], s[0:3], s32 offen ; 4-byte Folded Reload
+; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_setpc_b64
+define void @scratch_reg_needed_mubuf_offset([4096 x i8] addrspace(5)* byval align 4 %arg) #1 {
+ %alloca = alloca i32, addrspace(5)
+ store volatile i32 0, i32 addrspace(5)* %alloca
+
+ ; Use all clobberable registers, so FP has to spill to a VGPR.
+ call void asm sideeffect "; clobber nonpreserved SGPRs",
+ "~{s0},~{s1},~{s2},~{s3},~{s4},~{s5},~{s6},~{s7},~{s8},~{s9}
+ ,~{s10},~{s11},~{s12},~{s13},~{s14},~{s15},~{s16},~{s17},~{s18},~{s19}
+ ,~{s20},~{s21},~{s22},~{s23},~{s24},~{s25},~{s26},~{s27},~{s28},~{s29}
+ ,~{s30},~{s31}"() #0
+
+ ; Use all clobberable VGPRs, so a CSR spill is needed for the VGPR
+ call void asm sideeffect "; clobber nonpreserved VGPRs",
+ "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9}
+ ,~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19}
+ ,~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29}
+ ,~{v30},~{v31}"() #1
+
+ ret void
+}
+
+; GCN-LABEL: {{^}}local_empty_func:
+; GCN: s_waitcnt
+; GCN-NEXT: s_setpc_b64
+define internal void @local_empty_func() #0 {
+ ret void
+}
+
+; An FP is needed, despite not needing any spills.
+; TODO: Could check that the callee does not use the stack and omit the FP.
+; GCN-LABEL: {{^}}ipra_call_with_stack:
+; GCN: s_mov_b32 [[FP_COPY:s[0-9]+]], s34
+; GCN: s_mov_b32 s34, s32
+; GCN: s_add_u32 s32, s32, 0x400
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s34{{$}}
+; GCN: s_swappc_b64
+; GCN: s_sub_u32 s32, s32, 0x400
+; GCN: s_mov_b32 s34, [[FP_COPY]]
+define void @ipra_call_with_stack() #0 {
+ %alloca = alloca i32, addrspace(5)
+ store volatile i32 0, i32 addrspace(5)* %alloca
+ call void @local_empty_func()
+ ret void
+}
+
attributes #0 = { nounwind }
attributes #1 = { nounwind "frame-pointer"="all" }
attributes #2 = { nounwind "frame-pointer"="non-leaf" }
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-code-object-v3 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,GFX9 %s
; GCN-LABEL: {{^}}use_dispatch_ptr:
-; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s6
-; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s7
+; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s4
+; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s5
; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
define hidden void @use_dispatch_ptr() #1 {
%dispatch_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0
; GCN-LABEL: {{^}}kern_indirect_use_dispatch_ptr:
; GCN: enable_sgpr_dispatch_ptr = 1
-; GCN: s_mov_b64 s[6:7], s[4:5]
+; GCN-NOT: s[4:5]
+; GCN-NOT: s4
+; GCN-NOT: s5
define amdgpu_kernel void @kern_indirect_use_dispatch_ptr(i32) #1 {
call void @use_dispatch_ptr()
ret void
}
; GCN-LABEL: {{^}}use_queue_ptr:
-; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s6
-; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s7
+; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s4
+; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s5
; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
define hidden void @use_queue_ptr() #1 {
%queue_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.queue.ptr() #0
; GCN-LABEL: {{^}}kern_indirect_use_queue_ptr:
; GCN: enable_sgpr_queue_ptr = 1
-; GCN: s_mov_b64 s[6:7], s[4:5]
-; GCN: s_swappc_b64
+; GCN-NOT: s[4:5]
+; GCN-NOT: s4
+; GCN-NOT: s5
define amdgpu_kernel void @kern_indirect_use_queue_ptr(i32) #1 {
call void @use_queue_ptr()
ret void
; GCN-LABEL: {{^}}kern_indirect_use_queue_ptr_addrspacecast:
; CIVI: enable_sgpr_queue_ptr = 1
-
-; CIVI: s_mov_b64 s[6:7], s[4:5]
-; GFX9-NOT: s_mov_b64
-; GCN: s_swappc_b64
+; CIVI-NOT: s[4:5]
+; CIVI-NOT: s4
+; CIVI-NOT: s5
define amdgpu_kernel void @kern_indirect_use_queue_ptr_addrspacecast(i32) #1 {
call void @use_queue_ptr_addrspacecast()
ret void
}
; GCN-LABEL: {{^}}use_kernarg_segment_ptr:
-; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s6
-; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s7
+; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s4
+; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s5
; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
define hidden void @use_kernarg_segment_ptr() #1 {
%kernarg_segment_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #0
; GCN-LABEL: {{^}}kern_indirect_use_kernarg_segment_ptr:
; GCN: enable_sgpr_kernarg_segment_ptr = 1
-; GCN: s_mov_b64 s[6:7], s[4:5]
+; GCN-NOT: s[4:5]
+; GCN-NOT: s4
+; GCN-NOT: s5
; GCN: s_swappc_b64
define amdgpu_kernel void @kern_indirect_use_kernarg_segment_ptr(i32) #1 {
call void @use_kernarg_segment_ptr()
}
; GCN-LABEL: {{^}}use_dispatch_id:
-; GCN: ; use s[6:7]
+; GCN: ; use s[4:5]
define hidden void @use_dispatch_id() #1 {
%id = call i64 @llvm.amdgcn.dispatch.id()
call void asm sideeffect "; use $0", "s"(i64 %id)
; GCN-LABEL: {{^}}kern_indirect_use_dispatch_id:
; GCN: enable_sgpr_dispatch_id = 1
-
-; GCN: s_mov_b64 s[6:7], s[4:5]
+; GCN-NOT: s[4:5]
+; GCN-NOT: s4
+; GCN-NOT: s5
define amdgpu_kernel void @kern_indirect_use_dispatch_id() #1 {
call void @use_dispatch_id()
ret void
; GCN-LABEL: {{^}}use_workgroup_id_xy:
; GCN: ; use s4
-; GCN: ; use s6
+; GCN: ; use s5
define hidden void @use_workgroup_id_xy() #1 {
%val0 = call i32 @llvm.amdgcn.workgroup.id.x()
%val1 = call i32 @llvm.amdgcn.workgroup.id.y()
; GCN-LABEL: {{^}}use_workgroup_id_xyz:
; GCN: ; use s4
+; GCN: ; use s5
; GCN: ; use s6
-; GCN: ; use s7
define hidden void @use_workgroup_id_xyz() #1 {
%val0 = call i32 @llvm.amdgcn.workgroup.id.x()
%val1 = call i32 @llvm.amdgcn.workgroup.id.y()
; GCN-LABEL: {{^}}use_workgroup_id_xz:
; GCN: ; use s4
-; GCN: ; use s6
+; GCN: ; use s5
define hidden void @use_workgroup_id_xz() #1 {
%val0 = call i32 @llvm.amdgcn.workgroup.id.x()
%val1 = call i32 @llvm.amdgcn.workgroup.id.z()
; GCN-LABEL: {{^}}use_workgroup_id_yz:
; GCN: ; use s4
-; GCN: ; use s6
+; GCN: ; use s5
define hidden void @use_workgroup_id_yz() #1 {
%val0 = call i32 @llvm.amdgcn.workgroup.id.y()
%val1 = call i32 @llvm.amdgcn.workgroup.id.z()
; GCN: s_mov_b32 s33, s8
+; GCN: s_mov_b32 s5, s7
; GCN: s_mov_b32 s4, s6
-; GCN: s_mov_b32 s6, s7
; GCN: s_mov_b32 s32, s33
; GCN: s_swappc_b64
define amdgpu_kernel void @kern_indirect_use_workgroup_id_xy() #1 {
; GCN: s_mov_b32 s33, s9
; GCN: s_mov_b32 s4, s6
-; GCN: s_mov_b32 s6, s7
-; GCN: s_mov_b32 s7, s8
+; GCN: s_mov_b32 s5, s7
+; GCN: s_mov_b32 s6, s8
; GCN: s_mov_b32 s32, s33
; GCN: s_swappc_b64
; GCN: enable_sgpr_workgroup_id_z = 1
; GCN: s_mov_b32 s33, s8
+; GCN: s_mov_b32 s5, s7
; GCN: s_mov_b32 s4, s6
-; GCN: s_mov_b32 s6, s7
; GCN: s_mov_b32 s32, s33
; GCN: enable_sgpr_workgroup_id_z = 1
; GCN: s_mov_b32 s33, s9
-; GCN: s_mov_b32 s6, s8
; GCN: s_mov_b32 s4, s7
+; GCN: s_mov_b32 s5, s8
; GCN: s_mov_b32 s32, s33
; GCN: s_swappc_b64
define amdgpu_kernel void @kern_indirect_use_workgroup_id_yz() #1 {
; GCN-LABEL: {{^}}use_every_sgpr_input:
; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32{{$}}
+; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s4
+; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s5
+; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s6
; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s7
; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s8
; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s9
; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
-; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s10
-; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s11
-; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
-; GCN: ; use s[12:13]
-; GCN: ; use s4
+; GCN: ; use s[10:11]
+; GCN: ; use s12
+; GCN: ; use s13
; GCN: ; use s14
-; GCN: ; use s15
define hidden void @use_every_sgpr_input() #1 {
%alloca = alloca i32, align 4, addrspace(5)
store volatile i32 0, i32 addrspace(5)* %alloca
; GCN: enable_sgpr_flat_scratch_init = 1
; GCN: s_mov_b32 s33, s17
-; GCN: s_mov_b64 s[12:13], s[10:11]
-; GCN: s_mov_b64 s[10:11], s[8:9]
-; GCN: s_mov_b64 s[8:9], s[6:7]
-; GCN: s_mov_b64 s[6:7], s[4:5]
-; GCN: s_mov_b32 s4, s14
-; GCN: s_mov_b32 s14, s15
-; GCN: s_mov_b32 s15, s16
+; GCN: s_mov_b32 s12, s14
+; GCN: s_mov_b32 s13, s15
+; GCN: s_mov_b32 s14, s16
; GCN: s_mov_b32 s32, s33
; GCN: s_swappc_b64
define amdgpu_kernel void @kern_indirect_use_every_sgpr_input() #1 {
; GCN-NOT: s[8:9]
; GCN-NOT: s[10:11]
; GCN-NOT: s[12:13]
-; GCN: s_or_saveexec_b64 s[6:7], -1
+; GCN: s_or_saveexec_b64 s[4:5], -1
define hidden void @func_indirect_use_every_sgpr_input() #1 {
call void @use_every_sgpr_input()
ret void
}
; GCN-LABEL: {{^}}func_use_every_sgpr_input_call_use_workgroup_id_xyz:
-; GCN-NOT: s_mov_b32 s4
+; GCN: s_mov_b32 s4, s12
+; GCN: s_mov_b32 s5, s13
; GCN: s_mov_b32 s6, s14
-; GCN-NEXT: s_mov_b32 s7, s15
-; GCN-NOT: s_mov_b32 s4
+; GCN: ; use s[10:11]
+; GCN: ; use s12
+; GCN: ; use s13
+; GCN: ; use s14
; GCN: s_swappc_b64
define hidden void @func_use_every_sgpr_input_call_use_workgroup_id_xyz() #1 {
}
; GCN-LABEL: {{^}}func_use_every_sgpr_input_call_use_workgroup_id_xyz_spill:
-; GCN: s_mov_b32 s5, s32
-
+; GCN-DAG: s_mov_b32 s34, s32
; GCN-DAG: s_add_u32 s32, s32, 0x400
+; GCN-DAG: s_mov_b64 s{{\[}}[[LO_X:[0-9]+]]{{\:}}[[HI_X:[0-9]+]]{{\]}}, s[4:5]
+; GCN-DAG: s_mov_b64 s{{\[}}[[LO_Y:[0-9]+]]{{\:}}[[HI_Y:[0-9]+]]{{\]}}, s[6:7]
+
+
+; GCN: s_mov_b32 s4, s12
+; GCN: s_mov_b32 s5, s13
+; GCN: s_mov_b32 s6, s14
-; GCN: s_mov_b64 s{{\[}}[[LO_X:[0-9]+]]{{\:}}[[HI_X:[0-9]+]]{{\]}}, s[6:7]
-; GCN-NOT: s_mov_b32 s4,
-; GCN-DAG: s_mov_b32 s6, s14
-; GCN-DAG: s_mov_b32 s7, s15
+; GCN: s_mov_b64 s{{\[}}[[LO_Z:[0-9]+]]{{\:}}[[HI_Z:[0-9]+]]{{\]}}, s[8:9]
-; GCN: s_mov_b64 s{{\[}}[[LO_Y:[0-9]+]]{{\:}}[[HI_Y:[0-9]+]]{{\]}}, s[8:9]
+; GCN-DAG: s_mov_b32 [[SAVE_X:s[0-57-9][0-9]*]], s12
+; GCN-DAG: s_mov_b32 [[SAVE_Y:s[0-57-9][0-9]*]], s13
+; GCN-DAG: s_mov_b32 [[SAVE_Z:s[0-68-9][0-9]*]], s14
-; GCN-DAG: s_mov_b32 [[SAVE_X:s[0-57-9][0-9]*]], s4
-; GCN-DAG: s_mov_b32 [[SAVE_Y:s[0-57-9][0-9]*]], s14
-; GCN-DAG: s_mov_b32 [[SAVE_Z:s[0-68-9][0-9]*]], s15
-; GCN: s_mov_b64 s{{\[}}[[LO_Z:[0-9]+]]{{\:}}[[HI_Z:[0-9]+]]{{\]}}, s[10:11]
; GCN: s_swappc_b64
-; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s5{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s34{{$}}
; GCN-DAG: v_mov_b32_e32 v[[LO1:[0-9]+]], s[[LO_X]]
; GCN-DAG: v_mov_b32_e32 v[[HI1:[0-9]+]], s[[HI_X]]
; GCN-DAG: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO1]]:[[HI1]]{{\]}}
}
; GCN-LABEL: {{^}}func_call_too_many_args_use_workitem_id_x:
-; GCN: s_mov_b32 s5, s32
+; GCN: s_mov_b32 s34, s32
; GCN: buffer_store_dword v1, off, s[0:3], s32{{$}}
; GCN: s_swappc_b64
define void @func_call_too_many_args_use_workitem_id_x(i32 %arg0) #1 {
; Requires loading and storing to a stack slot.
; GCN-LABEL: {{^}}too_many_args_call_too_many_args_use_workitem_id_x:
-; GCN: s_add_u32 s32, s32, 0x400{{$}}
-; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill
-; GCN: buffer_load_dword v32, off, s[0:3], s5{{$}}
+; GCN-DAG: s_add_u32 s32, s32, 0x400{{$}}
+; GCN-DAG: buffer_store_dword v32, off, s[0:3], s34 offset:4 ; 4-byte Folded Spill
+; GCN: buffer_load_dword v32, off, s[0:3], s34{{$}}
; GCN: buffer_store_dword v32, off, s[0:3], s32{{$}}
; GCN: s_swappc_b64
-; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Reload
+; GCN: buffer_load_dword v32, off, s[0:3], s34 offset:4 ; 4-byte Folded Reload
; GCN: s_sub_u32 s32, s32, 0x400{{$}}
; GCN: s_setpc_b64
define void @too_many_args_call_too_many_args_use_workitem_id_x(
; GCN-LABEL: {{^}}func_call_too_many_args_use_workitem_id_x_byval:
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}}
-; GCN: buffer_store_dword [[K]], off, s[0:3], s5{{$}}
+; GCN: buffer_store_dword [[K]], off, s[0:3], s34{{$}}
; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:4
-; GCN: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s5{{$}}
+; GCN: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s34{{$}}
; GCN: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32{{$}}
; GCN: v_mov_b32_e32 [[RELOAD_BYVAL]],
; GCN: s_swappc_b64
; GCN-LABEL: call_split_type_used_outside_block_v2f32:
; GCN: ; %bb.0: ; %bb0
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s5, s32
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: v_writelane_b32 v32, s34, 2
+; GCN-NEXT: v_writelane_b32 v32, s36, 0
+; GCN-NEXT: s_mov_b32 s34, s32
; GCN-NEXT: s_add_u32 s32, s32, 0x400
-; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
-; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 ; 4-byte Folded Spill
-; GCN-NEXT: s_mov_b64 exec, s[6:7]
-; GCN-NEXT: v_writelane_b32 v32, s34, 0
-; GCN-NEXT: v_writelane_b32 v32, s35, 1
-; GCN-NEXT: v_writelane_b32 v32, s36, 2
-; GCN-NEXT: s_getpc_b64 s[6:7]
-; GCN-NEXT: s_add_u32 s6, s6, func_v2f32@rel32@lo+4
-; GCN-NEXT: s_addc_u32 s7, s7, func_v2f32@rel32@hi+4
-; GCN-NEXT: s_mov_b64 s[34:35], s[30:31]
-; GCN-NEXT: s_mov_b32 s36, s5
-; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GCN-NEXT: s_mov_b32 s5, s36
-; GCN-NEXT: s_mov_b64 s[30:31], s[34:35]
-; GCN-NEXT: v_readlane_b32 s36, v32, 2
-; GCN-NEXT: v_readlane_b32 s35, v32, 1
-; GCN-NEXT: v_readlane_b32 s34, v32, 0
-; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s5 ; 4-byte Folded Reload
-; GCN-NEXT: s_mov_b64 exec, s[6:7]
+; GCN-NEXT: v_writelane_b32 v32, s37, 1
+; GCN-NEXT: s_getpc_b64 s[4:5]
+; GCN-NEXT: s_add_u32 s4, s4, func_v2f32@rel32@lo+4
+; GCN-NEXT: s_addc_u32 s5, s5, func_v2f32@rel32@hi+4
+; GCN-NEXT: s_mov_b64 s[36:37], s[30:31]
+; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GCN-NEXT: s_mov_b64 s[30:31], s[36:37]
+; GCN-NEXT: v_readlane_b32 s37, v32, 1
+; GCN-NEXT: v_readlane_b32 s36, v32, 0
; GCN-NEXT: s_sub_u32 s32, s32, 0x400
+; GCN-NEXT: v_readlane_b32 s34, v32, 2
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
bb0:
; GCN-LABEL: call_split_type_used_outside_block_v3f32:
; GCN: ; %bb.0: ; %bb0
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s5, s32
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: v_writelane_b32 v32, s34, 2
+; GCN-NEXT: v_writelane_b32 v32, s36, 0
+; GCN-NEXT: s_mov_b32 s34, s32
; GCN-NEXT: s_add_u32 s32, s32, 0x400
-; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
-; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 ; 4-byte Folded Spill
-; GCN-NEXT: s_mov_b64 exec, s[6:7]
-; GCN-NEXT: v_writelane_b32 v32, s34, 0
-; GCN-NEXT: v_writelane_b32 v32, s35, 1
-; GCN-NEXT: v_writelane_b32 v32, s36, 2
-; GCN-NEXT: s_getpc_b64 s[6:7]
-; GCN-NEXT: s_add_u32 s6, s6, func_v3f32@rel32@lo+4
-; GCN-NEXT: s_addc_u32 s7, s7, func_v3f32@rel32@hi+4
-; GCN-NEXT: s_mov_b64 s[34:35], s[30:31]
-; GCN-NEXT: s_mov_b32 s36, s5
-; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GCN-NEXT: s_mov_b32 s5, s36
-; GCN-NEXT: s_mov_b64 s[30:31], s[34:35]
-; GCN-NEXT: v_readlane_b32 s36, v32, 2
-; GCN-NEXT: v_readlane_b32 s35, v32, 1
-; GCN-NEXT: v_readlane_b32 s34, v32, 0
-; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s5 ; 4-byte Folded Reload
-; GCN-NEXT: s_mov_b64 exec, s[6:7]
+; GCN-NEXT: v_writelane_b32 v32, s37, 1
+; GCN-NEXT: s_getpc_b64 s[4:5]
+; GCN-NEXT: s_add_u32 s4, s4, func_v3f32@rel32@lo+4
+; GCN-NEXT: s_addc_u32 s5, s5, func_v3f32@rel32@hi+4
+; GCN-NEXT: s_mov_b64 s[36:37], s[30:31]
+; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GCN-NEXT: s_mov_b64 s[30:31], s[36:37]
+; GCN-NEXT: v_readlane_b32 s37, v32, 1
+; GCN-NEXT: v_readlane_b32 s36, v32, 0
; GCN-NEXT: s_sub_u32 s32, s32, 0x400
+; GCN-NEXT: v_readlane_b32 s34, v32, 2
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
bb0:
; GCN-LABEL: call_split_type_used_outside_block_v4f16:
; GCN: ; %bb.0: ; %bb0
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s5, s32
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: v_writelane_b32 v32, s34, 2
+; GCN-NEXT: v_writelane_b32 v32, s36, 0
+; GCN-NEXT: s_mov_b32 s34, s32
; GCN-NEXT: s_add_u32 s32, s32, 0x400
-; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
-; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 ; 4-byte Folded Spill
-; GCN-NEXT: s_mov_b64 exec, s[6:7]
-; GCN-NEXT: v_writelane_b32 v32, s34, 0
-; GCN-NEXT: v_writelane_b32 v32, s35, 1
-; GCN-NEXT: v_writelane_b32 v32, s36, 2
-; GCN-NEXT: s_getpc_b64 s[6:7]
-; GCN-NEXT: s_add_u32 s6, s6, func_v4f16@rel32@lo+4
-; GCN-NEXT: s_addc_u32 s7, s7, func_v4f16@rel32@hi+4
-; GCN-NEXT: s_mov_b64 s[34:35], s[30:31]
-; GCN-NEXT: s_mov_b32 s36, s5
-; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GCN-NEXT: s_mov_b32 s5, s36
-; GCN-NEXT: s_mov_b64 s[30:31], s[34:35]
-; GCN-NEXT: v_readlane_b32 s36, v32, 2
-; GCN-NEXT: v_readlane_b32 s35, v32, 1
-; GCN-NEXT: v_readlane_b32 s34, v32, 0
-; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s5 ; 4-byte Folded Reload
-; GCN-NEXT: s_mov_b64 exec, s[6:7]
+; GCN-NEXT: v_writelane_b32 v32, s37, 1
+; GCN-NEXT: s_getpc_b64 s[4:5]
+; GCN-NEXT: s_add_u32 s4, s4, func_v4f16@rel32@lo+4
+; GCN-NEXT: s_addc_u32 s5, s5, func_v4f16@rel32@hi+4
+; GCN-NEXT: s_mov_b64 s[36:37], s[30:31]
+; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GCN-NEXT: s_mov_b64 s[30:31], s[36:37]
+; GCN-NEXT: v_readlane_b32 s37, v32, 1
+; GCN-NEXT: v_readlane_b32 s36, v32, 0
; GCN-NEXT: s_sub_u32 s32, s32, 0x400
+; GCN-NEXT: v_readlane_b32 s34, v32, 2
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
bb0:
; GCN-LABEL: call_split_type_used_outside_block_struct:
; GCN: ; %bb.0: ; %bb0
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s5, s32
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: v_writelane_b32 v32, s34, 2
+; GCN-NEXT: v_writelane_b32 v32, s36, 0
+; GCN-NEXT: s_mov_b32 s34, s32
; GCN-NEXT: s_add_u32 s32, s32, 0x400
-; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
-; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 ; 4-byte Folded Spill
-; GCN-NEXT: s_mov_b64 exec, s[6:7]
-; GCN-NEXT: v_writelane_b32 v32, s34, 0
-; GCN-NEXT: v_writelane_b32 v32, s35, 1
-; GCN-NEXT: v_writelane_b32 v32, s36, 2
-; GCN-NEXT: s_getpc_b64 s[6:7]
-; GCN-NEXT: s_add_u32 s6, s6, func_struct@rel32@lo+4
-; GCN-NEXT: s_addc_u32 s7, s7, func_struct@rel32@hi+4
-; GCN-NEXT: s_mov_b64 s[34:35], s[30:31]
-; GCN-NEXT: s_mov_b32 s36, s5
-; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GCN-NEXT: s_mov_b32 s5, s36
+; GCN-NEXT: v_writelane_b32 v32, s37, 1
+; GCN-NEXT: s_getpc_b64 s[4:5]
+; GCN-NEXT: s_add_u32 s4, s4, func_struct@rel32@lo+4
+; GCN-NEXT: s_addc_u32 s5, s5, func_struct@rel32@hi+4
+; GCN-NEXT: s_mov_b64 s[36:37], s[30:31]
+; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GCN-NEXT: s_mov_b64 s[30:31], s[36:37]
+; GCN-NEXT: v_readlane_b32 s37, v32, 1
+; GCN-NEXT: v_readlane_b32 s36, v32, 0
; GCN-NEXT: v_mov_b32_e32 v1, v4
-; GCN-NEXT: s_mov_b64 s[30:31], s[34:35]
-; GCN-NEXT: v_readlane_b32 s36, v32, 2
-; GCN-NEXT: v_readlane_b32 s35, v32, 1
-; GCN-NEXT: v_readlane_b32 s34, v32, 0
-; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s5 ; 4-byte Folded Reload
-; GCN-NEXT: s_mov_b64 exec, s[6:7]
; GCN-NEXT: s_sub_u32 s32, s32, 0x400
+; GCN-NEXT: v_readlane_b32 s34, v32, 2
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
bb0:
; CI: s_sub_u32 [[SUB:s[0-9]+]], s32, s33
; CI-NEXT: v_lshr_b32_e64 [[SCALED:v[0-9]+]], [[SUB]], 6
-; CI-NEXT: v_add_i32_e64 v1, s[6:7], 4, [[SCALED]]
+; CI-NEXT: v_add_i32_e64 v1, s{{\[[0-9]+:[0-9]+\]}}, 4, [[SCALED]]
; CI-NOT: v_mov
; CI: ds_write_b32 v0, v0
; CI-NEXT: ds_write_b32 v0, v1
; GCN-DAG: s_movk_i32 [[K:s[0-9]+]], 0x200
; CI-DAG: v_lshr_b32_e64 [[SCALED:v[0-9]+]], [[SUB]], 6
-; CI: v_add_i32_e64 [[VZ:v[0-9]+]], s[6:7], [[K]], [[SCALED]]
+; CI: v_add_i32_e64 [[VZ:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, [[K]], [[SCALED]]
; GFX9-DAG: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, [[SUB]]
; GFX9: v_add_u32_e32 [[VZ:v[0-9]+]], [[K]], [[SCALED]]
; GCN-LABEL: {{^}}undefined_stack_store_reg:
; GCN: s_and_saveexec_b64
-; GCN: buffer_store_dword v0, off, s[0:3], s5 offset:
-; GCN: buffer_store_dword v0, off, s[0:3], s5 offset:
-; GCN: buffer_store_dword v0, off, s[0:3], s5 offset:
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s5 offset:
+; GCN: buffer_store_dword v0, off, s[0:3], s34 offset:
+; GCN: buffer_store_dword v0, off, s[0:3], s34 offset:
+; GCN: buffer_store_dword v0, off, s[0:3], s34 offset:
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s34 offset:
define void @undefined_stack_store_reg(float %arg, i32 %arg1) #0 {
bb:
%tmp = alloca <4 x float>, align 16, addrspace(5)
}
; GCN-LABEL: {{^}}f32_func_void:
-; GCN: buffer_load_dword v0, off, s[8:11], 0
+; GCN: buffer_load_dword v0, off, s[4:7], 0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define float @f32_func_void() #0 {
; GCN-LABEL: {{^}}func_tail_call:
; GCN: s_waitcnt
-; GCN-NEXT: s_getpc_b64 s[6:7]
-; GCN-NEXT: s_add_u32 s6,
-; GCN-NEXT: s_addc_u32 s7,
-; GCN-NEXT: s_setpc_b64 s[6:7]
+; GCN-NEXT: s_getpc_b64 s[4:5]
+; GCN-NEXT: s_add_u32 s4,
+; GCN-NEXT: s_addc_u32 s5,
+; GCN-NEXT: s_setpc_b64 s[4:5]
; GCN: ; NumSgprs: 32
; GCN: ; NumVgprs: 8
; SI-LABEL: {{^}}test_fold_and_ord:
; SI: s_waitcnt
-; SI-NEXT: v_cmp_class_f32_e64 s[6:7], v0, 32{{$}}
-; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[6:7]
+; SI-NEXT: v_cmp_class_f32_e64 [[COND:s\[[0-9]+:[0-9]+\]]], v0, 32{{$}}
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, [[COND]]
; SI-NEXT: s_setpc_b64
define i1 @test_fold_and_ord(float %a) {
%class = call i1 @llvm.amdgcn.class.f32(float %a, i32 35) #1
; SI-LABEL: {{^}}test_fold_and_unord:
; SI: s_waitcnt
-; SI-NEXT: v_cmp_class_f32_e64 s[6:7], v0, 3{{$}}
-; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[6:7]
+; SI-NEXT: v_cmp_class_f32_e64 [[COND:s\[[0-9]+:[0-9]+\]]], v0, 3{{$}}
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, [[COND]]
; SI-NEXT: s_setpc_b64
define i1 @test_fold_and_unord(float %a) {
%class = call i1 @llvm.amdgcn.class.f32(float %a, i32 35) #1
; GCN-LABEL: {{^}}func_implicitarg_ptr:
; GCN: s_waitcnt
-; MESA: v_mov_b32_e32 v0, s6
-; MESA: v_mov_b32_e32 v1, s7
+; MESA: v_mov_b32_e32 v0, s4
+; MESA: v_mov_b32_e32 v1, s5
; MESA: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
-; HSA: v_mov_b32_e32 v0, s6
-; HSA: v_mov_b32_e32 v1, s7
+; HSA: v_mov_b32_e32 v0, s4
+; HSA: v_mov_b32_e32 v1, s5
; HSA: flat_load_dword v0, v[0:1]
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
; GCN-LABEL: {{^}}opencl_func_implicitarg_ptr:
; GCN: s_waitcnt
-; MESA: v_mov_b32_e32 v0, s6
-; MESA: v_mov_b32_e32 v1, s7
+; MESA: v_mov_b32_e32 v0, s4
+; MESA: v_mov_b32_e32 v1, s5
; MESA: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
-; HSA: v_mov_b32_e32 v0, s6
-; HSA: v_mov_b32_e32 v1, s7
+; HSA: v_mov_b32_e32 v0, s4
+; HSA: v_mov_b32_e32 v1, s5
; HSA: flat_load_dword v0, v[0:1]
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
; GCN: enable_sgpr_kernarg_segment_ptr = 1
; HSA: kernarg_segment_byte_size = 0
; MESA: kernarg_segment_byte_size = 16
-; GCN: s_mov_b64 s[6:7], s[4:5]
+; GCN-NOT: s[4:5]
+; GCN-NOT: s4
+; GCN-NOT: s5
; GCN: s_swappc_b64
define amdgpu_kernel void @kernel_call_implicitarg_ptr_func_empty() #0 {
call void @func_implicitarg_ptr()
; GCN: enable_sgpr_kernarg_segment_ptr = 1
; HSA: kernarg_segment_byte_size = 48
; MESA: kernarg_segment_byte_size = 16
-; GCN: s_mov_b64 s[6:7], s[4:5]
+; GCN-NOT: s[4:5]
+; GCN-NOT: s4
+; GCN-NOT: s5
; GCN: s_swappc_b64
define amdgpu_kernel void @opencl_kernel_call_implicitarg_ptr_func_empty() #1 {
call void @func_implicitarg_ptr()
; HSA: kernarg_segment_byte_size = 112
; MESA: kernarg_segment_byte_size = 128
-; HSA: s_add_u32 s6, s4, 0x70
-; MESA: s_add_u32 s6, s4, 0x70
+; HSA: s_add_u32 s4, s4, 0x70
+; MESA: s_add_u32 s4, s4, 0x70
-; GCN: s_addc_u32 s7, s5, 0{{$}}
+; GCN: s_addc_u32 s5, s5, 0{{$}}
; GCN: s_swappc_b64
define amdgpu_kernel void @kernel_call_implicitarg_ptr_func([112 x i8]) #0 {
call void @func_implicitarg_ptr()
; HSA: kernarg_segment_byte_size = 160
; MESA: kernarg_segment_byte_size = 128
-; GCN: s_add_u32 s6, s4, 0x70
-
-; GCN: s_addc_u32 s7, s5, 0{{$}}
+; GCN: s_add_u32 s4, s4, 0x70
+; GCN: s_addc_u32 s5, s5, 0{{$}}
; GCN: s_swappc_b64
define amdgpu_kernel void @opencl_kernel_call_implicitarg_ptr_func([112 x i8]) #1 {
call void @func_implicitarg_ptr()
}
; GCN-LABEL: {{^}}func_call_implicitarg_ptr_func:
-; GCN-NOT: s6
-; GCN-NOT: s7
-; GCN-NOT: s[6:7]
+; GCN-NOT: s4
+; GCN-NOT: s5
+; GCN-NOT: s[4:5]
define void @func_call_implicitarg_ptr_func() #0 {
call void @func_implicitarg_ptr()
ret void
}
; GCN-LABEL: {{^}}opencl_func_call_implicitarg_ptr_func:
-; GCN-NOT: s6
-; GCN-NOT: s7
-; GCN-NOT: s[6:7]
+; GCN-NOT: s4
+; GCN-NOT: s5
+; GCN-NOT: s[4:5]
define void @opencl_func_call_implicitarg_ptr_func() #0 {
call void @func_implicitarg_ptr()
ret void
; GCN-LABEL: {{^}}func_kernarg_implicitarg_ptr:
; GCN: s_waitcnt
-; MESA-DAG: v_mov_b32_e32 v0, s6
-; MESA-DAG: v_mov_b32_e32 v1, s7
-; MESA-DAG: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64
-; MESA: v_mov_b32_e32 v0, s8
-; MESA: v_mov_b32_e32 v1, s9
-; MESA: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64
+; MESA-DAG: v_mov_b32_e32 v0, s4
+; MESA-DAG: v_mov_b32_e32 v1, s5
+; MESA-DAG: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
+; MESA: v_mov_b32_e32 v0, s6
+; MESA: v_mov_b32_e32 v1, s7
+; MESA: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
+; HSA: v_mov_b32_e32 v0, s4
+; HSA: v_mov_b32_e32 v1, s5
+; HSA: flat_load_dword v0, v[0:1]
; HSA: v_mov_b32_e32 v0, s6
; HSA: v_mov_b32_e32 v1, s7
; HSA: flat_load_dword v0, v[0:1]
-; HSA: v_mov_b32_e32 v0, s8
-; HSA: v_mov_b32_e32 v1, s9
-; HSA: flat_load_dword v0, v[0:1]
; GCN: s_waitcnt vmcnt(0)
define void @func_kernarg_implicitarg_ptr() #0 {
; GCN-LABEL: {{^}}opencl_func_kernarg_implicitarg_ptr:
; GCN: s_waitcnt
+; MESA-DAG: v_mov_b32_e32 v0, s4
+; MESA-DAG: v_mov_b32_e32 v1, s5
+; MESA: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; MESA-DAG: v_mov_b32_e32 v0, s6
; MESA-DAG: v_mov_b32_e32 v1, s7
-; MESA: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64
-; MESA-DAG: v_mov_b32_e32 v0, s8
-; MESA-DAG: v_mov_b32_e32 v1, s9
-; MESA: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64
+; MESA: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
-; HSA: v_mov_b32_e32 v0, s6
-; HSA: v_mov_b32_e32 v1, s7
+; HSA: v_mov_b32_e32 v0, s4
+; HSA: v_mov_b32_e32 v1, s5
; HSA: flat_load_dword v0, v[0:1]
-; HSA: v_mov_b32_e32 v0, s8
-; HSA: v_mov_b32_e32 v1, s9
+; HSA: v_mov_b32_e32 v0, s6
+; HSA: v_mov_b32_e32 v1, s7
; HSA: flat_load_dword v0, v[0:1]
; GCN: s_waitcnt vmcnt(0)
}
; GCN-LABEL: {{^}}kernel_call_kernarg_implicitarg_ptr_func:
-; GCN: s_add_u32 s8, s4, 0x70
-; GCN: s_addc_u32 s9, s5, 0
-
-; GCN: s_mov_b64 s[6:7], s[4:5]
+; GCN: s_add_u32 s6, s4, 0x70
+; GCN: s_addc_u32 s7, s5, 0
; GCN: s_swappc_b64
define amdgpu_kernel void @kernel_call_kernarg_implicitarg_ptr_func([112 x i8]) #0 {
call void @func_kernarg_implicitarg_ptr()
; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI %s
; GCN-LABEL: {{^}}mad_i64_i32_sextops:
-; CI: v_mad_i64_i32 v[0:1], s[6:7], v0, v1, v[2:3]
+; CI: v_mad_i64_i32 v[0:1], s{{\[[0-9]+:[0-9]+\]}}, v0, v1, v[2:3]
; SI: v_mul_lo_u32
; SI: v_mul_hi_i32
}
; GCN-LABEL: {{^}}mad_i64_i32_sextops_commute:
-; CI: v_mad_i64_i32 v[0:1], s[6:7], v0, v1, v[2:3]
+; CI: v_mad_i64_i32 v[0:1], s{{\[[0-9]+:[0-9]+\]}}, v0, v1, v[2:3]
; SI-DAG: v_mul_lo_u32
; SI-DAG: v_mul_hi_i32
}
; GCN-LABEL: {{^}}mad_u64_u32_zextops:
-; CI: v_mad_u64_u32 v[0:1], s[6:7], v0, v1, v[2:3]
+; CI: v_mad_u64_u32 v[0:1], s{{\[[0-9]+:[0-9]+\]}}, v0, v1, v[2:3]
; SI-DAG: v_mul_lo_u32
; SI-DAG: v_mul_hi_u32
}
; GCN-LABEL: {{^}}mad_u64_u32_zextops_commute:
-; CI: v_mad_u64_u32 v[0:1], s[6:7], v0, v1, v[2:3]
+; CI: v_mad_u64_u32 v[0:1], s{{\[[0-9]+:[0-9]+\]}}, v0, v1, v[2:3]
; SI-DAG: v_mul_lo_u32
; SI-DAG: v_mul_hi_u32
; GCN-LABEL: {{^}}mad_i64_i32_sextops_i32_i63:
; CI: v_lshl_b64
; CI: v_ashr
-; CI: v_mad_i64_i32 v[0:1], s[6:7], v0, v1, v[2:3]
+; CI: v_mad_i64_i32 v[0:1], s{{\[[0-9]+:[0-9]+\]}}, v0, v1, v[2:3]
; SI-NOT: v_mad_u64_u32
define i63 @mad_i64_i32_sextops_i32_i63(i32 %arg0, i32 %arg1, i63 %arg2) #0 {
; CI: v_bfe_i32 v[[B1:[0-9]+]], v1, 0, 31
; CI: v_ashr_i64
; CI: v_bfe_i32 v[[B2:[0-9]+]], v0, 0, 31
-; CI: v_mad_i64_i32 v[0:1], s[6:7], v[[B2]], v[[B1]], v[1:2]
+; CI: v_mad_i64_i32 v[0:1], s{{\[[0-9]+:[0-9]+\]}}, v[[B2]], v[[B1]], v[1:2]
define i63 @mad_i64_i32_sextops_i31_i63(i31 %arg0, i31 %arg1, i63 %arg2) #0 {
%sext0 = sext i31 %arg0 to i63
%sext1 = sext i31 %arg1 to i63
}
; GCN-LABEL: {{^}}mad_u64_u32_bitops:
-; CI: v_mad_u64_u32 v[0:1], s[6:7], v0, v2, v[4:5]
+; CI: v_mad_u64_u32 v[0:1], s{{\[[0-9]+:[0-9]+\]}}, v0, v2, v[4:5]
define i64 @mad_u64_u32_bitops(i64 %arg0, i64 %arg1, i64 %arg2) #0 {
%trunc.lhs = and i64 %arg0, 4294967295
%trunc.rhs = and i64 %arg1, 4294967295
}
; GCN-LABEL: {{^}}mad_i64_i32_bitops:
-; CI: v_mad_i64_i32 v[0:1], s[6:7], v0, v2, v[4:5]
+; CI: v_mad_i64_i32 v[0:1], s{{\[[0-9]+:[0-9]+\]}}, v0, v2, v[4:5]
; SI-NOT: v_mad_
define i64 @mad_i64_i32_bitops(i64 %arg0, i64 %arg1, i64 %arg2) #0 {
%shl.lhs = shl i64 %arg0, 32
; Example from bug report
; GCN-LABEL: {{^}}mad_i64_i32_unpack_i64ops:
-; CI: v_mad_u64_u32 v[0:1], s[6:7], v1, v0, v[0:1]
+; CI: v_mad_u64_u32 v[0:1], s{{\[[0-9]+:[0-9]+\]}}, v1, v0, v[0:1]
; SI-NOT: v_mad_u64_u32
define i64 @mad_i64_i32_unpack_i64ops(i64 %arg0) #0 {
%tmp4 = lshr i64 %arg0, 32
; GCN-LABEL: {{^}}test_func_call_external_void_func_i32_imm:
; GCN: s_waitcnt
-; GCN: s_mov_b32 s5, s32
-; GCN-DAG: s_add_u32 s32, s32, 0x400
+
; Spill CSR VGPR used for SGPR spilling
; GCN: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}}
-; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
-
-; GCN-DAG: v_writelane_b32 v32, s34, 0
-; GCN-DAG: v_writelane_b32 v32, s35, 1
-; GCN-DAG: v_writelane_b32 v32, s36, 2
+; GCN-DAG: v_writelane_b32 v32, s34, 2
+; GCN-DAG: s_mov_b32 s34, s32
+; GCN-DAG: s_add_u32 s32, s32, 0x400
+; GCN-DAG: v_writelane_b32 v32, s36, 0
+; GCN-DAG: v_writelane_b32 v32, s37, 1
; GCN: s_swappc_b64
-; GCN: v_readlane_b32 s36, v32, 2
-; GCN: v_readlane_b32 s35, v32, 1
-; GCN: v_readlane_b32 s34, v32, 0
+; GCN: v_readlane_b32 s37, v32, 1
+; GCN: v_readlane_b32 s36, v32, 0
+
+; GCN-NEXT: s_sub_u32 s32, s32, 0x400
+; GCN-NEXT: v_readlane_b32 s34, v32, 2
; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s5 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
-
-; GCN: s_sub_u32 s32, s32, 0x400
-; GCN: s_setpc_b64
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_setpc_b64
define void @test_func_call_external_void_func_i32_imm() #0 {
call void @external_void_func_i32(i32 42)
ret void
; GCN-LABEL: {{^}}test_func_call_external_void_func_i32_imm_stack_use:
; GCN: s_waitcnt
-; GCN: s_mov_b32 s5, s32
+; GCN: s_mov_b32 s34, s32
; GCN-DAG: s_add_u32 s32, s32, 0x1400{{$}}
-; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s5 offset
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s34 offset:
; GCN: s_swappc_b64
; GCN: s_sub_u32 s32, s32, 0x1400{{$}}
; GCN: s_setpc_b64
; GCN-NEXT: v_sub_i32_e32 v7, vcc, 64, v4
; GCN-NEXT: v_lshl_b64 v[5:6], v[2:3], v4
; GCN-NEXT: v_lshr_b64 v[7:8], v[0:1], v7
-; GCN-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v4
+; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4
; GCN-NEXT: v_or_b32_e32 v7, v5, v7
; GCN-NEXT: v_subrev_i32_e32 v5, vcc, 64, v4
; GCN-NEXT: v_or_b32_e32 v8, v6, v8
; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], v4
; GCN-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc
; GCN-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
-; GCN-NEXT: v_cndmask_b32_e64 v3, v6, v3, s[6:7]
-; GCN-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[6:7]
+; GCN-NEXT: v_cndmask_b32_e64 v3, v6, v3, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[4:5]
; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
; GCN-NEXT: v_sub_i32_e32 v7, vcc, 64, v4
; GCN-NEXT: v_lshr_b64 v[5:6], v[0:1], v4
; GCN-NEXT: v_lshl_b64 v[7:8], v[2:3], v7
-; GCN-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v4
+; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4
; GCN-NEXT: v_or_b32_e32 v7, v5, v7
; GCN-NEXT: v_subrev_i32_e32 v5, vcc, 64, v4
; GCN-NEXT: v_or_b32_e32 v8, v6, v8
; GCN-NEXT: v_lshr_b64 v[2:3], v[2:3], v4
; GCN-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc
; GCN-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
-; GCN-NEXT: v_cndmask_b32_e64 v1, v6, v1, s[6:7]
-; GCN-NEXT: v_cndmask_b32_e64 v0, v5, v0, s[6:7]
+; GCN-NEXT: v_cndmask_b32_e64 v1, v6, v1, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v0, v5, v0, s[4:5]
; GCN-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
; GCN-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
; GCN-NEXT: v_subrev_i32_e32 v10, vcc, 64, v4
; GCN-NEXT: v_ashr_i64 v[5:6], v[2:3], v4
; GCN-NEXT: v_ashr_i64 v[2:3], v[2:3], v10
-; GCN-NEXT: v_cmp_gt_u32_e64 s[6:7], 64, v4
+; GCN-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v4
; GCN-NEXT: v_or_b32_e32 v7, v7, v9
-; GCN-NEXT: v_cndmask_b32_e64 v5, v11, v5, s[6:7]
-; GCN-NEXT: v_cndmask_b32_e64 v3, v3, v8, s[6:7]
+; GCN-NEXT: v_cndmask_b32_e64 v5, v11, v5, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v3, v3, v8, s[4:5]
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
-; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v7, s[6:7]
-; GCN-NEXT: v_cndmask_b32_e64 v6, v11, v6, s[6:7]
+; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v7, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v6, v11, v6, s[4:5]
; GCN-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; GCN-NEXT: v_mov_b32_e32 v2, v5
; GCN-NEXT: v_lshl_b64 v[4:5], 17, v3
; GCN-NEXT: v_cmp_gt_u32_e32 vcc, 64, v0
; GCN-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
-; GCN-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v0
-; GCN-NEXT: v_cndmask_b32_e64 v3, 0, v2, s[6:7]
+; GCN-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0
+; GCN-NEXT: v_cndmask_b32_e64 v3, 0, v2, s[4:5]
; GCN-NEXT: v_cndmask_b32_e32 v2, v4, v1, vcc
; GCN-NEXT: v_lshl_b64 v[0:1], 17, v0
-; GCN-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7]
+; GCN-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[4:5]
; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
; GCN-LABEL: v_lshr_i128_kv:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s7, 0
-; GCN-NEXT: s_movk_i32 s6, 0x41
-; GCN-NEXT: v_lshr_b64 v[2:3], s[6:7], v0
+; GCN-NEXT: s_mov_b32 s5, 0
+; GCN-NEXT: s_movk_i32 s4, 0x41
+; GCN-NEXT: v_lshr_b64 v[2:3], s[4:5], v0
; GCN-NEXT: v_cmp_gt_u32_e32 vcc, 64, v0
-; GCN-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v0
+; GCN-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0
; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc
; GCN-NEXT: v_mov_b32_e32 v2, 0x41
; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc
-; GCN-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[6:7]
+; GCN-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v2, 0
-; GCN-NEXT: v_cndmask_b32_e64 v1, 0, v1, s[6:7]
+; GCN-NEXT: v_cndmask_b32_e64 v1, 0, v1, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v3, 0
; GCN-NEXT: s_setpc_b64 s[30:31]
%shl = lshr i128 65, %rhs
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_lshr_b64 v[2:3], 33, v0
; GCN-NEXT: v_cmp_gt_u32_e32 vcc, 64, v0
-; GCN-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v0
+; GCN-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0
; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc
; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc
; GCN-NEXT: v_mov_b32_e32 v2, 0
-; GCN-NEXT: v_cndmask_b32_e64 v1, 0, v1, s[6:7]
-; GCN-NEXT: v_cndmask_b32_e64 v0, 33, v0, s[6:7]
+; GCN-NEXT: v_cndmask_b32_e64 v1, 0, v1, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v0, 33, v0, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v3, 0
; GCN-NEXT: s_setpc_b64 s[30:31]
%shl = ashr i128 33, %rhs
; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT: s_endpgm
-; GCN-NEXT: .section .rodata,#alloc
-; GCN-NEXT: .p2align 6
-; GCN-NEXT: .amdhsa_kernel s_shl_i128_ss
-; GCN-NEXT: .amdhsa_group_segment_fixed_size 0
-; GCN-NEXT: .amdhsa_private_segment_fixed_size 0
-; GCN-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1
-; GCN-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0
-; GCN-NEXT: .amdhsa_user_sgpr_queue_ptr 0
-; GCN-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 1
-; GCN-NEXT: .amdhsa_user_sgpr_dispatch_id 0
-; GCN-NEXT: .amdhsa_user_sgpr_flat_scratch_init 0
-; GCN-NEXT: .amdhsa_user_sgpr_private_segment_size 0
-; GCN-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 0
-; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1
-; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0
-; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0
-; GCN-NEXT: .amdhsa_system_sgpr_workgroup_info 0
-; GCN-NEXT: .amdhsa_system_vgpr_workitem_id 0
-; GCN-NEXT: .amdhsa_next_free_vgpr 8
-; GCN-NEXT: .amdhsa_next_free_sgpr 12
-; GCN-NEXT: .amdhsa_reserve_flat_scratch 0
-; GCN-NEXT: .amdhsa_float_round_mode_32 0
-; GCN-NEXT: .amdhsa_float_round_mode_16_64 0
-; GCN-NEXT: .amdhsa_float_denorm_mode_32 0
-; GCN-NEXT: .amdhsa_float_denorm_mode_16_64 3
-; GCN-NEXT: .amdhsa_dx10_clamp 1
-; GCN-NEXT: .amdhsa_ieee_mode 1
-; GCN-NEXT: .amdhsa_exception_fp_ieee_invalid_op 0
-; GCN-NEXT: .amdhsa_exception_fp_denorm_src 0
-; GCN-NEXT: .amdhsa_exception_fp_ieee_div_zero 0
-; GCN-NEXT: .amdhsa_exception_fp_ieee_overflow 0
-; GCN-NEXT: .amdhsa_exception_fp_ieee_underflow 0
-; GCN-NEXT: .amdhsa_exception_fp_ieee_inexact 0
-; GCN-NEXT: .amdhsa_exception_int_div_zero 0
-; GCN-NEXT: .end_amdhsa_kernel
-; GCN-NEXT: .text
%shift = shl i128 %lhs, %rhs
store i128 %shift, i128 addrspace(1)* null
ret void
; GCN-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT: s_endpgm
-; GCN-NEXT: .section .rodata,#alloc
-; GCN-NEXT: .p2align 6
-; GCN-NEXT: .amdhsa_kernel s_lshr_i128_ss
-; GCN-NEXT: .amdhsa_group_segment_fixed_size 0
-; GCN-NEXT: .amdhsa_private_segment_fixed_size 0
-; GCN-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1
-; GCN-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0
-; GCN-NEXT: .amdhsa_user_sgpr_queue_ptr 0
-; GCN-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 1
-; GCN-NEXT: .amdhsa_user_sgpr_dispatch_id 0
-; GCN-NEXT: .amdhsa_user_sgpr_flat_scratch_init 0
-; GCN-NEXT: .amdhsa_user_sgpr_private_segment_size 0
-; GCN-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 0
-; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1
-; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0
-; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0
-; GCN-NEXT: .amdhsa_system_sgpr_workgroup_info 0
-; GCN-NEXT: .amdhsa_system_vgpr_workitem_id 0
-; GCN-NEXT: .amdhsa_next_free_vgpr 8
-; GCN-NEXT: .amdhsa_next_free_sgpr 12
-; GCN-NEXT: .amdhsa_reserve_flat_scratch 0
-; GCN-NEXT: .amdhsa_float_round_mode_32 0
-; GCN-NEXT: .amdhsa_float_round_mode_16_64 0
-; GCN-NEXT: .amdhsa_float_denorm_mode_32 0
-; GCN-NEXT: .amdhsa_float_denorm_mode_16_64 3
-; GCN-NEXT: .amdhsa_dx10_clamp 1
-; GCN-NEXT: .amdhsa_ieee_mode 1
-; GCN-NEXT: .amdhsa_exception_fp_ieee_invalid_op 0
-; GCN-NEXT: .amdhsa_exception_fp_denorm_src 0
-; GCN-NEXT: .amdhsa_exception_fp_ieee_div_zero 0
-; GCN-NEXT: .amdhsa_exception_fp_ieee_overflow 0
-; GCN-NEXT: .amdhsa_exception_fp_ieee_underflow 0
-; GCN-NEXT: .amdhsa_exception_fp_ieee_inexact 0
-; GCN-NEXT: .amdhsa_exception_int_div_zero 0
-; GCN-NEXT: .end_amdhsa_kernel
-; GCN-NEXT: .text
%shift = lshr i128 %lhs, %rhs
store i128 %shift, i128 addrspace(1)* null
ret void
; GCN-NEXT: v_mov_b32_e32 v5, 0
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT: s_endpgm
-; GCN-NEXT: .section .rodata,#alloc
-; GCN-NEXT: .p2align 6
-; GCN-NEXT: .amdhsa_kernel s_ashr_i128_ss
-; GCN-NEXT: .amdhsa_group_segment_fixed_size 0
-; GCN-NEXT: .amdhsa_private_segment_fixed_size 0
-; GCN-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1
-; GCN-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0
-; GCN-NEXT: .amdhsa_user_sgpr_queue_ptr 0
-; GCN-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 1
-; GCN-NEXT: .amdhsa_user_sgpr_dispatch_id 0
-; GCN-NEXT: .amdhsa_user_sgpr_flat_scratch_init 0
-; GCN-NEXT: .amdhsa_user_sgpr_private_segment_size 0
-; GCN-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 0
-; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1
-; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0
-; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0
-; GCN-NEXT: .amdhsa_system_sgpr_workgroup_info 0
-; GCN-NEXT: .amdhsa_system_vgpr_workitem_id 0
-; GCN-NEXT: .amdhsa_next_free_vgpr 8
-; GCN-NEXT: .amdhsa_next_free_sgpr 12
-; GCN-NEXT: .amdhsa_reserve_flat_scratch 0
-; GCN-NEXT: .amdhsa_float_round_mode_32 0
-; GCN-NEXT: .amdhsa_float_round_mode_16_64 0
-; GCN-NEXT: .amdhsa_float_denorm_mode_32 0
-; GCN-NEXT: .amdhsa_float_denorm_mode_16_64 3
-; GCN-NEXT: .amdhsa_dx10_clamp 1
-; GCN-NEXT: .amdhsa_ieee_mode 1
-; GCN-NEXT: .amdhsa_exception_fp_ieee_invalid_op 0
-; GCN-NEXT: .amdhsa_exception_fp_denorm_src 0
-; GCN-NEXT: .amdhsa_exception_fp_ieee_div_zero 0
-; GCN-NEXT: .amdhsa_exception_fp_ieee_overflow 0
-; GCN-NEXT: .amdhsa_exception_fp_ieee_underflow 0
-; GCN-NEXT: .amdhsa_exception_fp_ieee_inexact 0
-; GCN-NEXT: .amdhsa_exception_int_div_zero 0
-; GCN-NEXT: .end_amdhsa_kernel
-; GCN-NEXT: .text
%shift = ashr i128 %lhs, %rhs
store i128 %shift, i128 addrspace(1)* null
ret void
; GCN-NEXT: v_sub_i32_e32 v18, vcc, 64, v8
; GCN-NEXT: v_lshl_b64 v[16:17], v[2:3], v8
; GCN-NEXT: v_lshr_b64 v[18:19], v[0:1], v18
-; GCN-NEXT: v_cmp_eq_u64_e64 s[8:9], 0, v[10:11]
-; GCN-NEXT: v_cmp_gt_u64_e64 s[6:7], 64, v[8:9]
+; GCN-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[10:11]
+; GCN-NEXT: v_cmp_gt_u64_e64 s[4:5], 64, v[8:9]
; GCN-NEXT: v_or_b32_e32 v11, v9, v11
; GCN-NEXT: v_subrev_i32_e32 v9, vcc, 64, v8
; GCN-NEXT: v_or_b32_e32 v10, v8, v10
; GCN-NEXT: v_or_b32_e32 v19, v17, v19
; GCN-NEXT: v_or_b32_e32 v18, v16, v18
; GCN-NEXT: v_lshl_b64 v[16:17], v[0:1], v9
-; GCN-NEXT: s_and_b64 s[6:7], s[8:9], s[6:7]
+; GCN-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5]
; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11]
-; GCN-NEXT: v_cndmask_b32_e64 v9, v17, v19, s[6:7]
+; GCN-NEXT: v_cndmask_b32_e64 v9, v17, v19, s[4:5]
; GCN-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc
-; GCN-NEXT: v_cndmask_b32_e64 v9, v16, v18, s[6:7]
+; GCN-NEXT: v_cndmask_b32_e64 v9, v16, v18, s[4:5]
; GCN-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc
; GCN-NEXT: v_sub_i32_e32 v11, vcc, 64, v12
; GCN-NEXT: v_lshl_b64 v[9:10], v[6:7], v12
; GCN-NEXT: v_lshr_b64 v[16:17], v[4:5], v11
-; GCN-NEXT: v_cmp_gt_u64_e64 s[8:9], 64, v[12:13]
+; GCN-NEXT: v_cmp_gt_u64_e64 s[6:7], 64, v[12:13]
; GCN-NEXT: v_or_b32_e32 v16, v9, v16
-; GCN-NEXT: v_cmp_eq_u64_e64 s[10:11], 0, v[14:15]
+; GCN-NEXT: v_cmp_eq_u64_e64 s[8:9], 0, v[14:15]
; GCN-NEXT: v_subrev_i32_e32 v9, vcc, 64, v12
; GCN-NEXT: v_or_b32_e32 v11, v10, v17
; GCN-NEXT: v_lshl_b64 v[9:10], v[4:5], v9
-; GCN-NEXT: s_and_b64 vcc, s[10:11], s[8:9]
+; GCN-NEXT: s_and_b64 vcc, s[8:9], s[6:7]
; GCN-NEXT: v_cndmask_b32_e32 v17, v10, v11, vcc
; GCN-NEXT: v_or_b32_e32 v11, v13, v15
; GCN-NEXT: v_or_b32_e32 v10, v12, v14
; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], v8
; GCN-NEXT: v_lshl_b64 v[4:5], v[4:5], v12
-; GCN-NEXT: v_cmp_eq_u64_e64 s[8:9], 0, v[10:11]
+; GCN-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[10:11]
; GCN-NEXT: v_cndmask_b32_e32 v9, v9, v16, vcc
-; GCN-NEXT: v_cndmask_b32_e64 v7, v17, v7, s[8:9]
-; GCN-NEXT: v_cndmask_b32_e64 v6, v9, v6, s[8:9]
-; GCN-NEXT: v_cndmask_b32_e64 v1, 0, v1, s[6:7]
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, v0, s[6:7]
+; GCN-NEXT: v_cndmask_b32_e64 v7, v17, v7, s[6:7]
+; GCN-NEXT: v_cndmask_b32_e64 v6, v9, v6, s[6:7]
+; GCN-NEXT: v_cndmask_b32_e64 v1, 0, v1, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, v0, s[4:5]
; GCN-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc
; GCN-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
; GCN-NEXT: v_sub_i32_e32 v18, vcc, 64, v8
; GCN-NEXT: v_lshr_b64 v[16:17], v[0:1], v8
; GCN-NEXT: v_lshl_b64 v[18:19], v[2:3], v18
-; GCN-NEXT: v_cmp_eq_u64_e64 s[8:9], 0, v[10:11]
-; GCN-NEXT: v_cmp_gt_u64_e64 s[6:7], 64, v[8:9]
+; GCN-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[10:11]
+; GCN-NEXT: v_cmp_gt_u64_e64 s[4:5], 64, v[8:9]
; GCN-NEXT: v_or_b32_e32 v11, v9, v11
; GCN-NEXT: v_subrev_i32_e32 v9, vcc, 64, v8
; GCN-NEXT: v_or_b32_e32 v10, v8, v10
; GCN-NEXT: v_or_b32_e32 v19, v17, v19
; GCN-NEXT: v_or_b32_e32 v18, v16, v18
; GCN-NEXT: v_lshr_b64 v[16:17], v[2:3], v9
-; GCN-NEXT: s_and_b64 s[6:7], s[8:9], s[6:7]
+; GCN-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5]
; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11]
-; GCN-NEXT: v_cndmask_b32_e64 v9, v17, v19, s[6:7]
+; GCN-NEXT: v_cndmask_b32_e64 v9, v17, v19, s[4:5]
; GCN-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc
-; GCN-NEXT: v_cndmask_b32_e64 v9, v16, v18, s[6:7]
+; GCN-NEXT: v_cndmask_b32_e64 v9, v16, v18, s[4:5]
; GCN-NEXT: v_cndmask_b32_e32 v0, v9, v0, vcc
; GCN-NEXT: v_sub_i32_e32 v11, vcc, 64, v12
; GCN-NEXT: v_lshr_b64 v[9:10], v[4:5], v12
; GCN-NEXT: v_lshl_b64 v[16:17], v[6:7], v11
-; GCN-NEXT: v_cmp_gt_u64_e64 s[8:9], 64, v[12:13]
+; GCN-NEXT: v_cmp_gt_u64_e64 s[6:7], 64, v[12:13]
; GCN-NEXT: v_or_b32_e32 v16, v9, v16
-; GCN-NEXT: v_cmp_eq_u64_e64 s[10:11], 0, v[14:15]
+; GCN-NEXT: v_cmp_eq_u64_e64 s[8:9], 0, v[14:15]
; GCN-NEXT: v_subrev_i32_e32 v9, vcc, 64, v12
; GCN-NEXT: v_or_b32_e32 v11, v10, v17
; GCN-NEXT: v_lshr_b64 v[9:10], v[6:7], v9
-; GCN-NEXT: s_and_b64 vcc, s[10:11], s[8:9]
+; GCN-NEXT: s_and_b64 vcc, s[8:9], s[6:7]
; GCN-NEXT: v_cndmask_b32_e32 v17, v10, v11, vcc
; GCN-NEXT: v_or_b32_e32 v11, v13, v15
; GCN-NEXT: v_or_b32_e32 v10, v12, v14
; GCN-NEXT: v_lshr_b64 v[2:3], v[2:3], v8
; GCN-NEXT: v_lshr_b64 v[6:7], v[6:7], v12
-; GCN-NEXT: v_cmp_eq_u64_e64 s[8:9], 0, v[10:11]
+; GCN-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[10:11]
; GCN-NEXT: v_cndmask_b32_e32 v9, v9, v16, vcc
-; GCN-NEXT: v_cndmask_b32_e64 v5, v17, v5, s[8:9]
-; GCN-NEXT: v_cndmask_b32_e64 v4, v9, v4, s[8:9]
-; GCN-NEXT: v_cndmask_b32_e64 v3, 0, v3, s[6:7]
-; GCN-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7]
+; GCN-NEXT: v_cndmask_b32_e64 v5, v17, v5, s[6:7]
+; GCN-NEXT: v_cndmask_b32_e64 v4, v9, v4, s[6:7]
+; GCN-NEXT: v_cndmask_b32_e64 v3, 0, v3, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[4:5]
; GCN-NEXT: v_cndmask_b32_e32 v7, 0, v7, vcc
; GCN-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
; GCN-NEXT: v_sub_i32_e32 v18, vcc, 64, v8
; GCN-NEXT: v_lshr_b64 v[16:17], v[0:1], v8
; GCN-NEXT: v_lshl_b64 v[18:19], v[2:3], v18
-; GCN-NEXT: v_cmp_eq_u64_e64 s[8:9], 0, v[10:11]
-; GCN-NEXT: v_cmp_gt_u64_e64 s[6:7], 64, v[8:9]
+; GCN-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[10:11]
+; GCN-NEXT: v_cmp_gt_u64_e64 s[4:5], 64, v[8:9]
; GCN-NEXT: v_or_b32_e32 v11, v9, v11
; GCN-NEXT: v_subrev_i32_e32 v9, vcc, 64, v8
; GCN-NEXT: v_or_b32_e32 v10, v8, v10
; GCN-NEXT: v_or_b32_e32 v19, v17, v19
; GCN-NEXT: v_or_b32_e32 v18, v16, v18
; GCN-NEXT: v_ashr_i64 v[16:17], v[2:3], v9
-; GCN-NEXT: s_and_b64 s[6:7], s[8:9], s[6:7]
+; GCN-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5]
; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11]
-; GCN-NEXT: v_cndmask_b32_e64 v9, v17, v19, s[6:7]
+; GCN-NEXT: v_cndmask_b32_e64 v9, v17, v19, s[4:5]
; GCN-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc
-; GCN-NEXT: v_cndmask_b32_e64 v9, v16, v18, s[6:7]
+; GCN-NEXT: v_cndmask_b32_e64 v9, v16, v18, s[4:5]
; GCN-NEXT: v_cndmask_b32_e32 v0, v9, v0, vcc
; GCN-NEXT: v_sub_i32_e32 v11, vcc, 64, v12
; GCN-NEXT: v_lshr_b64 v[9:10], v[4:5], v12
; GCN-NEXT: v_lshl_b64 v[16:17], v[6:7], v11
-; GCN-NEXT: v_cmp_gt_u64_e64 s[8:9], 64, v[12:13]
+; GCN-NEXT: v_cmp_gt_u64_e64 s[6:7], 64, v[12:13]
; GCN-NEXT: v_or_b32_e32 v16, v9, v16
-; GCN-NEXT: v_cmp_eq_u64_e64 s[10:11], 0, v[14:15]
+; GCN-NEXT: v_cmp_eq_u64_e64 s[8:9], 0, v[14:15]
; GCN-NEXT: v_subrev_i32_e32 v9, vcc, 64, v12
; GCN-NEXT: v_or_b32_e32 v11, v10, v17
; GCN-NEXT: v_ashr_i64 v[9:10], v[6:7], v9
-; GCN-NEXT: s_and_b64 vcc, s[10:11], s[8:9]
+; GCN-NEXT: s_and_b64 vcc, s[8:9], s[6:7]
; GCN-NEXT: v_cndmask_b32_e32 v17, v10, v11, vcc
; GCN-NEXT: v_or_b32_e32 v11, v13, v15
; GCN-NEXT: v_or_b32_e32 v10, v12, v14
-; GCN-NEXT: v_cmp_eq_u64_e64 s[8:9], 0, v[10:11]
+; GCN-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[10:11]
; GCN-NEXT: v_cndmask_b32_e32 v9, v9, v16, vcc
-; GCN-NEXT: v_cndmask_b32_e64 v4, v9, v4, s[8:9]
+; GCN-NEXT: v_cndmask_b32_e64 v4, v9, v4, s[6:7]
; GCN-NEXT: v_ashr_i64 v[8:9], v[2:3], v8
; GCN-NEXT: v_ashrrev_i32_e32 v2, 31, v3
-; GCN-NEXT: v_cndmask_b32_e64 v3, v2, v9, s[6:7]
-; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[6:7]
+; GCN-NEXT: v_cndmask_b32_e64 v3, v2, v9, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[4:5]
; GCN-NEXT: v_ashr_i64 v[8:9], v[6:7], v12
; GCN-NEXT: v_ashrrev_i32_e32 v6, 31, v7
; GCN-NEXT: v_cndmask_b32_e32 v7, v6, v9, vcc
-; GCN-NEXT: v_cndmask_b32_e64 v5, v17, v5, s[8:9]
+; GCN-NEXT: v_cndmask_b32_e64 v5, v17, v5, s[6:7]
; GCN-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
%shl = ashr <2 x i128> %lhs, %rhs
; GCN-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
; GCN-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
; GCN-NEXT: s_endpgm
-; GCN-NEXT: .section .rodata,#alloc
-; GCN-NEXT: .p2align 6
-; GCN-NEXT: .amdhsa_kernel s_shl_v2i128ss
-; GCN-NEXT: .amdhsa_group_segment_fixed_size 0
-; GCN-NEXT: .amdhsa_private_segment_fixed_size 0
-; GCN-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1
-; GCN-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0
-; GCN-NEXT: .amdhsa_user_sgpr_queue_ptr 0
-; GCN-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 1
-; GCN-NEXT: .amdhsa_user_sgpr_dispatch_id 0
-; GCN-NEXT: .amdhsa_user_sgpr_flat_scratch_init 0
-; GCN-NEXT: .amdhsa_user_sgpr_private_segment_size 0
-; GCN-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 0
-; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1
-; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0
-; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0
-; GCN-NEXT: .amdhsa_system_sgpr_workgroup_info 0
-; GCN-NEXT: .amdhsa_system_vgpr_workitem_id 0
-; GCN-NEXT: .amdhsa_next_free_vgpr 16
-; GCN-NEXT: .amdhsa_next_free_sgpr 22
-; GCN-NEXT: .amdhsa_reserve_flat_scratch 0
-; GCN-NEXT: .amdhsa_float_round_mode_32 0
-; GCN-NEXT: .amdhsa_float_round_mode_16_64 0
-; GCN-NEXT: .amdhsa_float_denorm_mode_32 0
-; GCN-NEXT: .amdhsa_float_denorm_mode_16_64 3
-; GCN-NEXT: .amdhsa_dx10_clamp 1
-; GCN-NEXT: .amdhsa_ieee_mode 1
-; GCN-NEXT: .amdhsa_exception_fp_ieee_invalid_op 0
-; GCN-NEXT: .amdhsa_exception_fp_denorm_src 0
-; GCN-NEXT: .amdhsa_exception_fp_ieee_div_zero 0
-; GCN-NEXT: .amdhsa_exception_fp_ieee_overflow 0
-; GCN-NEXT: .amdhsa_exception_fp_ieee_underflow 0
-; GCN-NEXT: .amdhsa_exception_fp_ieee_inexact 0
-; GCN-NEXT: .amdhsa_exception_int_div_zero 0
-; GCN-NEXT: .end_amdhsa_kernel
-; GCN-NEXT: .text
%shift = shl <2 x i128> %lhs, %rhs
store <2 x i128> %shift, <2 x i128> addrspace(1)* null
ret void
; GCN-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
; GCN-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
; GCN-NEXT: s_endpgm
-; GCN-NEXT: .section .rodata,#alloc
-; GCN-NEXT: .p2align 6
-; GCN-NEXT: .amdhsa_kernel s_lshr_v2i128_ss
-; GCN-NEXT: .amdhsa_group_segment_fixed_size 0
-; GCN-NEXT: .amdhsa_private_segment_fixed_size 0
-; GCN-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1
-; GCN-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0
-; GCN-NEXT: .amdhsa_user_sgpr_queue_ptr 0
-; GCN-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 1
-; GCN-NEXT: .amdhsa_user_sgpr_dispatch_id 0
-; GCN-NEXT: .amdhsa_user_sgpr_flat_scratch_init 0
-; GCN-NEXT: .amdhsa_user_sgpr_private_segment_size 0
-; GCN-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 0
-; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1
-; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0
-; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0
-; GCN-NEXT: .amdhsa_system_sgpr_workgroup_info 0
-; GCN-NEXT: .amdhsa_system_vgpr_workitem_id 0
-; GCN-NEXT: .amdhsa_next_free_vgpr 16
-; GCN-NEXT: .amdhsa_next_free_sgpr 22
-; GCN-NEXT: .amdhsa_reserve_flat_scratch 0
-; GCN-NEXT: .amdhsa_float_round_mode_32 0
-; GCN-NEXT: .amdhsa_float_round_mode_16_64 0
-; GCN-NEXT: .amdhsa_float_denorm_mode_32 0
-; GCN-NEXT: .amdhsa_float_denorm_mode_16_64 3
-; GCN-NEXT: .amdhsa_dx10_clamp 1
-; GCN-NEXT: .amdhsa_ieee_mode 1
-; GCN-NEXT: .amdhsa_exception_fp_ieee_invalid_op 0
-; GCN-NEXT: .amdhsa_exception_fp_denorm_src 0
-; GCN-NEXT: .amdhsa_exception_fp_ieee_div_zero 0
-; GCN-NEXT: .amdhsa_exception_fp_ieee_overflow 0
-; GCN-NEXT: .amdhsa_exception_fp_ieee_underflow 0
-; GCN-NEXT: .amdhsa_exception_fp_ieee_inexact 0
-; GCN-NEXT: .amdhsa_exception_int_div_zero 0
-; GCN-NEXT: .end_amdhsa_kernel
-; GCN-NEXT: .text
%shift = lshr <2 x i128> %lhs, %rhs
store <2 x i128> %shift, <2 x i128> addrspace(1)* null
ret void
; GCN-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
; GCN-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
; GCN-NEXT: s_endpgm
-; GCN-NEXT: .section .rodata,#alloc
-; GCN-NEXT: .p2align 6
-; GCN-NEXT: .amdhsa_kernel s_ashr_v2i128_ss
-; GCN-NEXT: .amdhsa_group_segment_fixed_size 0
-; GCN-NEXT: .amdhsa_private_segment_fixed_size 0
-; GCN-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1
-; GCN-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0
-; GCN-NEXT: .amdhsa_user_sgpr_queue_ptr 0
-; GCN-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 1
-; GCN-NEXT: .amdhsa_user_sgpr_dispatch_id 0
-; GCN-NEXT: .amdhsa_user_sgpr_flat_scratch_init 0
-; GCN-NEXT: .amdhsa_user_sgpr_private_segment_size 0
-; GCN-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 0
-; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1
-; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0
-; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0
-; GCN-NEXT: .amdhsa_system_sgpr_workgroup_info 0
-; GCN-NEXT: .amdhsa_system_vgpr_workitem_id 0
-; GCN-NEXT: .amdhsa_next_free_vgpr 16
-; GCN-NEXT: .amdhsa_next_free_sgpr 23
-; GCN-NEXT: .amdhsa_reserve_flat_scratch 0
-; GCN-NEXT: .amdhsa_float_round_mode_32 0
-; GCN-NEXT: .amdhsa_float_round_mode_16_64 0
-; GCN-NEXT: .amdhsa_float_denorm_mode_32 0
-; GCN-NEXT: .amdhsa_float_denorm_mode_16_64 3
-; GCN-NEXT: .amdhsa_dx10_clamp 1
-; GCN-NEXT: .amdhsa_ieee_mode 1
-; GCN-NEXT: .amdhsa_exception_fp_ieee_invalid_op 0
-; GCN-NEXT: .amdhsa_exception_fp_denorm_src 0
-; GCN-NEXT: .amdhsa_exception_fp_ieee_div_zero 0
-; GCN-NEXT: .amdhsa_exception_fp_ieee_overflow 0
-; GCN-NEXT: .amdhsa_exception_fp_ieee_underflow 0
-; GCN-NEXT: .amdhsa_exception_fp_ieee_inexact 0
-; GCN-NEXT: .amdhsa_exception_int_div_zero 0
-; GCN-NEXT: .end_amdhsa_kernel
-; GCN-NEXT: .text
%shift = ashr <2 x i128> %lhs, %rhs
store <2 x i128> %shift, <2 x i128> addrspace(1)* null
ret void
; Have another non-tail call in the function
; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_other_call:
-; GCN: s_mov_b32 s5, s32
-; GCN: s_add_u32 s32, s32, 0x400
-
; GCN: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1
-; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s5 offset:8
+; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec
-; GCN-DAG: s_getpc_b64
+; GCN: s_mov_b32 s34, s32
+; GCN-DAG: s_add_u32 s32, s32, 0x400
+
+; GCN-DAG: buffer_store_dword v32, off, s[0:3], s34 offset:4 ; 4-byte Folded Spill
+; GCN-DAG: buffer_store_dword v33, off, s[0:3], s34 ; 4-byte Folded Spill
+; GCN-DAG: v_writelane_b32 v34, s36, 0
+; GCN-DAG: v_writelane_b32 v34, s37, 1
+
+; GCN-DAG: s_getpc_b64 s[4:5]
+; GCN-DAG: s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4
+; GCN-DAG: s_addc_u32 s5, s5, i32_fastcc_i32_i32@gotpcrel32@hi+4
-; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill
-; GCN: buffer_store_dword v33, off, s[0:3], s5 ; 4-byte Folded Spill
-; GCN-DAG: v_writelane_b32 v34, s34, 0
-; GCN-DAG: v_writelane_b32 v34, s35, 1
; GCN: s_swappc_b64
-; GCN-DAG: v_readlane_b32 s34, v34, 0
-; GCN-DAG: v_readlane_b32 s35, v34, 1
+; GCN-DAG: v_readlane_b32 s36, v34, 0
+; GCN-DAG: v_readlane_b32 s37, v34, 1
-; GCN: buffer_load_dword v33, off, s[0:3], s5 ; 4-byte Folded Reload
-; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Reload
-; GCN: s_getpc_b64 s[6:7]
-; GCN: s_add_u32 s6, s6, sibling_call_i32_fastcc_i32_i32@rel32@lo+4
-; GCN: s_addc_u32 s7, s7, sibling_call_i32_fastcc_i32_i32@rel32@hi+4
-; GCN: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1
-; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s5 offset:8
-; GCN-NEXT: s_mov_b64 exec
+; GCN: buffer_load_dword v33, off, s[0:3], s34 ; 4-byte Folded Reload
+; GCN: buffer_load_dword v32, off, s[0:3], s34 offset:4 ; 4-byte Folded Reload
+
+; GCN: s_getpc_b64 s[4:5]
+; GCN-NEXT: s_add_u32 s4, s4, sibling_call_i32_fastcc_i32_i32@rel32@lo+4
+; GCN-NEXT: s_addc_u32 s5, s5, sibling_call_i32_fastcc_i32_i32@rel32@hi+4
; GCN: s_sub_u32 s32, s32, 0x400
-; GCN: s_setpc_b64 s[6:7]
+; GCN-NEXT: v_readlane_b32 s34,
+; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
+; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GCN-NEXT: s_mov_b64 exec, s[6:7]
+; GCN-NEXT: s_setpc_b64 s[4:5]
define fastcc i32 @sibling_call_i32_fastcc_i32_i32_other_call(i32 %a, i32 %b, i32 %c) #1 {
entry:
%other.call = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b)
; GCN-NOT: s33
; GCN: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset:
-; GCN: s_setpc_b64 s[6:7]
+; GCN: s_setpc_b64 s[4:5]
define fastcc i32 @sibling_call_stack_objecti32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c) #1 {
entry:
%alloca = alloca [16 x i32], align 4, addrspace(5)
; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 offset:44
; GCN-NOT: s33
-; GCN: s_setpc_b64 s[6:7]
+; GCN: s_setpc_b64 s[4:5]
define fastcc i32 @sibling_call_stack_objecti32_fastcc_i32_i32_a32i32_larger_arg_area(i32 %a, i32 %b, [36 x i32] %c) #1 {
entry:
%alloca = alloca [16 x i32], align 4, addrspace(5)
; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -stress-regalloc=1 < %s | FileCheck -check-prefix=GCN %s
-; For the CSR copy of s5, it may be possible to see it in
-; storeRegToStackSlot.
-
; GCN-LABEL: {{^}}spill_csr_s5_copy:
-; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill
-; GCN: v_writelane_b32 v32, s5, 2
+; GCN: s_or_saveexec_b64
+; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT: s_mov_b64 exec
+; GCN: v_writelane_b32 v32, s34, 2
; GCN: s_swappc_b64
-; GCN: v_readlane_b32 s5, v32, 2
+
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 9
-; GCN: buffer_store_dword [[K]], off, s[0:3], s5{{$}}
-; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Reload
+; GCN: buffer_store_dword [[K]], off, s[0:3], s34{{$}}
+
+; GCN: v_readlane_b32 s34, v32, 2
+; GCN: s_or_saveexec_b64
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GCN: s_mov_b64 exec
; GCN: s_setpc_b64
define void @spill_csr_s5_copy() #0 {
bb:
}
attributes #0 = { nounwind }
-attributes #1 = { nounwind "amdgpu-num-sgpr"="18" "amdgpu-num-vgpr"="8" }
-attributes #2 = { nounwind "amdgpu-num-sgpr"="15" "amdgpu-num-vgpr"="8" }
+attributes #1 = { nounwind "amdgpu-num-sgpr"="17" "amdgpu-num-vgpr"="8" }
+attributes #2 = { nounwind "amdgpu-num-sgpr"="14" "amdgpu-num-vgpr"="8" }
; GCN-LABEL: {{^}}needs_align16_stack_align4:
; GCN: s_add_u32 [[SCRATCH_REG:s[0-9]+]], s32, 0x3c0{{$}}
-; GCN: s_and_b32 s5, [[SCRATCH_REG]], 0xfffffc00
+; GCN: s_and_b32 s34, [[SCRATCH_REG]], 0xfffffc00
; GCN: s_add_u32 s32, s32, 0x2800{{$}}
; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s33 offen
; GCN-LABEL: {{^}}needs_align32:
; GCN: s_add_u32 [[SCRATCH_REG:s[0-9]+]], s32, 0x7c0{{$}}
-; GCN: s_and_b32 s5, [[SCRATCH_REG]], 0xfffff800
+; GCN: s_and_b32 s34, [[SCRATCH_REG]], 0xfffff800
; GCN: s_add_u32 s32, s32, 0x3000{{$}}
; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s33 offen
; GCN-LABEL: {{^}}force_realign4:
; GCN: s_add_u32 [[SCRATCH_REG:s[0-9]+]], s32, 0xc0{{$}}
-; GCN: s_and_b32 s5, [[SCRATCH_REG]], 0xffffff00
+; GCN: s_and_b32 s34, [[SCRATCH_REG]], 0xffffff00
; GCN: s_add_u32 s32, s32, 0xd00{{$}}
; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s33 offen
; GCN-LABEL: {{^}}default_realign_align128:
; GCN: s_add_u32 [[TMP:s[0-9]+]], s32, 0x1fc0
-; GCN-NEXT: s_and_b32 s5, [[TMP]], 0xffffe000
+; GCN-NEXT: s_mov_b32 [[FP_COPY:s[0-9]+]], s34
+; GCN-NEXT: s_and_b32 s34, [[TMP]], 0xffffe000
; GCN-NEXT: s_add_u32 s32, s32, 0x4000
-; GCN-NOT: s5
-; GCN: buffer_store_dword v0, off, s[0:3], s5{{$}}
+; GCN-NOT: s34
+; GCN: buffer_store_dword v0, off, s[0:3], s34{{$}}
; GCN: s_sub_u32 s32, s32, 0x4000
+; GCN: s_mov_b32 s34, [[FP_COPY]]
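+; When the stack is over-aligned, the incoming s34 is copied aside, the aligned
+; frame pointer is formed in s34, and the saved copy is moved back into s34 in
+; the epilogue.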
define void @default_realign_align128(i32 %idx) #0 {
%alloca.align = alloca i32, align 128, addrspace(5)
store volatile i32 9, i32 addrspace(5)* %alloca.align, align 128
; GCN-LABEL: {{^}}callee_no_stack_with_call:
; GCN: s_waitcnt
-; GCN: s_mov_b32 s5, s32
-; GFX1064: s_add_u32 s32, s32, 0x400
-; GFX1032: s_add_u32 s32, s32, 0x200
-
-; GFX1064: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}}
-; GFX1032: s_or_saveexec_b32 [[COPY_EXEC0:s[0-9]]], -1{{$}}
-
-; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt_vscnt
+; GFX1064-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}}
+; GFX1032-NEXT: s_or_saveexec_b32 [[COPY_EXEC0:s[0-9]]], -1{{$}}
+; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX1064-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
; GFX1032-NEXT: s_mov_b32 exec_lo, [[COPY_EXEC0]]
-; GCN-DAG: v_writelane_b32 v32, s34, 0
-; GCN-DAG: v_writelane_b32 v32, s35, 1
-; GCN-DAG: s_mov_b32 [[COPY_FP:s[0-9]+]], s5
+; GCN-NEXT: v_writelane_b32 v32, s34, 2
+; GCN: s_mov_b32 s34, s32
+; GFX1064: s_add_u32 s32, s32, 0x400
+; GFX1032: s_add_u32 s32, s32, 0x200
+
+; GCN-DAG: v_writelane_b32 v32, s36, 0
+; GCN-DAG: v_writelane_b32 v32, s37, 1
; GCN: s_swappc_b64
-; GCN-DAG: s_mov_b32 s5, [[COPY_FP]]
-; GCN-DAG: v_readlane_b32 s35, v32, 1
-; GCN-DAG: v_readlane_b32 s34, v32, 0
+; GCN-DAG: v_readlane_b32 s36, v32, 0
+; GCN-DAG: v_readlane_b32 s37, v32, 1
+
+; GFX1064: s_sub_u32 s32, s32, 0x400
+; GFX1032: s_sub_u32 s32, s32, 0x200
+; GCN: v_readlane_b32 s34, v32, 2
; GFX1064: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
; GFX1032: s_or_saveexec_b32 [[COPY_EXEC1:s[0-9]]], -1{{$}}
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s5 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX1064-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
; GFX1032-NEXT: s_mov_b32 exec_lo, [[COPY_EXEC1]]
-
-; GFX1064: s_sub_u32 s32, s32, 0x400
-; GFX1032: s_sub_u32 s32, s32, 0x200
-; GCN: s_setpc_b64
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_setpc_b64
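+; v32 is spilled with exec forced to all ones and s34 is saved to lane 2 before
+; the frame pointer is set from s32; after the call the saved SGPRs are read
+; back and v32 is reloaded (again with exec saved) ahead of s_setpc_b64.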
define void @callee_no_stack_with_call() #1 {
call void @external_void_func_void()
ret void
; CHECK-NEXT: waveLimiter: false
; CHECK-NEXT: scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3'
; CHECK-NEXT: scratchWaveOffsetReg: '$sgpr33'
-; CHECK-NEXT: frameOffsetReg: '$sgpr5'
+; CHECK-NEXT: frameOffsetReg: '$sgpr34'
; CHECK-NEXT: stackPtrOffsetReg: '$sgpr32'
; CHECK-NEXT: argumentInfo:
; CHECK-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
; CHECK-NEXT: waveLimiter: false
; CHECK-NEXT: scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3'
; CHECK-NEXT: scratchWaveOffsetReg: '$sgpr33'
-; CHECK-NEXT: frameOffsetReg: '$sgpr5'
+; CHECK-NEXT: frameOffsetReg: '$sgpr34'
; CHECK-NEXT: stackPtrOffsetReg: '$sgpr32'
; CHECK-NEXT: argumentInfo:
; CHECK-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }