[LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
>;
+// Copies the active channels of the source value to the destination value,
+// with the guarantee that the source value is computed as if the entire
+// program were executed in Whole Wavefront Mode, i.e. with all channels
+// enabled, with a few exceptions:
+// - Phi nodes which require WWM return an undefined value.
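+//
+// For illustration only, a typical use from IR (the f32 overload, as
+// exercised by the tests added in this patch):
+// %sum = fadd float %a, %b
+// %sum.wwm = call float @llvm.amdgcn.wwm.f32(float %sum)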
+def int_amdgcn_wwm : Intrinsic<[llvm_any_ty],
+ [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
+>;
+
//===----------------------------------------------------------------------===//
// CI+ Intrinsics
//===----------------------------------------------------------------------===//
FunctionPass *createSIDebuggerInsertNopsPass();
FunctionPass *createSIInsertWaitsPass();
FunctionPass *createSIInsertWaitcntsPass();
+FunctionPass *createSIFixWWMLivenessPass();
FunctionPass *createAMDGPUCodeGenPreparePass();
FunctionPass *createAMDGPUMachineCFGStructurizerPass();
FunctionPass *createAMDGPURewriteOutArgumentsPass();
void initializeSIOptimizeExecMaskingPass(PassRegistry &);
extern char &SIOptimizeExecMaskingID;
+void initializeSIFixWWMLivenessPass(PassRegistry &);
+extern char &SIFixWWMLivenessID;
+
// Passes common to R600 and SI
FunctionPass *createAMDGPUPromoteAlloca();
void initializeAMDGPUPromoteAllocaPass(PassRegistry&);
initializeSIMemoryLegalizerPass(*PR);
initializeSIDebuggerInsertNopsPass(*PR);
initializeSIOptimizeExecMaskingPass(*PR);
+ initializeSIFixWWMLivenessPass(*PR);
initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
initializeAMDGPUAAWrapperPassPass(*PR);
}
// SI_ELSE will introduce a copy of the tied operand source after the else.
insertPass(&PHIEliminationID, &SILowerControlFlowID, false);
+ // This must be run after SILowerControlFlow, since it needs to use the
+ // machine-level CFG, but before register allocation.
+ insertPass(&SILowerControlFlowID, &SIFixWWMLivenessID, false);
+
TargetPassConfig::addFastRegAlloc(RegAllocPass);
}
// SI_ELSE will introduce a copy of the tied operand source after the else.
insertPass(&PHIEliminationID, &SILowerControlFlowID, false);
+ // This must be run after SILowerControlFlow, since it needs to use the
+ // machine-level CFG, but before register allocation.
+ insertPass(&SILowerControlFlowID, &SIFixWWMLivenessID, false);
+
TargetPassConfig::addOptimizedRegAlloc(RegAllocPass);
}
SIFixControlFlowLiveIntervals.cpp
SIFixSGPRCopies.cpp
SIFixVGPRCopies.cpp
+ SIFixWWMLiveness.cpp
SIFoldOperands.cpp
SIFrameLowering.cpp
SIInsertSkips.cpp
default:
continue;
case AMDGPU::COPY:
- case AMDGPU::WQM: {
+ case AMDGPU::WQM:
+ case AMDGPU::WWM: {
// If the destination register is a physical register there isn't really
// much we can do to fix this.
if (!TargetRegisterInfo::isVirtualRegister(MI.getOperand(0).getReg()))
--- /dev/null
+//===-- SIFixWWMLiveness.cpp - Fix WWM live intervals ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Computations in WWM can overwrite values in inactive channels for
+/// variables that the register allocator thinks are dead. This pass adds fake
+/// uses of those variables to WWM instructions to make sure that they aren't
+/// overwritten.
+///
+/// As an example, consider this snippet:
+/// %vgpr0 = V_MOV_B32_e32 0.0
+/// if (...) {
+/// %vgpr1 = ...
+/// %vgpr2 = WWM %vgpr1<kill>
+/// ... = %vgpr2<kill>
+/// %vgpr0 = V_MOV_B32_e32 1.0
+/// }
+/// ... = %vgpr0
+///
+/// The live intervals of %vgpr0 don't overlap with those of %vgpr1. Normally,
+/// we can safely allocate %vgpr0 and %vgpr1 in the same register, since
+/// writing %vgpr1 would only write to channels that would be clobbered by the
+/// second write to %vgpr0 anyways. But if %vgpr1 is written with WWM enabled,
+/// it would clobber even the inactive channels for which the if-condition is
+/// false, for which %vgpr0 is supposed to be 0. This pass adds an implicit use
+/// of %vgpr0 to the WWM instruction to make sure they aren't allocated to the
+/// same register.
+///
+/// In general, we need to figure out which registers might have inactive
+/// channels that are eventually used but could be accidentally clobbered by a
+/// WWM instruction. We approximate this using two conditions:
+///
+/// 1. A definition of the variable reaches the WWM instruction.
+/// 2. The variable would be live at the WWM instruction if all its defs were
+/// partial defs (i.e. considered as a use), ignoring normal uses.
+///
+/// If a register matches both conditions, then we add an implicit use of it to
+/// the WWM instruction. Condition #2 is the heart of the matter: every
+/// definition is really a partial definition, since every VALU instruction is
+/// implicitly predicated. We can usually ignore this, but WWM forces us not
+/// to. Condition #1 prevents false positives if the variable is undefined at
+/// the WWM instruction anyways. This is overly conservative in certain cases,
+/// especially in uniform control flow, but this is a workaround anyways until
+/// LLVM gains the notion of predicated uses and definitions of variables.
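+///
+/// For instance, in the snippet above %vgpr0 satisfies both conditions at the
+/// WWM instruction: the write of 0.0 reaches it (#1), and treating the later
+/// write of 1.0 as a use keeps %vgpr0 live across it (#2), so the pass adds
+/// %vgpr0 as an implicit use of the WWM instruction.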
+///
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "SIRegisterInfo.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SparseBitVector.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "si-fix-wwm-liveness"
+
+namespace {
+
+class SIFixWWMLiveness : public MachineFunctionPass {
+private:
+ LiveIntervals *LIS = nullptr;
+ const SIRegisterInfo *TRI;
+ MachineRegisterInfo *MRI;
+
+public:
+ static char ID;
+
+ SIFixWWMLiveness() : MachineFunctionPass(ID) {
+ initializeSIFixWWMLivenessPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ bool runOnWWMInstruction(MachineInstr &MI);
+
+ void addDefs(const MachineInstr &MI, SparseBitVector<> &set);
+
+ StringRef getPassName() const override { return "SI Fix WWM Liveness"; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ // Should preserve the same set that TwoAddressInstructions does.
+ AU.addPreserved<SlotIndexes>();
+ AU.addPreserved<LiveIntervals>();
+ AU.addPreservedID(LiveVariablesID);
+ AU.addPreservedID(MachineLoopInfoID);
+ AU.addPreservedID(MachineDominatorsID);
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS(SIFixWWMLiveness, DEBUG_TYPE,
+ "SI fix WWM liveness", false, false)
+
+char SIFixWWMLiveness::ID = 0;
+
+char &llvm::SIFixWWMLivenessID = SIFixWWMLiveness::ID;
+
+FunctionPass *llvm::createSIFixWWMLivenessPass() {
+ return new SIFixWWMLiveness();
+}
+
+void SIFixWWMLiveness::addDefs(const MachineInstr &MI, SparseBitVector<> &Regs)
+{
+ for (const MachineOperand &Op : MI.defs()) {
+ if (Op.isReg()) {
+ unsigned Reg = Op.getReg();
+ if (TRI->isVGPR(*MRI, Reg))
+ Regs.set(Reg);
+ }
+ }
+}
+
+bool SIFixWWMLiveness::runOnWWMInstruction(MachineInstr &WWM) {
+ MachineBasicBlock *MBB = WWM.getParent();
+
+ // Compute the registers that are live out of the WWM instruction by
+ // figuring out which defs are reachable from it.
+ SparseBitVector<> LiveOut;
+
+ for (auto II = MachineBasicBlock::iterator(WWM), IE =
+ MBB->end(); II != IE; ++II) {
+ addDefs(*II, LiveOut);
+ }
+
+ for (df_iterator<MachineBasicBlock *> I = ++df_begin(MBB),
+ E = df_end(MBB);
+ I != E; ++I) {
+ for (const MachineInstr &MI : **I) {
+ addDefs(MI, LiveOut);
+ }
+ }
+
+ // Compute the registers whose defs reach the WWM instruction.
+ SparseBitVector<> Reachable;
+
+ for (auto II = ++MachineBasicBlock::reverse_iterator(WWM), IE =
+ MBB->rend(); II != IE; ++II) {
+ addDefs(*II, Reachable);
+ }
+
+ for (idf_iterator<MachineBasicBlock *> I = ++idf_begin(MBB),
+ E = idf_end(MBB);
+ I != E; ++I) {
+ for (const MachineInstr &MI : **I) {
+ addDefs(MI, Reachable);
+ }
+ }
+
+ // Find the intersection, and add implicit uses.
+ LiveOut &= Reachable;
+
+ bool Modified = false;
+ for (unsigned Reg : LiveOut) {
+ WWM.addOperand(MachineOperand::CreateReg(Reg, false, /*isImp=*/true));
+ if (LIS) {
+ // FIXME: is there a better way to update the live interval?
+ LIS->removeInterval(Reg);
+ LIS->createAndComputeVirtRegInterval(Reg);
+ }
+ Modified = true;
+ }
+
+ return Modified;
+}
+
+bool SIFixWWMLiveness::runOnMachineFunction(MachineFunction &MF) {
+ bool Modified = false;
+
+ // This doesn't actually need LiveIntervals, but we can preserve them.
+ LIS = getAnalysisIfAvailable<LiveIntervals>();
+
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const SIInstrInfo *TII = ST.getInstrInfo();
+
+ TRI = &TII->getRegisterInfo();
+ MRI = &MF.getRegInfo();
+
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineInstr &MI : MBB) {
+ if (MI.getOpcode() == AMDGPU::EXIT_WWM) {
+ Modified |= runOnWWMInstruction(MI);
+ }
+ }
+ }
+
+ return Modified;
+}
return SDValue(DAG.getMachineNode(AMDGPU::WQM, DL, Src.getValueType(), Src),
0);
}
+ case Intrinsic::amdgcn_wwm: {
+ SDValue Src = Op.getOperand(1);
+ return SDValue(DAG.getMachineNode(AMDGPU::WWM, DL, Src.getValueType(), Src),
+ 0);
+ }
default:
return Op;
}
MI.eraseFromParent();
break;
}
+ case AMDGPU::EXIT_WWM: {
+ // This only gets its own opcode so that SIFixWWMLiveness can tell when WWM
+ // is exited.
+ MI.setDesc(get(AMDGPU::S_MOV_B64));
+ break;
+ }
}
return true;
}
case AMDGPU::PHI: return AMDGPU::PHI;
case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
case AMDGPU::WQM: return AMDGPU::WQM;
+ case AMDGPU::WWM: return AMDGPU::WWM;
case AMDGPU::S_MOV_B32:
return MI.getOperand(1).isReg() ?
AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
case AMDGPU::REG_SEQUENCE:
case AMDGPU::INSERT_SUBREG:
case AMDGPU::WQM:
+ case AMDGPU::WWM:
if (RI.hasVGPRs(NewDstRC))
return nullptr;
def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64:$vdst),
(ins VSrc_b64:$src0)>;
-// Pseudoinstruction for @llvm.amdgcn.wqm. It is turned into a copy
-// after the WQM pass processes them.
+// Pseudoinstruction for @llvm.amdgcn.wqm. It is turned into a copy after the
+// WQM pass processes it.
def WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;
+// Pseudoinstruction for @llvm.amdgcn.wwm. It is turned into a copy post-RA, so
+// that the @earlyclobber is respected. The @earlyclobber is to make sure that
+// the instruction that defines $src0 (which is run in WWM) doesn't
+// accidentally clobber inactive channels of $vdst.
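+//
+// For example, the pseudo appears in MIR as (taken from the
+// si-fix-wwm-liveness test added in this patch):
+// early-clobber %18 = WWM killed %17, implicit %exec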
+let Constraints = "@earlyclobber $vdst" in {
+def WWM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;
+}
+
} // End let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC]
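+// Restores EXEC from $src0 when leaving WWM. This only gets its own opcode
+// (rather than a plain S_MOV_B64) so that SIFixWWMLiveness can tell when WWM
+// is exited; it is expanded to S_MOV_B64 post-RA (see SIInstrInfo).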
+def EXIT_WWM : SPseudoInstSI <(outs SReg_64:$sdst), (ins SReg_64:$src0)> {
+ let hasSideEffects = 0;
+ let mayLoad = 0;
+ let mayStore = 0;
+}
+
let usesCustomInserter = 1, SALU = 1 in {
def GET_GROUPSTATICSIZE : PseudoInstSI <(outs SReg_32:$sdst), (ins),
[(set SReg_32:$sdst, (int_amdgcn_groupstaticsize))]>;
//
/// \file
/// \brief This pass adds instructions to enable whole quad mode for pixel
-/// shaders.
+/// shaders, and whole wavefront mode for all programs.
///
/// Whole quad mode is required for derivative computations, but it interferes
/// with shader side effects (stores and atomics). This pass is run on the
/// ...
/// S_MOV_B64 EXEC, Tmp
///
+/// We also compute when a sequence of instructions requires Whole Wavefront
+/// Mode (WWM) and insert instructions to save and restore it:
+///
+/// S_OR_SAVEEXEC_B64 Tmp, -1
+/// ...
+/// S_MOV_B64 EXEC, Tmp
+///
/// In order to avoid excessive switching during sequences of Exact
/// instructions, the pass first analyzes which instructions must be run in WQM
/// (aka which instructions produce values that lead to derivative
enum {
StateWQM = 0x1,
- StateExact = 0x2,
+ StateWWM = 0x2,
+ StateExact = 0x4,
};
struct PrintState {
static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) {
if (PS.State & StateWQM)
OS << "WQM";
- if (PS.State & StateExact) {
+ if (PS.State & StateWWM) {
if (PS.State & StateWQM)
OS << '|';
+ OS << "WWM";
+ }
+ if (PS.State & StateExact) {
+ if (PS.State & (StateWQM | StateWWM))
+ OS << '|';
OS << "Exact";
}
class SIWholeQuadMode : public MachineFunctionPass {
private:
+ CallingConv::ID CallingConv;
const SIInstrInfo *TII;
const SIRegisterInfo *TRI;
MachineRegisterInfo *MRI;
unsigned SaveWQM, unsigned LiveMaskReg);
void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
unsigned SavedWQM);
+ void toWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
+ unsigned SaveOrig);
+ void fromWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
+ unsigned SavedOrig);
void processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, bool isEntry);
void lowerLiveMaskQueries(unsigned LiveMaskReg);
std::vector<WorkItem> &Worklist) {
InstrInfo &II = Instructions[&MI];
- assert(Flag == StateWQM);
+ assert(!(Flag & StateExact) && Flag != 0);
// Remove any disabled states from the flag. The user that required it gets
// an undefined value in the helper lanes. For example, this can happen if
/// Mark all instructions defining the uses in \p MI with \p Flag.
void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag,
std::vector<WorkItem> &Worklist) {
- assert(Flag == StateWQM);
for (const MachineOperand &Use : MI.uses()) {
if (!Use.isReg() || !Use.isUse())
continue;
unsigned Opcode = MI.getOpcode();
char Flags = 0;
- if (TII->isDS(Opcode)) {
+ if (TII->isDS(Opcode) && CallingConv == CallingConv::AMDGPU_PS) {
Flags = StateWQM;
} else if (TII->isWQM(Opcode)) {
// Sampling instructions don't need to produce results for all pixels
// correct, so we need it to be in WQM.
Flags = StateWQM;
LowerToCopyInstrs.push_back(&MI);
+ } else if (Opcode == AMDGPU::WWM) {
+ // The WWM intrinsic doesn't make the same guarantee, and it also needs
+ // to be executed in WQM or Exact so that its copy doesn't clobber
+ // inactive lanes.
+ markInstructionUses(MI, StateWWM, Worklist);
+ GlobalFlags |= StateWWM;
+ LowerToCopyInstrs.push_back(&MI);
+ continue;
} else if (TII->isDisableWQM(MI)) {
BBI.Needs |= StateExact;
if (!(BBI.InNeeds & StateExact)) {
Worklist.push_back(&MBB);
}
GlobalFlags |= StateExact;
- III.Disabled = StateWQM;
+ III.Disabled = StateWQM | StateWWM;
continue;
} else {
if (Opcode == AMDGPU::SI_PS_LIVE) {
// Propagate backwards within block
if (MachineInstr *PrevMI = MI.getPrevNode()) {
- char InNeeds = II.Needs | II.OutNeeds;
+ char InNeeds = (II.Needs & ~StateWWM) | II.OutNeeds;
if (!PrevMI->isPHI()) {
InstrInfo &PrevII = Instructions[PrevMI];
if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) {
LIS->InsertMachineInstrInMaps(*MI);
}
+void SIWholeQuadMode::toWWM(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator Before,
+ unsigned SaveOrig) {
+ MachineInstr *MI;
+
+ assert(SaveOrig);
+ MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_OR_SAVEEXEC_B64),
+ SaveOrig)
+ .addImm(-1);
+ LIS->InsertMachineInstrInMaps(*MI);
+}
+
+void SIWholeQuadMode::fromWWM(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator Before,
+ unsigned SavedOrig) {
+ MachineInstr *MI;
+
+ assert(SavedOrig);
+ MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_WWM), AMDGPU::EXEC)
+ .addReg(SavedOrig);
+ LIS->InsertMachineInstrInMaps(*MI);
+}
+
void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
bool isEntry) {
auto BII = Blocks.find(&MBB);
const BlockInfo &BI = BII->second;
- if (!(BI.InNeeds & StateWQM))
- return;
-
// This is a non-entry block that is WQM throughout, so no need to do
// anything.
- if (!isEntry && !(BI.Needs & StateExact) && BI.OutNeeds != StateExact)
+ if (!isEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact)
return;
DEBUG(dbgs() << "\nProcessing block BB#" << MBB.getNumber() << ":\n");
unsigned SavedWQMReg = 0;
+ unsigned SavedNonWWMReg = 0;
bool WQMFromExec = isEntry;
- char State = isEntry ? StateExact : StateWQM;
+ char State = (isEntry || !(BI.InNeeds & StateWQM)) ? StateExact : StateWQM;
+ char NonWWMState = 0;
auto II = MBB.getFirstNonPHI(), IE = MBB.end();
if (isEntry)
++II; // Skip the instruction that saves LiveMask
- MachineBasicBlock::iterator First = IE;
+ // This stores the first instruction where it's safe to switch from WQM to
+ // Exact or vice versa.
+ MachineBasicBlock::iterator FirstWQM = IE;
+
+ // This stores the first instruction where it's safe to switch from WWM to
+ // Exact/WQM or to switch to WWM. It must always be the same as, or after,
+ // FirstWQM since if it's safe to switch to/from WWM, it must be safe to
+ // switch to/from WQM as well.
+ MachineBasicBlock::iterator FirstWWM = IE;
for (;;) {
MachineBasicBlock::iterator Next = II;
- char Needs = StateExact | StateWQM;
+ char Needs = StateExact | StateWQM; // WWM is disabled by default
char OutNeeds = 0;
- if (First == IE)
- First = II;
+ if (FirstWQM == IE)
+ FirstWQM = II;
+
+ if (FirstWWM == IE)
+ FirstWWM = II;
+ // First, figure out the allowed states (Needs) based on the propagated
+ // flags.
if (II != IE) {
MachineInstr &MI = *II;
if (requiresCorrectState(MI)) {
auto III = Instructions.find(&MI);
if (III != Instructions.end()) {
- if (III->second.Needs & StateWQM)
+ if (III->second.Needs & StateWWM)
+ Needs = StateWWM;
+ else if (III->second.Needs & StateWQM)
Needs = StateWQM;
else
Needs &= ~III->second.Disabled;
OutNeeds = III->second.OutNeeds;
}
+ } else {
+ // If the instruction doesn't actually need a correct EXEC, then we can
+ // safely leave WWM enabled.
+ Needs = StateExact | StateWQM | StateWWM;
}
if (MI.isTerminator() && OutNeeds == StateExact)
Needs = StateWQM | StateExact;
}
+ // Now, transition if necessary.
if (!(Needs & State)) {
+ MachineBasicBlock::iterator First;
+ if (State == StateWWM || Needs == StateWWM) {
+ // We must switch to or from WWM
+ First = FirstWWM;
+ } else {
+ // We only need to switch to/from WQM, so we can use FirstWQM
+ First = FirstWQM;
+ }
+
MachineBasicBlock::iterator Before =
prepareInsertion(MBB, First, II, Needs == StateWQM,
Needs == StateExact || WQMFromExec);
- if (Needs == StateExact) {
- if (!WQMFromExec && (OutNeeds & StateWQM))
- SavedWQMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ if (State == StateWWM) {
+ assert(SavedNonWWMReg);
+ fromWWM(MBB, Before, SavedNonWWMReg);
+ State = NonWWMState;
+ }
- toExact(MBB, Before, SavedWQMReg, LiveMaskReg);
- State = StateExact;
+ if (Needs == StateWWM) {
+ NonWWMState = State;
+ SavedNonWWMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ toWWM(MBB, Before, SavedNonWWMReg);
+ State = StateWWM;
} else {
- assert(Needs == StateWQM);
- assert(WQMFromExec == (SavedWQMReg == 0));
+ if (State == StateWQM && (Needs & StateExact) && !(Needs & StateWQM)) {
+ if (!WQMFromExec && (OutNeeds & StateWQM))
+ SavedWQMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
- toWQM(MBB, Before, SavedWQMReg);
+ toExact(MBB, Before, SavedWQMReg, LiveMaskReg);
+ State = StateExact;
+ } else if (State == StateExact && (Needs & StateWQM) &&
+ !(Needs & StateExact)) {
+ assert(WQMFromExec == (SavedWQMReg == 0));
- if (SavedWQMReg) {
- LIS->createAndComputeVirtRegInterval(SavedWQMReg);
- SavedWQMReg = 0;
+ toWQM(MBB, Before, SavedWQMReg);
+
+ if (SavedWQMReg) {
+ LIS->createAndComputeVirtRegInterval(SavedWQMReg);
+ SavedWQMReg = 0;
+ }
+ State = StateWQM;
+ } else {
+ // We can get here if we transitioned from WWM to a non-WWM state that
+ // already matches our needs; in that case there is nothing more to do.
+ assert(Needs & State);
}
- State = StateWQM;
}
-
- First = IE;
}
- if (Needs != (StateExact | StateWQM))
- First = IE;
+ if (Needs != (StateExact | StateWQM | StateWWM)) {
+ if (Needs != (StateExact | StateWQM))
+ FirstWQM = IE;
+ FirstWWM = IE;
+ }
if (II == IE)
break;
}
bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
- if (MF.getFunction()->getCallingConv() != CallingConv::AMDGPU_PS)
- return false;
-
Instructions.clear();
Blocks.clear();
LiveMaskQueries.clear();
LowerToCopyInstrs.clear();
+ CallingConv = MF.getFunction()->getCallingConv();
const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
LIS = &getAnalysis<LiveIntervals>();
char GlobalFlags = analyzeFunction(MF);
+ unsigned LiveMaskReg = 0;
if (!(GlobalFlags & StateWQM)) {
lowerLiveMaskQueries(AMDGPU::EXEC);
- return !LiveMaskQueries.empty();
- }
-
- // Store a copy of the original live mask when required
- unsigned LiveMaskReg = 0;
- {
+ if (!(GlobalFlags & StateWWM))
+ return !LiveMaskQueries.empty();
+ } else {
+ // Store a copy of the original live mask when required
MachineBasicBlock &Entry = MF.front();
MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI();
LIS->InsertMachineInstrInMaps(*MI);
}
+ lowerLiveMaskQueries(LiveMaskReg);
+
if (GlobalFlags == StateWQM) {
// For a shader that needs only WQM, we can just set it once.
BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
AMDGPU::EXEC)
.addReg(AMDGPU::EXEC);
- lowerLiveMaskQueries(LiveMaskReg);
lowerCopyInstrs();
// EntryMI may become invalid here
return true;
DEBUG(printInfo());
- lowerLiveMaskQueries(LiveMaskReg);
lowerCopyInstrs();
// Handle the general case
--- /dev/null
+# RUN: llc -march=amdgcn -verify-machineinstrs -run-pass si-fix-wwm-liveness -o - %s | FileCheck %s
+#CHECK: %exec = EXIT_WWM killed %19, implicit %21
+
+---
+name: test_wwm_liveness
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: sreg_64, preferred-register: '' }
+ - { id: 1, class: sgpr_32, preferred-register: '' }
+ - { id: 2, class: sgpr_32, preferred-register: '' }
+ - { id: 3, class: vgpr_32, preferred-register: '' }
+ - { id: 4, class: vgpr_32, preferred-register: '' }
+ - { id: 5, class: vgpr_32, preferred-register: '' }
+ - { id: 6, class: vgpr_32, preferred-register: '' }
+ - { id: 7, class: vgpr_32, preferred-register: '' }
+ - { id: 8, class: sreg_64, preferred-register: '%vcc' }
+ - { id: 9, class: sreg_64, preferred-register: '' }
+ - { id: 10, class: sreg_32_xm0, preferred-register: '' }
+ - { id: 11, class: sreg_64, preferred-register: '' }
+ - { id: 12, class: sreg_32_xm0, preferred-register: '' }
+ - { id: 13, class: sreg_32_xm0, preferred-register: '' }
+ - { id: 14, class: sreg_32_xm0, preferred-register: '' }
+ - { id: 15, class: sreg_128, preferred-register: '' }
+ - { id: 16, class: vgpr_32, preferred-register: '' }
+ - { id: 17, class: vgpr_32, preferred-register: '' }
+ - { id: 18, class: vgpr_32, preferred-register: '' }
+ - { id: 19, class: sreg_64, preferred-register: '' }
+ - { id: 20, class: sreg_64, preferred-register: '' }
+ - { id: 21, class: vgpr_32, preferred-register: '' }
+ - { id: 22, class: sreg_64, preferred-register: '' }
+ - { id: 23, class: sreg_64, preferred-register: '' }
+liveins:
+body: |
+ bb.0:
+ successors: %bb.1(0x40000000), %bb.2(0x40000000)
+
+ %21 = V_MOV_B32_e32 0, implicit %exec
+ %5 = V_MBCNT_LO_U32_B32_e64 -1, 0, implicit %exec
+ %6 = V_MBCNT_HI_U32_B32_e32 -1, killed %5, implicit %exec
+ %8 = V_CMP_GT_U32_e64 32, killed %6, implicit %exec
+ %22 = COPY %exec, implicit-def %exec
+ %23 = S_AND_B64 %22, %8, implicit-def dead %scc
+ %0 = S_XOR_B64 %23, %22, implicit-def dead %scc
+ %exec = S_MOV_B64_term killed %23
+ SI_MASK_BRANCH %bb.2, implicit %exec
+ S_BRANCH %bb.1
+
+ bb.1:
+ successors: %bb.2(0x80000000)
+
+ %13 = S_MOV_B32 61440
+ %14 = S_MOV_B32 -1
+ %15 = REG_SEQUENCE undef %12, 1, undef %10, 2, killed %14, 3, killed %13, 4
+ %19 = COPY %exec
+ %exec = S_MOV_B64 -1
+ %16 = BUFFER_LOAD_DWORD_OFFSET %15, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4)
+ %17 = V_ADD_F32_e32 1065353216, killed %16, implicit %exec
+ %exec = EXIT_WWM killed %19
+ %21 = V_MOV_B32_e32 1, implicit %exec
+ early-clobber %18 = WWM killed %17, implicit %exec
+ BUFFER_STORE_DWORD_OFFSET killed %18, killed %15, 0, 0, 0, 0, 0, implicit %exec :: (store 4)
+
+ bb.2:
+ %exec = S_OR_B64 %exec, killed %0, implicit-def %scc
+ %vgpr0 = COPY killed %21
+ SI_RETURN_TO_EPILOG killed %vgpr0
+
+...
ret float %out.2
}
+; Check that WWM is triggered by the wwm intrinsic.
+;
+;CHECK-LABEL: {{^}}test_wwm1:
+;CHECK: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1
+;CHECK: buffer_load_dword
+;CHECK: buffer_load_dword
+;CHECK: v_add_f32_e32
+define amdgpu_ps float @test_wwm1(i32 inreg %idx0, i32 inreg %idx1) {
+main_body:
+ %src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0)
+ %src1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0)
+ %out = fadd float %src0, %src1
+ %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
+ ret float %out.0
+}
+
+; Same as above, but with an integer type.
+;
+;CHECK-LABEL: {{^}}test_wwm2:
+;CHECK: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1
+;CHECK: buffer_load_dword
+;CHECK: buffer_load_dword
+;CHECK: v_add_i32_e32
+define amdgpu_ps float @test_wwm2(i32 inreg %idx0, i32 inreg %idx1) {
+main_body:
+ %src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0)
+ %src1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0)
+ %src0.0 = bitcast float %src0 to i32
+ %src1.0 = bitcast float %src1 to i32
+ %out = add i32 %src0.0, %src1.0
+ %out.0 = call i32 @llvm.amdgcn.wwm.i32(i32 %out)
+ %out.1 = bitcast i32 %out.0 to float
+ ret float %out.1
+}
+
+; Check that we don't leave WWM on for computations that don't require WWM,
+; since that will lead to clobbering things that aren't supposed to be
+; clobbered in cases like this.
+;
+;CHECK-LABEL: {{^}}test_wwm3:
+;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1
+;CHECK: buffer_load_dword
+;CHECK: v_add_f32_e32
+;CHECK: s_mov_b64 exec, [[ORIG]]
+;CHECK: v_add_f32_e32
+define amdgpu_ps float @test_wwm3(i32 inreg %idx) {
+main_body:
+ ; use mbcnt to make sure the branch is divergent
+ %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+ %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
+ %cc = icmp uge i32 %hi, 32
+ br i1 %cc, label %endif, label %if
+
+if:
+ %src = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i1 0, i1 0)
+ %out = fadd float %src, %src
+ %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
+ %out.1 = fadd float %src, %out.0
+ br label %endif
+
+endif:
+ %out.2 = phi float [ %out.1, %if ], [ 0.0, %main_body ]
+ ret float %out.2
+}
+
+; Check that WWM writes aren't coalesced with non-WWM writes, since the WWM
+; write could clobber disabled channels in the non-WWM one.
+;
+;CHECK-LABEL: {{^}}test_wwm4:
+;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1
+;CHECK: buffer_load_dword
+;CHECK: v_add_f32_e32
+;CHECK: s_mov_b64 exec, [[ORIG]]
+;CHECK-NEXT: v_mov_b32_e32
+define amdgpu_ps float @test_wwm4(i32 inreg %idx) {
+main_body:
+ ; use mbcnt to make sure the branch is divergent
+ %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+ %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
+ %cc = icmp uge i32 %hi, 32
+ br i1 %cc, label %endif, label %if
+
+if:
+ %src = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i1 0, i1 0)
+ %out = fadd float %src, %src
+ %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
+ br label %endif
+
+endif:
+ %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ]
+ ret float %out.1
+}
+
+; Make sure the transition from Exact to WWM then WQM works properly.
+;
+;CHECK-LABEL: {{^}}test_wwm5:
+;CHECK: buffer_load_dword
+;CHECK: buffer_store_dword
+;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1
+;CHECK: buffer_load_dword
+;CHECK: v_add_f32_e32
+;CHECK: s_mov_b64 exec, [[ORIG]]
+;CHECK: s_wqm_b64 exec, exec
+define amdgpu_ps float @test_wwm5(i32 inreg %idx0, i32 inreg %idx1) {
+main_body:
+ %src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0)
+ call void @llvm.amdgcn.buffer.store.f32(float %src0, <4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0)
+ %src1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0)
+ %temp = fadd float %src1, %src1
+ %temp.0 = call float @llvm.amdgcn.wwm.f32(float %temp)
+ %out = fadd float %temp.0, %temp.0
+ %out.0 = call float @llvm.amdgcn.wqm.f32(float %out)
+ ret float %out.0
+}
+
+; Check that WWM is turned on correctly across basic block boundaries.
+;
+;CHECK-LABEL: {{^}}test_wwm6:
+;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1
+;SI-CHECK: buffer_load_dword
+;VI-CHECK: flat_load_dword
+;CHECK: s_mov_b64 exec, [[ORIG]]
+;CHECK: %if
+;CHECK: s_or_saveexec_b64 [[ORIG2:s\[[0-9]+:[0-9]+\]]], -1
+;SI-CHECK: buffer_load_dword
+;VI-CHECK: flat_load_dword
+;CHECK: v_add_f32_e32
+;CHECK: s_mov_b64 exec, [[ORIG2]]
+define amdgpu_ps float @test_wwm6() {
+main_body:
+ %src0 = load volatile float, float addrspace(1)* undef
+ ; use mbcnt to make sure the branch is divergent
+ %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+ %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
+ %cc = icmp uge i32 %hi, 32
+ br i1 %cc, label %endif, label %if
+
+if:
+ %src1 = load volatile float, float addrspace(1)* undef
+ %out = fadd float %src0, %src1
+ %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
+ br label %endif
+
+endif:
+ %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ]
+ ret float %out.1
+}
+
; Check a case of one branch of an if-else requiring WQM, the other requiring
; exact.
;
declare void @llvm.AMDGPU.kill(float) #1
declare float @llvm.amdgcn.wqm.f32(float) #3
declare i32 @llvm.amdgcn.wqm.i32(i32) #3
+declare float @llvm.amdgcn.wwm.f32(float) #3
+declare i32 @llvm.amdgcn.wwm.i32(i32) #3
+declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #3
+declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #3
attributes #1 = { nounwind }
attributes #2 = { nounwind readonly }
--- /dev/null
+# RUN: llc -march=amdgcn -verify-machineinstrs -run-pass si-wqm -o - %s | FileCheck %s
+
+---
+# Check for awareness that s_or_saveexec_b64 clobbers SCC
+#
+#CHECK: S_OR_SAVEEXEC_B64
+#CHECK: S_CMP_LT_I32
+#CHECK: S_CSELECT_B32
+name: test_wwm_scc
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: sgpr_32, preferred-register: '' }
+ - { id: 1, class: sgpr_32, preferred-register: '' }
+ - { id: 2, class: sgpr_32, preferred-register: '' }
+ - { id: 3, class: vgpr_32, preferred-register: '' }
+ - { id: 4, class: vgpr_32, preferred-register: '' }
+ - { id: 5, class: sgpr_32, preferred-register: '' }
+ - { id: 6, class: vgpr_32, preferred-register: '' }
+ - { id: 7, class: vgpr_32, preferred-register: '' }
+ - { id: 8, class: sreg_32_xm0, preferred-register: '' }
+ - { id: 9, class: sreg_32, preferred-register: '' }
+ - { id: 10, class: sreg_32, preferred-register: '' }
+ - { id: 11, class: vgpr_32, preferred-register: '' }
+ - { id: 12, class: vgpr_32, preferred-register: '' }
+liveins:
+ - { reg: '%sgpr0', virtual-reg: '%0' }
+ - { reg: '%sgpr1', virtual-reg: '%1' }
+ - { reg: '%sgpr2', virtual-reg: '%2' }
+ - { reg: '%vgpr0', virtual-reg: '%3' }
+body: |
+ bb.0:
+ liveins: %sgpr0, %sgpr1, %sgpr2, %vgpr0
+
+ %3 = COPY %vgpr0
+ %2 = COPY %sgpr2
+ %1 = COPY %sgpr1
+ %0 = COPY %sgpr0
+ S_CMP_LT_I32 0, %0, implicit-def %scc
+ %12 = V_ADD_I32_e32 %3, %3, implicit-def %vcc, implicit %exec
+ %5 = S_CSELECT_B32 %2, %1, implicit %scc
+ %11 = V_ADD_I32_e32 %5, %12, implicit-def %vcc, implicit %exec
+ %vgpr0 = WWM %11, implicit %exec
+ SI_RETURN_TO_EPILOG %vgpr0
+
+...