//===----------------------------------------------------------------------===//
GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) :
+ IsHazardRecognizerMode(false),
CurrCycleInstr(nullptr),
MF(MF),
ST(MF.getSubtarget<GCNSubtarget>()),
}
unsigned GCNHazardRecognizer::PreEmitNoops(SUnit *SU) {
- return PreEmitNoops(SU->getInstr());
+ // Scheduler entry point: stay in the round-robin EmittedInstrs-buffer
+ // mode (no CFG walk), then defer to the common implementation.
+ IsHazardRecognizerMode = false;
+ return PreEmitNoopsCommon(SU->getInstr());
}
unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
+ // Post-RA hazard-recognizer entry point: enable the CFG-walking mode and
+ // publish \p MI as the current instruction so getWaitStatesSince() knows
+ // where to start its backwards scan.
+ IsHazardRecognizerMode = true;
+ CurrCycleInstr = MI;
+ unsigned W = PreEmitNoopsCommon(MI);
+ // Clear the current instruction so state does not leak into later queries.
+ CurrCycleInstr = nullptr;
+ return W;
+}
+
+unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
int WaitStates = std::max(0, checkAnyInstHazards(MI));
if (SIInstrInfo::isSMRD(*MI))
// Do not track non-instructions which do not affect the wait states.
// If included, these instructions can lead to buffer overflow such that
// detectable hazards are missed.
- if (CurrCycleInstr->getOpcode() == AMDGPU::IMPLICIT_DEF)
+ if (CurrCycleInstr->isImplicitDef())
return;
else if (CurrCycleInstr->isDebugInstr())
return;
// Helper Functions
//===----------------------------------------------------------------------===//
-int GCNHazardRecognizer::getWaitStatesSince(
- function_ref<bool(MachineInstr *)> IsHazard) {
+typedef function_ref<bool(MachineInstr *, int WaitStates)> IsExpiredFn;
+
+// Returns the minimum number of wait states since an instruction matching
+// \p IsHazard, scanning backwards from \p I and walking into all
+// predecessor blocks. A path is abandoned (treated as hazard-free, i.e.
+// INT_MAX) as soon as \p IsExpired returns true for the accumulated
+// wait-state count. Can only be run in hazard recognizer mode.
+static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
+                              MachineBasicBlock *MBB,
+                              MachineBasicBlock::reverse_instr_iterator I,
+                              int WaitStates,
+                              IsExpiredFn IsExpired,
+                              DenseSet<const MachineBasicBlock *> &Visited) {
+  for (auto E = MBB->rend(); I != E; ++I) {
+    if (IsHazard(&*I))
+      return WaitStates;
+
+    // These never consume hardware wait states; do not count them.
+    if (I->isInlineAsm() || I->isImplicitDef() || I->isDebugInstr())
+      continue;
+
+    WaitStates += SIInstrInfo::getNumWaitStates(*I);
+
+    if (IsExpired(&*I, WaitStates))
+      return std::numeric_limits<int>::max();
+  }
+
+  // Reached the top of the block without finding the hazard: recurse into
+  // every predecessor, visiting each block at most once, and keep the
+  // minimum (i.e. worst-case closest) hazard distance.
+  int MinWaitStates = WaitStates;
+  bool Found = false;
+  for (MachineBasicBlock *Pred : MBB->predecessors()) {
+    if (!Visited.insert(Pred).second)
+      continue;
+
+    int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(),
+                               WaitStates, IsExpired, Visited);
+
+    if (W == std::numeric_limits<int>::max())
+      continue;
+
+    MinWaitStates = Found ? std::min(MinWaitStates, W) : W;
+    if (IsExpired(nullptr, MinWaitStates))
+      return MinWaitStates;
+
+    Found = true;
+  }
+
+  if (Found)
+    return MinWaitStates;
+
+  // No predecessor path contained the hazard within the expiration limit.
+  return std::numeric_limits<int>::max();
+}
+
+// Convenience overload: starts a fresh backwards walk from the instruction
+// preceding \p MI (std::next skips MI itself in reverse order), with zero
+// accumulated wait states and an empty visited-block set.
+static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
+ MachineInstr *MI,
+ IsExpiredFn IsExpired) {
+ DenseSet<const MachineBasicBlock *> Visited;
+ return getWaitStatesSince(IsHazard, MI->getParent(),
+ std::next(MI->getReverseIterator()),
+ 0, IsExpired, Visited);
+}
+
+// Returns the number of wait states since the last instruction matching
+// \p IsHazard, or INT_MAX if none is found within \p Limit wait states.
+int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
+ if (IsHazardRecognizerMode) {
+ // Single-instruction (post-RA pass) mode: walk the CFG backwards from
+ // CurrCycleInstr instead of consulting the emitted-instruction buffer.
+ // NOTE(review): this local shadows the file-level IsExpiredFn typedef.
+ auto IsExpiredFn = [Limit] (MachineInstr *, int WaitStates) {
+ return WaitStates >= Limit;
+ };
+ return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn);
+ }
+
int WaitStates = 0;
for (MachineInstr *MI : EmittedInstrs) {
if (MI) {
if (IsHazard(MI))
return WaitStates;
- unsigned Opcode = MI->getOpcode();
- if (Opcode == AMDGPU::INLINEASM)
+ if (MI->isInlineAsm())
continue;
}
++WaitStates;
+
+ // Give up once the window of interest has fully elapsed.
+ if (WaitStates >= Limit)
+ break;
}
return std::numeric_limits<int>::max();
}
-int GCNHazardRecognizer::getWaitStatesSinceDef(
- unsigned Reg, function_ref<bool(MachineInstr *)> IsHazardDef) {
+// As getWaitStatesSince, but only instructions that both satisfy
+// \p IsHazardDef and modify \p Reg count as the hazard. \p Limit bounds
+// how far back the search may look.
+int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
+ IsHazardFn IsHazardDef,
+ int Limit) {
const SIRegisterInfo *TRI = ST.getRegisterInfo();
auto IsHazardFn = [IsHazardDef, TRI, Reg] (MachineInstr *MI) {
return IsHazardDef(MI) && MI->modifiesRegister(Reg, TRI);
};
- return getWaitStatesSince(IsHazardFn);
+ return getWaitStatesSince(IsHazardFn, Limit);
}
-int GCNHazardRecognizer::getWaitStatesSinceSetReg(
- function_ref<bool(MachineInstr *)> IsHazard) {
+// As getWaitStatesSince, but restricted to instructions whose opcode
+// passes isSSetReg() and that also satisfy \p IsHazard, looking back at
+// most \p Limit wait states.
+int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
+ int Limit) {
auto IsHazardFn = [IsHazard] (MachineInstr *MI) {
return isSSetReg(MI->getOpcode()) && IsHazard(MI);
};
- return getWaitStatesSince(IsHazardFn);
+ return getWaitStatesSince(IsHazardFn, Limit);
}
//===----------------------------------------------------------------------===//
if (!Use.isReg())
continue;
int WaitStatesNeededForUse =
- SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn);
+ SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
+ SmrdSgprWaitStates);
WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
// This fixes what appears to be undocumented hardware behavior in SI where
if (IsBufferSMRD) {
int WaitStatesNeededForUse =
SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
- IsBufferHazardDefFn);
+ IsBufferHazardDefFn,
+ SmrdSgprWaitStates);
WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
}
}
continue;
int WaitStatesNeededForUse =
- VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn);
+ VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
+ VmemSgprWaitStates);
WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
}
return WaitStatesNeeded;
if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
continue;
int WaitStatesNeededForUse =
- DppVgprWaitStates - getWaitStatesSinceDef(Use.getReg());
+ DppVgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
+ [](MachineInstr *) { return true; },
+ DppVgprWaitStates);
WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
}
WaitStatesNeeded = std::max(
WaitStatesNeeded,
- DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn));
+ DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
+ DppExecWaitStates));
return WaitStatesNeeded;
}
// instruction.
const int DivFMasWaitStates = 4;
auto IsHazardDefFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); };
- int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn);
+ int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
+ DivFMasWaitStates);
return DivFMasWaitStates - WaitStatesNeeded;
}
auto IsHazardFn = [TII, GetRegHWReg] (MachineInstr *MI) {
return GetRegHWReg == getHWReg(TII, *MI);
};
- int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn);
+ int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);
return GetRegWaitStates - WaitStatesNeeded;
}
auto IsHazardFn = [TII, HWReg] (MachineInstr *MI) {
return HWReg == getHWReg(TII, *MI);
};
- int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn);
+ int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
return SetRegWaitStates - WaitStatesNeeded;
}
TRI->regsOverlap(MI->getOperand(DataIdx).getReg(), Reg);
};
int WaitStatesNeededForDef =
- VALUWaitStates - getWaitStatesSince(IsHazardFn);
+ VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
return WaitStatesNeeded;
};
const int RWLaneWaitStates = 4;
- int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn);
+ int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,
+ RWLaneWaitStates);
return RWLaneWaitStates - WaitStatesSince;
}
auto IsHazardFn = [TII] (MachineInstr *MI) {
return getHWReg(TII, *MI) == AMDGPU::Hwreg::ID_TRAPSTS;
};
- int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn);
+ int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
return RFEWaitStates - WaitStatesNeeded;
}
return MI->getOpcode() == AMDGPU::S_MOV_FED_B32;
};
int WaitStatesNeededForUse =
- MovFedWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardFn);
+ MovFedWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardFn,
+ MovFedWaitStates);
WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
}
auto IsHazardFn = [TII] (MachineInstr *MI) {
return TII->isSALU(*MI);
};
- return SMovRelWaitStates - getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn);
+ return SMovRelWaitStates - getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn,
+ SMovRelWaitStates);
}
--- /dev/null
+# RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefix=GCN %s
+
+# The V_ADDC in bb.0 writes $vcc and the buffer load at the top of bb.1
+# reads $vcc_lo across a fall-through edge: the hazard scan must cross the
+# block boundary and insert five nops.
+# GCN-LABEL: name: vmem_vcc_fallthrough
+# GCN: bb.1:
+# GCN-NEXT: S_NOP
+# GCN-NEXT: S_NOP
+# GCN-NEXT: S_NOP
+# GCN-NEXT: S_NOP
+# GCN-NEXT: S_NOP
+# GCN-NEXT: BUFFER_LOAD_DWORD_OFFEN
+---
+name: vmem_vcc_fallthrough
+body: |
+ bb.0:
+ successors: %bb.1
+
+ $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF
+ $vgpr0 = IMPLICIT_DEF
+ $vgpr1 = V_ADDC_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc, implicit $vcc, implicit $exec
+
+ bb.1:
+ $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, implicit $exec
+...
+# Same hazard as the fall-through case but with an explicit S_BRANCH at
+# the end of bb.0, which accounts for one wait state: only four nops.
+# GCN-LABEL: name: vmem_vcc_branch_to_next
+# GCN: bb.1:
+# GCN-NEXT: S_NOP
+# GCN-NEXT: S_NOP
+# GCN-NEXT: S_NOP
+# GCN-NEXT: S_NOP
+# GCN-NEXT: BUFFER_LOAD_DWORD_OFFEN
+---
+name: vmem_vcc_branch_to_next
+body: |
+ bb.0:
+ successors: %bb.1
+
+ $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF
+ $vgpr0 = IMPLICIT_DEF
+ $vgpr1 = V_ADDC_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc, implicit $vcc, implicit $exec
+ S_BRANCH %bb.1
+
+ bb.1:
+ $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, implicit $exec
+...
+# Five intervening s_mov instructions push the $vcc def out of the hazard
+# window, so the cross-block scan must expire and insert no nops.
+# GCN-LABEL: name: vmem_vcc_fallthrough_no_hazard_too_far
+# GCN: bb.1:
+# GCN-NEXT: BUFFER_LOAD_DWORD_OFFEN
+---
+name: vmem_vcc_fallthrough_no_hazard_too_far
+body: |
+ bb.0:
+ successors: %bb.1
+
+ $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF
+ $vgpr0 = IMPLICIT_DEF
+ $vgpr1 = V_ADDC_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc, implicit $vcc, implicit $exec
+ $sgpr0 = S_MOV_B32 0
+ $sgpr0 = S_MOV_B32 0
+ $sgpr0 = S_MOV_B32 0
+ $sgpr0 = S_MOV_B32 0
+ $sgpr0 = S_MOV_B32 0
+
+ bb.1:
+ $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, implicit $exec
+...
+# The existing S_NOP 4 already covers the hazard window, so no additional
+# nops are expected at the top of bb.1.
+# GCN-LABEL: name: vmem_vcc_fallthrough_no_hazard_nops
+# GCN: bb.1:
+# GCN-NEXT: BUFFER_LOAD_DWORD_OFFEN
+---
+name: vmem_vcc_fallthrough_no_hazard_nops
+body: |
+ bb.0:
+ successors: %bb.1
+
+ $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF
+ $vgpr0 = IMPLICIT_DEF
+ $vgpr1 = V_ADDC_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc, implicit $vcc, implicit $exec
+ S_NOP 4
+
+ bb.1:
+ $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, implicit $exec
+...
+# The hazard reaches bb.2 via the S_BRANCH from bb.0 (one wait state
+# elapsed), so four nops are needed even though the bb.1 path is clean.
+# GCN-LABEL: name: vmem_vcc_branch_around
+# GCN: bb.2:
+# GCN-NEXT: S_NOP
+# GCN-NEXT: S_NOP
+# GCN-NEXT: S_NOP
+# GCN-NEXT: S_NOP
+# GCN-NEXT: BUFFER_LOAD_DWORD_OFFEN
+---
+name: vmem_vcc_branch_around
+body: |
+ bb.0:
+ successors: %bb.2
+
+ $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF
+ $vgpr0 = IMPLICIT_DEF
+ $vgpr1 = V_ADDC_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc, implicit $vcc, implicit $exec
+ S_BRANCH %bb.2
+
+ bb.1:
+ successors: %bb.2
+
+ S_NOP 0
+ S_NOP 0
+ S_NOP 0
+ S_NOP 0
+
+ bb.2:
+ $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, implicit $exec
+...
+# The $vcc def reaches the load in bb.0 through the loop backedge from
+# bb.1 (the IMPLICIT_DEFs are not counted): four nops expected.
+# GCN-LABEL: name: vmem_vcc_branch_backedge
+# GCN: S_NOP
+# GCN-NEXT: S_NOP
+# GCN-NEXT: S_NOP
+# GCN-NEXT: S_NOP
+# GCN-NEXT: BUFFER_LOAD_DWORD_OFFEN
+---
+name: vmem_vcc_branch_backedge
+body: |
+ bb.0:
+ successors: %bb.1
+
+ $vgpr0 = IMPLICIT_DEF
+ $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF
+ $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, implicit $exec
+
+ bb.1:
+ $vgpr0 = IMPLICIT_DEF
+ $vgpr1 = V_ADDC_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc, implicit $vcc, implicit $exec
+ S_BRANCH %bb.0
+...
+# With two predecessors the scan must take the minimum distance; bb.1's
+# V_ADDC sits immediately before bb.2, so the full five nops are needed.
+# GCN-LABEL: name: vmem_vcc_min_of_two
+# GCN: bb.2:
+# GCN-NEXT: S_NOP
+# GCN-NEXT: S_NOP
+# GCN-NEXT: S_NOP
+# GCN-NEXT: S_NOP
+# GCN-NEXT: S_NOP
+# GCN-NEXT: BUFFER_LOAD_DWORD_OFFEN
+---
+name: vmem_vcc_min_of_two
+body: |
+ bb.0:
+ successors: %bb.2
+
+ $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF
+ $vgpr0 = IMPLICIT_DEF
+ $vgpr1 = V_ADDC_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc, implicit $vcc, implicit $exec
+ S_NOP 0
+ S_BRANCH %bb.2
+
+ bb.1:
+ successors: %bb.2
+
+ $vgpr1 = V_ADDC_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc, implicit $vcc, implicit $exec
+
+ bb.2:
+ $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, implicit $exec
+...
+# Single-block self loop: the hazard is found by scanning the same block
+# through its own backedge (V_ADDC + S_BRANCH = one wait state): four nops.
+# GCN-LABEL: name: vmem_vcc_self_loop
+# GCN: S_NOP
+# GCN-NEXT: S_NOP
+# GCN-NEXT: S_NOP
+# GCN-NEXT: S_NOP
+# GCN-NEXT: BUFFER_LOAD_DWORD_OFFEN
+---
+name: vmem_vcc_self_loop
+body: |
+ bb.0:
+ successors: %bb.0
+
+ $vgpr0 = IMPLICIT_DEF
+ $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF
+ $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, implicit $exec
+ $vgpr1 = V_ADDC_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc, implicit $vcc, implicit $exec
+ S_BRANCH %bb.0
+...
+# bb.1 is entered both from bb.0 and from itself; the closer $vcc def is
+# bb.0's V_ADDC, so four nops are inserted after the s_mov.
+# GCN-LABEL: name: vmem_vcc_min_of_two_self_loop1
+# GCN: bb.1:
+# GCN: $sgpr0 = S_MOV_B32 0
+# GCN-NEXT: S_NOP
+# GCN-NEXT: S_NOP
+# GCN-NEXT: S_NOP
+# GCN-NEXT: S_NOP
+# GCN-NEXT: BUFFER_LOAD_DWORD_OFFEN
+---
+name: vmem_vcc_min_of_two_self_loop1
+body: |
+ bb.0:
+ successors: %bb.1
+
+ $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF
+ $vgpr0 = IMPLICIT_DEF
+ $vgpr1 = V_ADDC_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc, implicit $vcc, implicit $exec
+
+ bb.1:
+ successors: %bb.1
+
+ $sgpr0 = S_MOV_B32 0
+ $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, implicit $exec
+ $vgpr1 = V_ADDC_U32_e32 $vgpr1, $vgpr1, implicit-def $vcc, implicit $vcc, implicit $exec
+ S_BRANCH %bb.1
+...
+# As self_loop1 but with an extra s_nop in bb.0, bringing both incoming
+# paths to the same distance: only three nops after the s_mov.
+# GCN-LABEL: name: vmem_vcc_min_of_two_self_loop2
+# GCN: bb.1:
+# GCN: $sgpr0 = S_MOV_B32 0
+# GCN-NEXT: S_NOP
+# GCN-NEXT: S_NOP
+# GCN-NEXT: S_NOP
+# GCN-NEXT: BUFFER_LOAD_DWORD_OFFEN
+---
+name: vmem_vcc_min_of_two_self_loop2
+body: |
+ bb.0:
+ successors: %bb.1
+
+ $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF
+ $vgpr0 = IMPLICIT_DEF
+ $vgpr1 = V_ADDC_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc, implicit $vcc, implicit $exec
+ S_NOP 0
+
+ bb.1:
+ successors: %bb.1
+
+ $sgpr0 = S_MOV_B32 0
+ $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, implicit $exec
+ $vgpr1 = V_ADDC_U32_e32 $vgpr1, $vgpr1, implicit-def $vcc, implicit $vcc, implicit $exec
+ S_BRANCH %bb.1
+...