granicus.if.org Git - llvm/commitdiff
[AMDGPU] Fixed hazard recognizer to walk predecessors
authorStanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>
Mon, 21 Jan 2019 19:11:26 +0000 (19:11 +0000)
committerStanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>
Mon, 21 Jan 2019 19:11:26 +0000 (19:11 +0000)
Fixes two problems with GCNHazardRecognizer:
1. It only scans up to 5 instructions emitted earlier.
2. It does not take control flow into account. An earlier instruction
from the previous basic block is not necessarily a predecessor.
At the same time a real predecessor block is not scanned.

The patch provides a way to distinguish between scheduler and
hazard recognizer mode. It is OK to work with emitted instructions
in the scheduler because we do not really know what will be emitted
later, or in what order. However, when the pass works as a hazard recognizer
the schedule is already finalized, and we have full access to the
instructions for the whole function, so we can properly traverse
predecessors and their instructions.

Differential Revision: https://reviews.llvm.org/D56923

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@351759 91177308-0d34-0410-b5e6-96231b3b80d8

lib/Target/AMDGPU/GCNHazardRecognizer.cpp
lib/Target/AMDGPU/GCNHazardRecognizer.h
lib/Target/AMDGPU/SIInstrInfo.cpp
lib/Target/AMDGPU/SIInstrInfo.h
test/CodeGen/AMDGPU/vmem-vcc-hazard.mir [new file with mode: 0644]

index bb1c6dbff2b75f77e178987d598def75db237d50..4af8a457a0e5dfe02d7786280ead0d901ce64156 100644 (file)
@@ -37,6 +37,7 @@ using namespace llvm;
 //===----------------------------------------------------------------------===//
 
 GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) :
+  IsHazardRecognizerMode(false),
   CurrCycleInstr(nullptr),
   MF(MF),
   ST(MF.getSubtarget<GCNSubtarget>()),
@@ -172,10 +173,19 @@ GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
 }
 
 unsigned GCNHazardRecognizer::PreEmitNoops(SUnit *SU) {
-  return PreEmitNoops(SU->getInstr());
+  IsHazardRecognizerMode = false;
+  return PreEmitNoopsCommon(SU->getInstr());
 }
 
 unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
+  IsHazardRecognizerMode = true;
+  CurrCycleInstr = MI;
+  unsigned W = PreEmitNoopsCommon(MI);
+  CurrCycleInstr = nullptr;
+  return W;
+}
+
+unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
   int WaitStates = std::max(0, checkAnyInstHazards(MI));
 
   if (SIInstrInfo::isSMRD(*MI))
@@ -231,7 +241,7 @@ void GCNHazardRecognizer::AdvanceCycle() {
   // Do not track non-instructions which do not affect the wait states.
   // If included, these instructions can lead to buffer overflow such that
   // detectable hazards are missed.
-  if (CurrCycleInstr->getOpcode() == AMDGPU::IMPLICIT_DEF)
+  if (CurrCycleInstr->isImplicitDef())
     return;
   else if (CurrCycleInstr->isDebugInstr())
     return;
@@ -265,41 +275,109 @@ void GCNHazardRecognizer::RecedeCycle() {
 // Helper Functions
 //===----------------------------------------------------------------------===//
 
-int GCNHazardRecognizer::getWaitStatesSince(
-    function_ref<bool(MachineInstr *)> IsHazard) {
+typedef function_ref<bool(MachineInstr *, int WaitStates)> IsExpiredFn;
+
+// Returns a minimum wait states since \p I walking all predecessors.
+// Only scans until \p IsExpired does not return true.
+// Can only be run in a hazard recognizer mode.
+static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
+                              MachineBasicBlock *MBB,
+                              MachineBasicBlock::reverse_instr_iterator I,
+                              int WaitStates,
+                              IsExpiredFn IsExpired,
+                              DenseSet<const MachineBasicBlock *> &Visited) {
+
+  for (auto E = MBB->rend() ; I != E; ++I) {
+    if (IsHazard(&*I))
+      return WaitStates;
+
+    if (I->isInlineAsm() || I->isImplicitDef() || I->isDebugInstr())
+      continue;
+
+    WaitStates += SIInstrInfo::getNumWaitStates(*I);
+
+    if (IsExpired(&*I, WaitStates))
+      return std::numeric_limits<int>::max();
+  }
+
+  int MinWaitStates = WaitStates;
+  bool Found = false;
+  for (MachineBasicBlock *Pred : MBB->predecessors()) {
+    if (!Visited.insert(Pred).second)
+      continue;
+
+    int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(),
+                               WaitStates, IsExpired, Visited);
+
+    if (W == std::numeric_limits<int>::max())
+      continue;
+
+    MinWaitStates = Found ? std::min(MinWaitStates, W) : W;
+    if (IsExpired(nullptr, MinWaitStates))
+      return MinWaitStates;
+
+    Found = true;
+  }
+
+  if (Found)
+    return MinWaitStates;
+
+  return std::numeric_limits<int>::max();
+}
+
+static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
+                              MachineInstr *MI,
+                              IsExpiredFn IsExpired) {
+  DenseSet<const MachineBasicBlock *> Visited;
+  return getWaitStatesSince(IsHazard, MI->getParent(),
+                            std::next(MI->getReverseIterator()),
+                            0, IsExpired, Visited);
+}
+
+int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
+  if (IsHazardRecognizerMode) {
+    auto IsExpiredFn = [Limit] (MachineInstr *, int WaitStates) {
+      return WaitStates >= Limit;
+    };
+    return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn);
+  }
+
   int WaitStates = 0;
   for (MachineInstr *MI : EmittedInstrs) {
     if (MI) {
       if (IsHazard(MI))
         return WaitStates;
 
-      unsigned Opcode = MI->getOpcode();
-      if (Opcode == AMDGPU::INLINEASM)
+      if (MI->isInlineAsm())
         continue;
     }
     ++WaitStates;
+
+    if (WaitStates >= Limit)
+      break;
   }
   return std::numeric_limits<int>::max();
 }
 
-int GCNHazardRecognizer::getWaitStatesSinceDef(
-    unsigned Reg, function_ref<bool(MachineInstr *)> IsHazardDef) {
+int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
+                                               IsHazardFn IsHazardDef,
+                                               int Limit) {
   const SIRegisterInfo *TRI = ST.getRegisterInfo();
 
   auto IsHazardFn = [IsHazardDef, TRI, Reg] (MachineInstr *MI) {
     return IsHazardDef(MI) && MI->modifiesRegister(Reg, TRI);
   };
 
-  return getWaitStatesSince(IsHazardFn);
+  return getWaitStatesSince(IsHazardFn, Limit);
 }
 
-int GCNHazardRecognizer::getWaitStatesSinceSetReg(
-    function_ref<bool(MachineInstr *)> IsHazard) {
+int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
+                                                  int Limit) {
   auto IsHazardFn = [IsHazard] (MachineInstr *MI) {
     return isSSetReg(MI->getOpcode()) && IsHazard(MI);
   };
 
-  return getWaitStatesSince(IsHazardFn);
+  return getWaitStatesSince(IsHazardFn, Limit);
 }
 
 //===----------------------------------------------------------------------===//
@@ -397,7 +475,8 @@ int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
     if (!Use.isReg())
       continue;
     int WaitStatesNeededForUse =
-        SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn);
+        SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
+                                                   SmrdSgprWaitStates);
     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
 
     // This fixes what appears to be undocumented hardware behavior in SI where
@@ -410,7 +489,8 @@ int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
     if (IsBufferSMRD) {
       int WaitStatesNeededForUse =
         SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
-                                                   IsBufferHazardDefFn);
+                                                   IsBufferHazardDefFn,
+                                                   SmrdSgprWaitStates);
       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
     }
   }
@@ -434,7 +514,8 @@ int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
       continue;
 
     int WaitStatesNeededForUse =
-        VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn);
+        VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
+                                                   VmemSgprWaitStates);
     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
   }
   return WaitStatesNeeded;
@@ -454,13 +535,16 @@ int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) {
     if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
       continue;
     int WaitStatesNeededForUse =
-        DppVgprWaitStates - getWaitStatesSinceDef(Use.getReg());
+        DppVgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
+                              [](MachineInstr *) { return true; },
+                              DppVgprWaitStates);
     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
   }
 
   WaitStatesNeeded = std::max(
       WaitStatesNeeded,
-      DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn));
+      DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
+                                                DppExecWaitStates));
 
   return WaitStatesNeeded;
 }
@@ -472,7 +556,8 @@ int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) {
   // instruction.
   const int DivFMasWaitStates = 4;
   auto IsHazardDefFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); };
-  int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn);
+  int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
+                                               DivFMasWaitStates);
 
   return DivFMasWaitStates - WaitStatesNeeded;
 }
@@ -485,7 +570,7 @@ int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
   auto IsHazardFn = [TII, GetRegHWReg] (MachineInstr *MI) {
     return GetRegHWReg == getHWReg(TII, *MI);
   };
-  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn);
+  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);
 
   return GetRegWaitStates - WaitStatesNeeded;
 }
@@ -499,7 +584,7 @@ int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
   auto IsHazardFn = [TII, HWReg] (MachineInstr *MI) {
     return HWReg == getHWReg(TII, *MI);
   };
-  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn);
+  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
   return SetRegWaitStates - WaitStatesNeeded;
 }
 
@@ -570,7 +655,7 @@ int GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
     TRI->regsOverlap(MI->getOperand(DataIdx).getReg(), Reg);
   };
   int WaitStatesNeededForDef =
-    VALUWaitStates - getWaitStatesSince(IsHazardFn);
+    VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
   WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
 
   return WaitStatesNeeded;
@@ -635,7 +720,8 @@ int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
   };
 
   const int RWLaneWaitStates = 4;
-  int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn);
+  int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,
+                                              RWLaneWaitStates);
   return RWLaneWaitStates - WaitStatesSince;
 }
 
@@ -650,7 +736,7 @@ int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
   auto IsHazardFn = [TII] (MachineInstr *MI) {
     return getHWReg(TII, *MI) == AMDGPU::Hwreg::ID_TRAPSTS;
   };
-  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn);
+  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
   return RFEWaitStates - WaitStatesNeeded;
 }
 
@@ -674,7 +760,8 @@ int GCNHazardRecognizer::checkAnyInstHazards(MachineInstr *MI) {
       return MI->getOpcode() == AMDGPU::S_MOV_FED_B32;
     };
     int WaitStatesNeededForUse =
-        MovFedWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardFn);
+        MovFedWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardFn,
+                                                 MovFedWaitStates);
     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
   }
 
@@ -687,5 +774,6 @@ int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
   auto IsHazardFn = [TII] (MachineInstr *MI) {
     return TII->isSALU(*MI);
   };
-  return SMovRelWaitStates - getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn);
+  return SMovRelWaitStates - getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn,
+                                                   SMovRelWaitStates);
 }
index cc29d74a54809043c181f7057531c308f9187511..2a6a2deed2c1e4d03264839f6ca9f4e89d03d1a6 100644 (file)
@@ -30,6 +30,13 @@ class SIRegisterInfo;
 class GCNSubtarget;
 
 class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
+public:
+  typedef function_ref<bool(MachineInstr *)> IsHazardFn;
+
+private:
+  // Distinguish if we are called from scheduler or hazard recognizer
+  bool IsHazardRecognizerMode;
+
   // This variable stores the instruction that has been emitted this cycle. It
   // will be added to EmittedInstrs, when AdvanceCycle() or RecedeCycle() is
   // called.
@@ -53,11 +60,9 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
 
   void addClauseInst(const MachineInstr &MI);
 
-  int getWaitStatesSince(function_ref<bool(MachineInstr *)> IsHazard);
-  int getWaitStatesSinceDef(unsigned Reg,
-                            function_ref<bool(MachineInstr *)> IsHazardDef =
-                                [](MachineInstr *) { return true; });
-  int getWaitStatesSinceSetReg(function_ref<bool(MachineInstr *)> IsHazard);
+  int getWaitStatesSince(IsHazardFn IsHazard, int Limit);
+  int getWaitStatesSinceDef(unsigned Reg, IsHazardFn IsHazardDef, int Limit);
+  int getWaitStatesSinceSetReg(IsHazardFn IsHazard, int Limit);
 
   int checkSoftClauseHazards(MachineInstr *SMEM);
   int checkSMRDHazards(MachineInstr *SMRD);
@@ -84,6 +89,7 @@ public:
   void EmitNoop() override;
   unsigned PreEmitNoops(SUnit *SU) override;
   unsigned PreEmitNoops(MachineInstr *) override;
+  unsigned PreEmitNoopsCommon(MachineInstr *);
   void AdvanceCycle() override;
   void RecedeCycle() override;
 };
index 595945e11f1434e529c52e14e7c5470830ccd2d4..88bab949a81548fcf641d7282346bad6860fb33f 100644 (file)
@@ -1152,7 +1152,7 @@ void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const {
   }
 }
 
-unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) const {
+unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) {
   switch (MI.getOpcode()) {
   default: return 1; // FIXME: Do wait states equal cycles?
 
index de250b9c62ccda2199d5c8f333a170ee7b164d64..bdc53f6257ea04979f0988242a5fcbc6ad66a983 100644 (file)
@@ -837,7 +837,7 @@ public:
   void insertReturn(MachineBasicBlock &MBB) const;
   /// Return the number of wait states that result from executing this
   /// instruction.
-  unsigned getNumWaitStates(const MachineInstr &MI) const;
+  static unsigned getNumWaitStates(const MachineInstr &MI);
 
   /// Returns the operand named \p Op.  If \p MI does not have an
   /// operand named \c Op, this function returns nullptr.
diff --git a/test/CodeGen/AMDGPU/vmem-vcc-hazard.mir b/test/CodeGen/AMDGPU/vmem-vcc-hazard.mir
new file mode 100644 (file)
index 0000000..5ccdf14
--- /dev/null
@@ -0,0 +1,230 @@
+# RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefix=GCN %s
+
+# GCN-LABEL: name: vmem_vcc_fallthrough
+# GCN:      bb.1:
+# GCN-NEXT: S_NOP
+# GCN-NEXT: S_NOP
+# GCN-NEXT: S_NOP
+# GCN-NEXT: S_NOP
+# GCN-NEXT: S_NOP
+# GCN-NEXT: BUFFER_LOAD_DWORD_OFFEN
+---
+name:            vmem_vcc_fallthrough
+body:             |
+  bb.0:
+    successors: %bb.1
+
+    $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF
+    $vgpr0 = IMPLICIT_DEF
+    $vgpr1 = V_ADDC_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc, implicit $vcc, implicit $exec
+
+  bb.1:
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, implicit $exec
+...
+# GCN-LABEL: name: vmem_vcc_branch_to_next
+# GCN:      bb.1:
+# GCN-NEXT: S_NOP
+# GCN-NEXT: S_NOP
+# GCN-NEXT: S_NOP
+# GCN-NEXT: S_NOP
+# GCN-NEXT: BUFFER_LOAD_DWORD_OFFEN
+---
+name:            vmem_vcc_branch_to_next
+body:             |
+  bb.0:
+    successors: %bb.1
+
+    $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF
+    $vgpr0 = IMPLICIT_DEF
+    $vgpr1 = V_ADDC_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc, implicit $vcc, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.1:
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, implicit $exec
+...
+# GCN-LABEL: name: vmem_vcc_fallthrough_no_hazard_too_far
+# GCN:      bb.1:
+# GCN-NEXT: BUFFER_LOAD_DWORD_OFFEN
+---
+name:            vmem_vcc_fallthrough_no_hazard_too_far
+body:             |
+  bb.0:
+    successors: %bb.1
+
+    $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF
+    $vgpr0 = IMPLICIT_DEF
+    $vgpr1 = V_ADDC_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc, implicit $vcc, implicit $exec
+    $sgpr0 = S_MOV_B32 0
+    $sgpr0 = S_MOV_B32 0
+    $sgpr0 = S_MOV_B32 0
+    $sgpr0 = S_MOV_B32 0
+    $sgpr0 = S_MOV_B32 0
+
+  bb.1:
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, implicit $exec
+...
+# GCN-LABEL: name: vmem_vcc_fallthrough_no_hazard_nops
+# GCN:      bb.1:
+# GCN-NEXT: BUFFER_LOAD_DWORD_OFFEN
+---
+name:            vmem_vcc_fallthrough_no_hazard_nops
+body:             |
+  bb.0:
+    successors: %bb.1
+
+    $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF
+    $vgpr0 = IMPLICIT_DEF
+    $vgpr1 = V_ADDC_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc, implicit $vcc, implicit $exec
+    S_NOP 4
+
+  bb.1:
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, implicit $exec
+...
+# GCN-LABEL: name: vmem_vcc_branch_around
+# GCN:      bb.2:
+# GCN-NEXT: S_NOP
+# GCN-NEXT: S_NOP
+# GCN-NEXT: S_NOP
+# GCN-NEXT: S_NOP
+# GCN-NEXT: BUFFER_LOAD_DWORD_OFFEN
+---
+name:            vmem_vcc_branch_around
+body:             |
+  bb.0:
+    successors: %bb.2
+
+    $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF
+    $vgpr0 = IMPLICIT_DEF
+    $vgpr1 = V_ADDC_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc, implicit $vcc, implicit $exec
+    S_BRANCH %bb.2
+
+  bb.1:
+    successors: %bb.2
+
+    S_NOP 0
+    S_NOP 0
+    S_NOP 0
+    S_NOP 0
+
+  bb.2:
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, implicit $exec
+...
+# GCN-LABEL: name: vmem_vcc_branch_backedge
+# GCN:      S_NOP
+# GCN-NEXT: S_NOP
+# GCN-NEXT: S_NOP
+# GCN-NEXT: S_NOP
+# GCN-NEXT: BUFFER_LOAD_DWORD_OFFEN
+---
+name:            vmem_vcc_branch_backedge
+body:             |
+  bb.0:
+    successors: %bb.1
+
+    $vgpr0 = IMPLICIT_DEF
+    $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, implicit $exec
+
+  bb.1:
+    $vgpr0 = IMPLICIT_DEF
+    $vgpr1 = V_ADDC_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc, implicit $vcc, implicit $exec
+    S_BRANCH %bb.0
+...
+# GCN-LABEL: name: vmem_vcc_min_of_two
+# GCN:      bb.2:
+# GCN-NEXT: S_NOP
+# GCN-NEXT: S_NOP
+# GCN-NEXT: S_NOP
+# GCN-NEXT: S_NOP
+# GCN-NEXT: S_NOP
+# GCN-NEXT: BUFFER_LOAD_DWORD_OFFEN
+---
+name:            vmem_vcc_min_of_two
+body:             |
+  bb.0:
+    successors: %bb.2
+
+    $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF
+    $vgpr0 = IMPLICIT_DEF
+    $vgpr1 = V_ADDC_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc, implicit $vcc, implicit $exec
+    S_NOP 0
+    S_BRANCH %bb.2
+
+  bb.1:
+    successors: %bb.2
+
+    $vgpr1 = V_ADDC_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc, implicit $vcc, implicit $exec
+
+  bb.2:
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, implicit $exec
+...
+# GCN-LABEL: name: vmem_vcc_self_loop
+# GCN:      S_NOP
+# GCN-NEXT: S_NOP
+# GCN-NEXT: S_NOP
+# GCN-NEXT: S_NOP
+# GCN-NEXT: BUFFER_LOAD_DWORD_OFFEN
+---
+name:            vmem_vcc_self_loop
+body:             |
+  bb.0:
+    successors: %bb.0
+
+    $vgpr0 = IMPLICIT_DEF
+    $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = V_ADDC_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc, implicit $vcc, implicit $exec
+    S_BRANCH %bb.0
+...
+# GCN-LABEL: name: vmem_vcc_min_of_two_self_loop1
+# GCN:      bb.1:
+# GCN:      $sgpr0 = S_MOV_B32 0
+# GCN-NEXT: S_NOP
+# GCN-NEXT: S_NOP
+# GCN-NEXT: S_NOP
+# GCN-NEXT: S_NOP
+# GCN-NEXT: BUFFER_LOAD_DWORD_OFFEN
+---
+name:            vmem_vcc_min_of_two_self_loop1
+body:             |
+  bb.0:
+    successors: %bb.1
+
+    $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF
+    $vgpr0 = IMPLICIT_DEF
+    $vgpr1 = V_ADDC_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc, implicit $vcc, implicit $exec
+
+  bb.1:
+    successors: %bb.1
+
+    $sgpr0 = S_MOV_B32 0
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = V_ADDC_U32_e32 $vgpr1, $vgpr1, implicit-def $vcc, implicit $vcc, implicit $exec
+    S_BRANCH %bb.1
+...
+# GCN-LABEL: name: vmem_vcc_min_of_two_self_loop2
+# GCN:      bb.1:
+# GCN:      $sgpr0 = S_MOV_B32 0
+# GCN-NEXT: S_NOP
+# GCN-NEXT: S_NOP
+# GCN-NEXT: S_NOP
+# GCN-NEXT: BUFFER_LOAD_DWORD_OFFEN
+---
+name:            vmem_vcc_min_of_two_self_loop2
+body:             |
+  bb.0:
+    successors: %bb.1
+
+    $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF
+    $vgpr0 = IMPLICIT_DEF
+    $vgpr1 = V_ADDC_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc, implicit $vcc, implicit $exec
+    S_NOP 0
+
+  bb.1:
+    successors: %bb.1
+
+    $sgpr0 = S_MOV_B32 0
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = V_ADDC_U32_e32 $vgpr1, $vgpr1, implicit-def $vcc, implicit $vcc, implicit $exec
+    S_BRANCH %bb.1
+...