From a1445b51829089061793897f75f0bdec1c379b7a Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>
Date: Tue, 16 May 2017 16:11:26 +0000
Subject: [PATCH] [AMDGPU] Cache live-ins and register pressure in scheduler

Using LIS can be quite expensive, so caching of calculated region
live-ins and pressure is implemented. It does two things:

1. Caches the info for the second stage when we schedule with
   decreased target occupancy.
2. Tracks the basic block from top to bottom thus eliminating the
   need to scan whole register file liveness at every region split
   in the middle of the block.

The scheduling is now done in 3 stages instead of two, with the first
one being really a no-op and only used to collect scheduling regions
as sent by the scheduler driver.

There is no functional change to the current behavior, only compilation
speed is affected. In general computeBlockPressure() could be simplified
if we switch to backward RP tracker, because scheduler sends regions
within a block starting from the last upward. We could use a natural
order of upward tracker to seamlessly change between regions of the same
block, since live reg set of a previous tracked region would become a
live-out of the next region. That however requires fixing upward tracker
to properly account defs and uses of the same instruction as both are
contributing to the current pressure. When we converge on the produced
pressure we should be able to switch between them back and forth. In
addition, backward tracker is less expensive as it uses LIS in recede
less often than forward uses it in advance.

At the moment the worst known case compilation time has improved from 26
minutes to 8.5.

Differential Revision: https://reviews.llvm.org/D33117

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@303184 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AMDGPU/GCNSchedStrategy.cpp | 211 ++++++++++++++++---------
 lib/Target/AMDGPU/GCNSchedStrategy.h   |  18 ++-
 2 files changed, 154 insertions(+), 75 deletions(-)
diff --git a/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 94949e9fc5e..c6d6022b71c 100644
--- a/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -316,12 +316,18 @@ GCNScheduleDAGMILive::GCNScheduleDAGMILive(MachineSchedContext *C,
   MFI(*MF.getInfo<SIMachineFunctionInfo>()),
   StartingOccupancy(ST.getOccupancyWithLocalMemSize(MFI.getLDSSize(),
                                                     *MF.getFunction())),
-  MinOccupancy(StartingOccupancy), Stage(0) {
+  MinOccupancy(StartingOccupancy), Stage(0), RegionIdx(0) {
 
   DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n");
 }
 
 void GCNScheduleDAGMILive::schedule() {
+  if (Stage == 0) {
+    // Just record regions at the first pass.
+    Regions.push_back(std::make_pair(RegionBegin, RegionEnd));
+    return;
+  }
+
   std::vector<MachineInstr*> Unsched;
   Unsched.reserve(NumRegionInstrs);
   for (auto &I : *this)
@@ -329,18 +335,27 @@ void GCNScheduleDAGMILive::schedule() {
 
   GCNRegPressure PressureBefore;
   if (LIS) {
-    discoverLiveIns();
-    PressureBefore = getRealRegPressure();
-
-    DEBUG(dbgs() << "Pressure before scheduling:\nSGPR = "
+    PressureBefore = Pressure[RegionIdx];
+
+    DEBUG(const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo*>(TRI);
+          dbgs() << "Pressure before scheduling:\nRegion live-ins:";
+          for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
+            unsigned Reg = TargetRegisterInfo::index2VirtReg(I);
+            auto It = LiveIns[RegionIdx].find(Reg);
+            if (It != LiveIns[RegionIdx].end() && It->second.any())
+              dbgs() << ' ' << PrintVRegOrUnit(Reg, SRI) << ':'
+                     << PrintLaneMask(It->second);
+          }
+          auto P = llvm::getRegPressure(MRI, LiveIns[RegionIdx]);
+          dbgs() << "\nLive-in pressure:\nSGPR = " << P.getSGPRNum()
+                 << "\nVGPR = " << P.getVGPRNum()
+                 << "\nReal region's register pressure:\nSGPR = "
                  << PressureBefore.getSGPRNum()
                  << "\nVGPR = " << PressureBefore.getVGPRNum() << '\n');
-
   }
 
   ScheduleDAGMILive::schedule();
-  if (Stage == 0)
-    Regions.push_back(std::make_pair(RegionBegin, RegionEnd));
+  Regions[RegionIdx] = std::make_pair(RegionBegin, RegionEnd);
 
   if (!LIS)
     return;
@@ -353,10 +368,9 @@ void GCNScheduleDAGMILive::schedule() {
                << PressureAfter.getSGPRNum()
                << "\nVGPR = " << PressureAfter.getVGPRNum() << '\n');
 
-  LiveIns.clear();
-
   if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit &&
       PressureAfter.getVGPRNum() <= S.VGPRCriticalLimit) {
+    Pressure[RegionIdx] = PressureAfter;
     DEBUG(dbgs() << "Pressure in desired limits, done.\n");
     return;
   }
@@ -376,8 +390,10 @@ void GCNScheduleDAGMILive::schedule() {
                  << MinOccupancy << ".\n");
   }
 
-  if (WavesAfter >= WavesBefore)
+  if (WavesAfter >= WavesBefore) {
+    Pressure[RegionIdx] = PressureAfter;
     return;
+  }
 
   DEBUG(dbgs() << "Attempting to revert scheduling.\n");
   RegionEnd = RegionBegin;
@@ -406,86 +422,139 @@ void GCNScheduleDAGMILive::schedule() {
     DEBUG(dbgs() << "Scheduling " << *MI);
   }
   RegionBegin = Unsched.front()->getIterator();
-  if (Stage == 0)
-    Regions.back() = std::make_pair(RegionBegin, RegionEnd);
+  Regions[RegionIdx] = std::make_pair(RegionBegin, RegionEnd);
 
   placeDebugValues();
 }
 
-void GCNScheduleDAGMILive::discoverLiveIns() {
+GCNRegPressure GCNScheduleDAGMILive::getRealRegPressure() const {
   GCNDownwardRPTracker RPTracker(*LIS);
-  RPTracker.reset(*begin());
-
-  LiveIns = RPTracker.moveLiveRegs();
-
-  DEBUG(GCNRegPressure LiveInPressure = RPTracker.moveMaxPressure();
-        const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo*>(TRI);
-        dbgs() << "Region live-ins:";
-        for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
-          unsigned Reg = TargetRegisterInfo::index2VirtReg(I);
-          auto It = LiveIns.find(Reg);
-          if (It != LiveIns.end())
-            dbgs() << ' ' << PrintVRegOrUnit(Reg, SRI) << ':'
-                   << PrintLaneMask(It->second);
-        }
-        dbgs() << "\nLive-in pressure:\nSGPR = "
-               << LiveInPressure.getSGPRNum()
-               << "\nVGPR = " << LiveInPressure.getVGPRNum() << '\n');
+  RPTracker.advance(begin(), end(), &LiveIns[RegionIdx]);
+  return RPTracker.moveMaxPressure();
 }
 
-GCNRegPressure GCNScheduleDAGMILive::getRealRegPressure() const {
+void GCNScheduleDAGMILive::computeBlockPressure(const MachineBasicBlock *MBB) {
   GCNDownwardRPTracker RPTracker(*LIS);
-  RPTracker.advance(begin(), end(), &LiveIns);
-  return RPTracker.moveMaxPressure();
+
+  // If the block has the only successor then live-ins of that successor are
+  // live-outs of the current block. We can reuse calculated live set if the
+  // successor will be sent to scheduling past current block.
+  const MachineBasicBlock *OnlySucc = nullptr;
+  if (MBB->succ_size() == 1 && !(*MBB->succ_begin())->empty()) {
+    SlotIndexes *Ind = LIS->getSlotIndexes();
+    if (Ind->getMBBStartIdx(MBB) < Ind->getMBBStartIdx(*MBB->succ_begin()))
+      OnlySucc = *MBB->succ_begin();
+  }
+
+  // Scheduler sends regions from the end of the block upwards.
+  size_t CurRegion = RegionIdx;
+  for (size_t E = Regions.size(); CurRegion != E; ++CurRegion)
+    if (Regions[CurRegion].first->getParent() != MBB)
+      break;
+  --CurRegion;
+
+  auto I = MBB->begin();
+  auto LiveInIt = MBBLiveIns.find(MBB);
+  if (LiveInIt != MBBLiveIns.end()) {
+    auto LiveIn = std::move(LiveInIt->second);
+    RPTracker.reset(*MBB->begin(), &LiveIn);
+    MBBLiveIns.erase(LiveInIt);
+  } else {
+    I = Regions[CurRegion].first;
+    RPTracker.reset(*I);
+  }
+
+  for ( ; ; ) {
+    I = RPTracker.getNext();
+
+    if (Regions[CurRegion].first == I) {
+      LiveIns[CurRegion] = RPTracker.getLiveRegs();
+      RPTracker.clearMaxPressure();
+    }
+
+    if (Regions[CurRegion].second == I) {
+      Pressure[CurRegion] = RPTracker.moveMaxPressure();
+      if (CurRegion-- == RegionIdx)
+        break;
+    }
+    RPTracker.advanceToNext();
+    RPTracker.advanceBeforeNext();
+  }
+
+  if (OnlySucc) {
+    if (I != MBB->end()) {
+      RPTracker.advanceToNext();
+      RPTracker.advance(MBB->end());
+    }
+    RPTracker.reset(*OnlySucc->begin(), &RPTracker.getLiveRegs());
+    RPTracker.advanceBeforeNext();
+    MBBLiveIns[OnlySucc] = RPTracker.moveLiveRegs();
+  }
 }
 
 void GCNScheduleDAGMILive::finalizeSchedule() {
-  // Retry function scheduling if we found resulting occupancy and it is
-  // lower than used for first pass scheduling. This will give more freedom
-  // to schedule low register pressure blocks.
-  // Code is partially copied from MachineSchedulerBase::scheduleRegions().
+  GCNMaxOccupancySchedStrategy &S = (GCNMaxOccupancySchedStrategy&)*SchedImpl;
+  DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n");
 
-  if (!LIS || StartingOccupancy <= MinOccupancy)
-    return;
+  LiveIns.resize(Regions.size());
+  Pressure.resize(Regions.size());
 
-  DEBUG(dbgs() << "Retrying function scheduling with lowest recorded occupancy "
-               << MinOccupancy << ".\n");
+  do {
+    Stage++;
+    RegionIdx = 0;
+    MachineBasicBlock *MBB = nullptr;
 
-  Stage++;
-  GCNMaxOccupancySchedStrategy &S = (GCNMaxOccupancySchedStrategy&)*SchedImpl;
-  S.setTargetOccupancy(MinOccupancy);
+    if (Stage > 1) {
+      // Retry function scheduling if we found resulting occupancy and it is
+      // lower than used for first pass scheduling. This will give more freedom
+      // to schedule low register pressure blocks.
+      // Code is partially copied from MachineSchedulerBase::scheduleRegions().
 
-  MachineBasicBlock *MBB = nullptr;
-  for (auto Region : Regions) {
-    RegionBegin = Region.first;
-    RegionEnd = Region.second;
+      if (!LIS || StartingOccupancy <= MinOccupancy)
+        break;
 
-    if (RegionBegin->getParent() != MBB) {
-      if (MBB) finishBlock();
-      MBB = RegionBegin->getParent();
-      startBlock(MBB);
+      DEBUG(dbgs()
+              << "Retrying function scheduling with lowest recorded occupancy "
+              << MinOccupancy << ".\n");
+
+      S.setTargetOccupancy(MinOccupancy);
     }
 
-    unsigned NumRegionInstrs = std::distance(begin(), end());
-    enterRegion(MBB, begin(), end(), NumRegionInstrs);
+    for (auto Region : Regions) {
+      RegionBegin = Region.first;
+      RegionEnd = Region.second;
+
+      if (RegionBegin->getParent() != MBB) {
+        if (MBB) finishBlock();
+        MBB = RegionBegin->getParent();
+        startBlock(MBB);
+        if (Stage == 1)
+          computeBlockPressure(MBB);
+      }
+
+      unsigned NumRegionInstrs = std::distance(begin(), end());
+      enterRegion(MBB, begin(), end(), NumRegionInstrs);
+
+      // Skip empty scheduling regions (0 or 1 schedulable instructions).
+      if (begin() == end() || begin() == std::prev(end())) {
+        exitRegion();
+        continue;
+      }
+
+      DEBUG(dbgs() << "********** MI Scheduling **********\n");
+      DEBUG(dbgs() << MF.getName()
+            << ":BB#" << MBB->getNumber() << " " << MBB->getName()
+            << "\n  From: " << *begin() << "    To: ";
+            if (RegionEnd != MBB->end()) dbgs() << *RegionEnd;
+            else dbgs() << "End";
+            dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n');
+
+      schedule();
 
-    // Skip empty scheduling regions (0 or 1 schedulable instructions).
-    if (begin() == end() || begin() == std::prev(end())) {
       exitRegion();
-      continue;
+      ++RegionIdx;
     }
-    DEBUG(dbgs() << "********** MI Scheduling **********\n");
-    DEBUG(dbgs() << MF.getName()
-          << ":BB#" << MBB->getNumber() << " " << MBB->getName()
-          << "\n  From: " << *begin() << "    To: ";
-          if (RegionEnd != MBB->end()) dbgs() << *RegionEnd;
-          else dbgs() << "End";
-          dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n');
+    finishBlock();
 
-    schedule();
-
-    exitRegion();
-  }
-  finishBlock();
-  LiveIns.shrink_and_clear();
+  } while (Stage < 2);
 }
diff --git a/lib/Target/AMDGPU/GCNSchedStrategy.h b/lib/Target/AMDGPU/GCNSchedStrategy.h
index a9b68a3d59d..3ed3cd5b3b1 100644
--- a/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -75,19 +75,29 @@ class GCNScheduleDAGMILive : public ScheduleDAGMILive {
   // Scheduling stage number.
   unsigned Stage;
 
+  // Current region index.
+  size_t RegionIdx;
+
   // Vecor of regions recorder for later rescheduling
   SmallVector<std::pair<MachineBasicBlock::iterator,
                         MachineBasicBlock::iterator>, 32> Regions;
 
-  // Region live-ins.
-  GCNRPTracker::LiveRegSet LiveIns;
+  // Region live-in cache.
+  SmallVector<GCNRPTracker::LiveRegSet, 32> LiveIns;
+
+  // Region pressure cache.
+  SmallVector<GCNRegPressure, 32> Pressure;
 
-  // Collect current region live-ins.
-  void discoverLiveIns();
+  // Temporary basic block live-in cache.
+  DenseMap<const MachineBasicBlock*, GCNRPTracker::LiveRegSet> MBBLiveIns;
 
   // Return current region pressure.
   GCNRegPressure getRealRegPressure() const;
 
+  // Compute and cache live-ins and pressure for all regions in block.
+  void computeBlockPressure(const MachineBasicBlock *MBB);
+
+
 public:
   GCNScheduleDAGMILive(MachineSchedContext *C,
                        std::unique_ptr<MachineSchedStrategy> S);
-- 
2.40.0