From 439cc2c5de80f5810c1733445673658a446d4a0c Mon Sep 17 00:00:00 2001
From: Daniel Jasper <djasper@google.com>
Date: Sat, 14 Mar 2015 10:58:38 +0000
Subject: [PATCH] [MachineLICM] First steps of sinking GEPs near calls.

Specifically, if there are copy-like instructions in the loop header
they are moved into the loop close to their uses. This reduces the live
intervals of the values and can avoid register spills.

This is working towards a fix for http://llvm.org/PR22230.
Review: http://reviews.llvm.org/D7259

Next steps:
- Find a better cost model (which non-copy instructions should be sunk?)
- Make this dependent on register pressure

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@232262 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/MachineLICM.cpp                 | 61 ++++++++++++++++++++
 test/CodeGen/X86/sink-cheap-instructions.ll | 62 +++++++++++++++++++++
 2 files changed, 123 insertions(+)
 create mode 100644 test/CodeGen/X86/sink-cheap-instructions.ll
diff --git a/lib/CodeGen/MachineLICM.cpp b/lib/CodeGen/MachineLICM.cpp
index 64d0932087f..2f65a2ec25f 100644
--- a/lib/CodeGen/MachineLICM.cpp
+++ b/lib/CodeGen/MachineLICM.cpp
@@ -54,6 +54,12 @@ HoistCheapInsts("hoist-cheap-insts",
                 cl::desc("MachineLICM should hoist even cheap instructions"),
                 cl::init(false), cl::Hidden);
 
+static cl::opt<bool>
+SinkInstsToAvoidSpills("sink-insts-to-avoid-spills",
+                       cl::desc("MachineLICM should sink instructions into "
+                                "loops to avoid register spills"),
+                       cl::init(false), cl::Hidden);
+
 STATISTIC(NumHoisted,
           "Number of machine instructions hoisted out of loops");
 STATISTIC(NumLowRP,
@@ -243,6 +249,11 @@ namespace {
     void HoistOutOfLoop(MachineDomTreeNode *LoopHeaderNode);
     void HoistRegion(MachineDomTreeNode *N, bool IsHeader);
 
+    /// SinkIntoLoop - Sink instructions into loops if profitable. This
+    /// especially tries to prevent register spills caused by register pressure
+    /// if there is little to no overhead moving instructions into loops.
+    void SinkIntoLoop();
+
     /// getRegisterClassIDAndCost - For a given MI, register, and the operand
     /// index, return the ID and cost of its representative register class by
     /// reference.
@@ -381,6 +392,9 @@ bool MachineLICM::runOnMachineFunction(MachineFunction &MF) {
       FirstInLoop = true;
       HoistOutOfLoop(N);
       CSEMap.clear();
+
+      if (SinkInstsToAvoidSpills)
+        SinkIntoLoop();
     }
   }
 
@@ -771,6 +785,53 @@ void MachineLICM::HoistOutOfLoop(MachineDomTreeNode *HeaderN) {
   }
 }
 
+void MachineLICM::SinkIntoLoop() {
+  MachineBasicBlock *Preheader = getCurPreheader();
+  if (!Preheader)
+    return;
+
+  SmallVector<MachineInstr *, 8> Candidates;
+  for (MachineBasicBlock::instr_iterator I = Preheader->instr_begin();
+       I != Preheader->instr_end(); ++I) {
+    // We need to ensure that we can safely move this instruction into the loop.
+    // As such, it must not have side-effects, e.g. such as a call has.  
+    if (IsLoopInvariantInst(*I) && !HasLoopPHIUse(I))
+      Candidates.push_back(I);
+  }
+
+  for (MachineInstr *I : Candidates) {
+    const MachineOperand &MO = I->getOperand(0);
+    if (!MO.isDef() || !MO.isReg() || !MO.getReg())
+      continue;
+    if (!MRI->hasOneDef(MO.getReg()))
+      continue;
+    bool CanSink = true;
+    MachineBasicBlock *B = nullptr;
+    for (MachineInstr &MI : MRI->use_instructions(MO.getReg())) {
+      // FIXME: Come up with a proper cost model that estimates whether sinking
+      // the instruction (and thus possibly executing it on every loop
+      // iteration) is more expensive than a register.
+      // For now assumes that copies are cheap and thus almost always worth it.
+      if (!MI.isCopy()) {
+        CanSink = false;
+        break;
+      }
+      if (!B) {
+        B = MI.getParent();
+        continue;
+      }
+      B = DT->findNearestCommonDominator(B, MI.getParent());
+      if (!B) {
+        CanSink = false;
+        break;
+      }
+    }
+    if (!CanSink || !B || B == Preheader)
+      continue;
+    B->splice(B->getFirstNonPHI(), Preheader, I);
+  }
+}
+
 static bool isOperandKill(const MachineOperand &MO, MachineRegisterInfo *MRI) {
   return MO.isKill() || MRI->hasOneNonDBGUse(MO.getReg());
 }
diff --git a/test/CodeGen/X86/sink-cheap-instructions.ll b/test/CodeGen/X86/sink-cheap-instructions.ll
new file mode 100644
index 00000000000..9b9a6865af9
--- /dev/null
+++ b/test/CodeGen/X86/sink-cheap-instructions.ll
@@ -0,0 +1,62 @@
+; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s -check-prefix=CHECK
+; RUN: llc < %s -mtriple=x86_64-linux -sink-insts-to-avoid-spills | FileCheck %s -check-prefix=SINK
+
+; Ensure that we sink copy-like instructions into loops to avoid register
+; spills.
+
+; CHECK: Spill
+; SINK-NOT: Spill
+
+%struct.A = type { i32, i32, i32, i32, i32, i32 }
+
+define void @_Z1fPhP1A(i8* nocapture readonly %input, %struct.A* %a) {
+  %1 = getelementptr inbounds %struct.A, %struct.A* %a, i64 0, i32 0
+  %2 = getelementptr inbounds %struct.A, %struct.A* %a, i64 0, i32 1
+  %3 = getelementptr inbounds %struct.A, %struct.A* %a, i64 0, i32 2
+  %4 = getelementptr inbounds %struct.A, %struct.A* %a, i64 0, i32 3
+  %5 = getelementptr inbounds %struct.A, %struct.A* %a, i64 0, i32 4
+  %6 = getelementptr inbounds %struct.A, %struct.A* %a, i64 0, i32 5
+  br label %.backedge
+
+.backedge:
+  %.0 = phi i8* [ %input, %0 ], [ %7, %.backedge.backedge ]
+  %7 = getelementptr inbounds i8, i8* %.0, i64 1
+  %8 = load i8, i8* %7, align 1
+  switch i8 %8, label %.backedge.backedge [
+    i8 0, label %9
+    i8 10, label %10
+    i8 20, label %11
+    i8 30, label %12
+    i8 40, label %13
+    i8 50, label %14
+  ]
+
+; <label>:9
+  tail call void @_Z6assignPj(i32* %1)
+  br label %.backedge.backedge
+
+; <label>:10
+  tail call void @_Z6assignPj(i32* %2)
+  br label %.backedge.backedge
+
+.backedge.backedge:
+  br label %.backedge
+
+; <label>:11
+  tail call void @_Z6assignPj(i32* %3)
+  br label %.backedge.backedge
+
+; <label>:12
+  tail call void @_Z6assignPj(i32* %4)
+  br label %.backedge.backedge
+
+; <label>:13
+  tail call void @_Z6assignPj(i32* %5)
+  br label %.backedge.backedge
+
+; <label>:14
+  tail call void @_Z6assignPj(i32* %6)
+  br label %.backedge.backedge
+}
+
+declare void @_Z6assignPj(i32*)
-- 
2.50.1