From ce8c51822a93c69541fbe46fc783bfdc45d6a6e6 Mon Sep 17 00:00:00 2001 From: Oliver Stannard Date: Wed, 3 Jul 2019 09:58:52 +0000 Subject: [PATCH] [ARM] Thumb2: favor R4-R7 over R12/LR in allocation order when opt for minsize For Thumb2, we prefer low regs (costPerUse = 0) to allow narrow encoding. However, current allocation order is like: R0-R3, R12, LR, R4-R11 As a result, a lot of instructions that use R12/LR will be wide instrs. This patch changes the allocation order to: R0-R7, R12, LR, R8-R11 for thumb2 and -Osize. In most cases, there is no extra push/pop instrs as they will be folded into existing ones. There might be slight performance impact due to more stack usage, so we only enable it when opt for min size. https://reviews.llvm.org/D30324 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@365014 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/CodeGen/TargetSubtargetInfo.h | 8 ++++ lib/CodeGen/RegisterClassInfo.cpp | 4 +- lib/Target/ARM/ARMRegisterInfo.td | 15 +++++--- lib/Target/ARM/ARMSubtarget.cpp | 42 +++++++++++++++++++++ lib/Target/ARM/ARMSubtarget.h | 4 ++ test/CodeGen/ARM/avoid-cpsr-rmw.ll | 2 +- test/CodeGen/ARM/favor-low-reg-for-Osize.ll | 29 ++++++++++++++ 7 files changed, 96 insertions(+), 8 deletions(-) create mode 100644 test/CodeGen/ARM/favor-low-reg-for-Osize.ll diff --git a/include/llvm/CodeGen/TargetSubtargetInfo.h b/include/llvm/CodeGen/TargetSubtargetInfo.h index 9057b2d87c3..037fc3ed324 100644 --- a/include/llvm/CodeGen/TargetSubtargetInfo.h +++ b/include/llvm/CodeGen/TargetSubtargetInfo.h @@ -291,6 +291,14 @@ public: /// This is called after a .mir file was loaded. virtual void mirFileLoaded(MachineFunction &MF) const; + + /// True if the register allocator should use the allocation orders exactly as + /// written in the tablegen descriptions, false if it should allocate + /// the specified physical register later if it is callee-saved. 
+ virtual bool ignoreCSRForAllocationOrder(const MachineFunction &MF, + unsigned PhysReg) const { + return false; + } }; } // end namespace llvm diff --git a/lib/CodeGen/RegisterClassInfo.cpp b/lib/CodeGen/RegisterClassInfo.cpp index 3d6687ef7b8..530e0cccf1d 100644 --- a/lib/CodeGen/RegisterClassInfo.cpp +++ b/lib/CodeGen/RegisterClassInfo.cpp @@ -90,6 +90,7 @@ void RegisterClassInfo::runOnMachineFunction(const MachineFunction &mf) { void RegisterClassInfo::compute(const TargetRegisterClass *RC) const { assert(RC && "no register class given"); RCInfo &RCI = RegClass[RC->getID()]; + auto &STI = MF->getSubtarget(); // Raw register count, including all reserved regs. unsigned NumRegs = RC->getNumRegs(); @@ -114,7 +115,8 @@ void RegisterClassInfo::compute(const TargetRegisterClass *RC) const { unsigned Cost = TRI->getCostPerUse(PhysReg); MinCost = std::min(MinCost, Cost); - if (CalleeSavedAliases[PhysReg]) + if (CalleeSavedAliases[PhysReg] && + !STI.ignoreCSRForAllocationOrder(*MF, PhysReg)) // PhysReg aliases a CSR, save it for later. CSRAlias.push_back(PhysReg); else { diff --git a/lib/Target/ARM/ARMRegisterInfo.td b/lib/Target/ARM/ARMRegisterInfo.td index 7844edc68e8..92ae26b3729 100644 --- a/lib/Target/ARM/ARMRegisterInfo.td +++ b/lib/Target/ARM/ARMRegisterInfo.td @@ -227,9 +227,10 @@ def GPR : RegisterClass<"ARM", [i32], 32, (add (sequence "R%u", 0, 12), // know how to spill them. If we make our prologue/epilogue code smarter at // some point, we can go back to using the above allocation orders for the // Thumb1 instructions that know how to use hi regs. 
- let AltOrders = [(add LR, GPR), (trunc GPR, 8)]; + let AltOrders = [(add LR, GPR), (trunc GPR, 8), + (add (trunc GPR, 8), R12, LR, (shl GPR, 8))]; let AltOrderSelect = [{ - return 1 + MF.getSubtarget().isThumb1Only(); + return MF.getSubtarget().getGPRAllocationOrder(MF); }]; let DiagnosticString = "operand must be a register in range [r0, r15]"; } @@ -238,9 +239,10 @@ def GPR : RegisterClass<"ARM", [i32], 32, (add (sequence "R%u", 0, 12), // certain operand slots, particularly as the destination. Primarily // useful for disassembly. def GPRnopc : RegisterClass<"ARM", [i32], 32, (sub GPR, PC)> { - let AltOrders = [(add LR, GPRnopc), (trunc GPRnopc, 8)]; + let AltOrders = [(add LR, GPRnopc), (trunc GPRnopc, 8), + (add (trunc GPRnopc, 8), R12, LR, (shl GPRnopc, 8))]; let AltOrderSelect = [{ - return 1 + MF.getSubtarget().isThumb1Only(); + return MF.getSubtarget().getGPRAllocationOrder(MF); }]; let DiagnosticString = "operand must be a register in range [r0, r14]"; } @@ -295,9 +297,10 @@ def GPRlr : RegisterClass<"ARM", [i32], 32, (add LR)>; // or SP (R13 or R15) are used. The ARM ISA refers to these operands // via the BadReg() pseudo-code description. 
def rGPR : RegisterClass<"ARM", [i32], 32, (sub GPR, SP, PC)> { - let AltOrders = [(add LR, rGPR), (trunc rGPR, 8)]; + let AltOrders = [(add LR, rGPR), (trunc rGPR, 8), + (add (trunc rGPR, 8), R12, LR, (shl rGPR, 8))]; let AltOrderSelect = [{ - return 1 + MF.getSubtarget().isThumb1Only(); + return MF.getSubtarget().getGPRAllocationOrder(MF); }]; let DiagnosticType = "rGPR"; } diff --git a/lib/Target/ARM/ARMSubtarget.cpp b/lib/Target/ARM/ARMSubtarget.cpp index 63f694199f4..978faed776b 100644 --- a/lib/Target/ARM/ARMSubtarget.cpp +++ b/lib/Target/ARM/ARMSubtarget.cpp @@ -413,3 +413,45 @@ bool ARMSubtarget::useFastISel() const { ((isTargetMachO() && !isThumb1Only()) || (isTargetLinux() && !isThumb()) || (isTargetNaCl() && !isThumb())); } + +unsigned ARMSubtarget::getGPRAllocationOrder(const MachineFunction &MF) const { + // The GPR register class has multiple possible allocation orders, with + // tradeoffs preferred by different sub-architectures and optimisation goals. + // The allocation orders are: + // 0: (the default tablegen order, not used) + // 1: r14, r0-r13 + // 2: r0-r7 + // 3: r0-r7, r12, lr, r8-r11 + // Note that the register allocator will change this order so that + // callee-saved registers are used later, as they require extra work in the + // prologue/epilogue (though we sometimes override that). + + // For thumb1-only targets, only the low registers are allocatable. + if (isThumb1Only()) + return 2; + + // Allocate low registers first, so we can select more 16-bit instructions. + // We also (in ignoreCSRForAllocationOrder) override the default behaviour + // with regards to callee-saved registers, because pushing extra registers is + // much cheaper (in terms of code size) than using high registers. After + // that, we allocate r12 (doesn't need to be saved), lr (saving it means we + // can return with the pop, don't need an extra "bx lr") and then the rest of + // the high registers. 
+ if (isThumb2() && MF.getFunction().hasMinSize()) + return 3; + + // Otherwise, allocate in the default order, using LR first because saving it + // allows a shorter epilogue sequence. + return 1; +} + +bool ARMSubtarget::ignoreCSRForAllocationOrder(const MachineFunction &MF, + unsigned PhysReg) const { + // To minimize code size in Thumb2, we prefer the usage of low regs (lower + // cost per use) so we can use narrow encoding. By default, caller-saved + // registers (e.g. lr, r12) are always allocated first, regardless of + // their cost per use. When optForMinSize, we prefer the low regs even if + // they are CSR because usually push/pop can be folded into existing ones. + return isThumb2() && MF.getFunction().hasMinSize() && + ARM::GPRRegClass.contains(PhysReg); +} diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h index 83113efb8db..c2b0f052b84 100644 --- a/lib/Target/ARM/ARMSubtarget.h +++ b/lib/Target/ARM/ARMSubtarget.h @@ -856,6 +856,10 @@ public: unsigned getPrefLoopAlignment() const { return PrefLoopAlignment; } + + bool ignoreCSRForAllocationOrder(const MachineFunction &MF, + unsigned PhysReg) const override; + unsigned getGPRAllocationOrder(const MachineFunction &MF) const; }; } // end namespace llvm diff --git a/test/CodeGen/ARM/avoid-cpsr-rmw.ll b/test/CodeGen/ARM/avoid-cpsr-rmw.ll index 9373c5d4421..5c8350af98f 100644 --- a/test/CodeGen/ARM/avoid-cpsr-rmw.ll +++ b/test/CodeGen/ARM/avoid-cpsr-rmw.ll @@ -60,7 +60,7 @@ entry: while.body: ; CHECK: while.body -; CHECK: mul r{{[0-9]+}} +; CHECK: muls r{{[0-9]+}} ; CHECK: muls %ptr1.addr.09 = phi i32* [ %add.ptr, %while.body ], [ %ptr1, %entry ] %ptr2.addr.08 = phi i32* [ %incdec.ptr, %while.body ], [ %ptr2, %entry ] diff --git a/test/CodeGen/ARM/favor-low-reg-for-Osize.ll b/test/CodeGen/ARM/favor-low-reg-for-Osize.ll new file mode 100644 index 00000000000..0ebdca3dbda --- /dev/null +++ b/test/CodeGen/ARM/favor-low-reg-for-Osize.ll @@ -0,0 +1,29 @@ +; REQUIRES: asserts +; RUN: llc 
-debug-only=regalloc < %s 2>%t | FileCheck %s --check-prefix=CHECK +; RUN: FileCheck %s < %t --check-prefix=DEBUG + +target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n8:16:32-S64" +target triple = "thumbv7m--linux-gnueabi" + + +; DEBUG: AllocationOrder(GPR) = [ $r0 $r1 $r2 $r3 $r4 $r5 $r6 $r7 $r12 $lr $r8 $r9 $r10 $r11 ] + +define i32 @test_minsize(i32 %x) optsize minsize { +; CHECK-LABEL: test_minsize: +entry: +; CHECK: mov r4, r0 + tail call void asm sideeffect "", "~{r0},~{r1},~{r2},~{r3}"() +; CHECK: mov r0, r4 + ret i32 %x +} + +; DEBUG: AllocationOrder(GPR) = [ $r0 $r1 $r2 $r3 $r12 $lr $r4 $r5 $r6 $r7 $r8 $r9 $r10 $r11 ] + +define i32 @test_optsize(i32 %x) optsize { +; CHECK-LABEL: test_optsize: +entry: +; CHECK: mov r12, r0 + tail call void asm sideeffect "", "~{r0},~{r1},~{r2},~{r3}"() +; CHECK: mov r0, r12 + ret i32 %x +} -- 2.50.1