[ARM] Thumb2: favor R4-R7 over R12/LR in allocation order when opt for minsize

author Oliver Stannard <oliver.stannard@linaro.org>

Wed, 3 Jul 2019 09:58:52 +0000 (09:58 +0000)

committer Oliver Stannard <oliver.stannard@linaro.org>

Wed, 3 Jul 2019 09:58:52 +0000 (09:58 +0000)
author Oliver Stannard <oliver.stannard@linaro.org>
Wed, 3 Jul 2019 09:58:52 +0000 (09:58 +0000)
committer Oliver Stannard <oliver.stannard@linaro.org>
Wed, 3 Jul 2019 09:58:52 +0000 (09:58 +0000)
diff --git a/include/llvm/CodeGen/TargetSubtargetInfo.h b/include/llvm/CodeGen/TargetSubtargetInfo.h

index 9057b2d87c3964c0fbe1022dd474141213630628..037fc3ed3243e08b4f31f0fe4ad3ea77b03ebeef 100644 (file)
--- a/include/llvm/CodeGen/TargetSubtargetInfo.h
+++ b/include/llvm/CodeGen/TargetSubtargetInfo.h
@@ -291,6 +291,14 @@ public:
  
    /// This is called after a .mir file was loaded.
    virtual void mirFileLoaded(MachineFunction &MF) const;
+
+  /// True if the register allocator should use the allocation orders exactly as
+  /// written in the tablegen descriptions, false if it should allocate
+  /// the specified physical register later if it is callee-saved.
+  virtual bool ignoreCSRForAllocationOrder(const MachineFunction &MF,
+                                           unsigned PhysReg) const {
+    return false;
+  }
  };
  
  } // end namespace llvm
diff --git a/lib/CodeGen/RegisterClassInfo.cpp b/lib/CodeGen/RegisterClassInfo.cpp

index 3d6687ef7b80b2c51f756bd1d323a6b2f060eb61..530e0cccf1d4823f312e6a7ab5d3611691814129 100644 (file)
--- a/lib/CodeGen/RegisterClassInfo.cpp
+++ b/lib/CodeGen/RegisterClassInfo.cpp
@@ -90,6 +90,7 @@ void RegisterClassInfo::runOnMachineFunction(const MachineFunction &mf) {
  void RegisterClassInfo::compute(const TargetRegisterClass *RC) const {
    assert(RC && "no register class given");
    RCInfo &RCI = RegClass[RC->getID()];
+  auto &STI = MF->getSubtarget();
  
    // Raw register count, including all reserved regs.
    unsigned NumRegs = RC->getNumRegs();
@@ -114,7 +115,8 @@ void RegisterClassInfo::compute(const TargetRegisterClass *RC) const {
      unsigned Cost = TRI->getCostPerUse(PhysReg);
      MinCost = std::min(MinCost, Cost);
  
-    if (CalleeSavedAliases[PhysReg])
+    if (CalleeSavedAliases[PhysReg] &&
+        !STI.ignoreCSRForAllocationOrder(*MF, PhysReg))
        // PhysReg aliases a CSR, save it for later.
        CSRAlias.push_back(PhysReg);
      else {
diff --git a/lib/Target/ARM/ARMRegisterInfo.td b/lib/Target/ARM/ARMRegisterInfo.td

index 7844edc68e80e94c60d9ccc6dd95eb26ceedd7c5..92ae26b3729d72e82799b60da8518842e1267f13 100644 (file)
--- a/lib/Target/ARM/ARMRegisterInfo.td
+++ b/lib/Target/ARM/ARMRegisterInfo.td
@@ -227,9 +227,10 @@ def GPR : RegisterClass<"ARM", [i32], 32, (add (sequence "R%u", 0, 12),
    // know how to spill them. If we make our prologue/epilogue code smarter at
    // some point, we can go back to using the above allocation orders for the
    // Thumb1 instructions that know how to use hi regs.
-  let AltOrders = [(add LR, GPR), (trunc GPR, 8)];
+  let AltOrders = [(add LR, GPR), (trunc GPR, 8),
+                   (add (trunc GPR, 8), R12, LR, (shl GPR, 8))];
    let AltOrderSelect = [{
-      return 1 + MF.getSubtarget<ARMSubtarget>().isThumb1Only();
+      return MF.getSubtarget<ARMSubtarget>().getGPRAllocationOrder(MF);
    }];
    let DiagnosticString = "operand must be a register in range [r0, r15]";
  }
@@ -238,9 +239,10 @@ def GPR : RegisterClass<"ARM", [i32], 32, (add (sequence "R%u", 0, 12),
  // certain operand slots, particularly as the destination.  Primarily
  // useful for disassembly.
  def GPRnopc : RegisterClass<"ARM", [i32], 32, (sub GPR, PC)> {
-  let AltOrders = [(add LR, GPRnopc), (trunc GPRnopc, 8)];
+  let AltOrders = [(add LR, GPRnopc), (trunc GPRnopc, 8),
+                   (add (trunc GPRnopc, 8), R12, LR, (shl GPRnopc, 8))];
    let AltOrderSelect = [{
-      return 1 + MF.getSubtarget<ARMSubtarget>().isThumb1Only();
+      return MF.getSubtarget<ARMSubtarget>().getGPRAllocationOrder(MF);
    }];
    let DiagnosticString = "operand must be a register in range [r0, r14]";
  }
@@ -295,9 +297,10 @@ def GPRlr : RegisterClass<"ARM", [i32], 32, (add LR)>;
  // or SP (R13 or R15) are used. The ARM ISA refers to these operands
  // via the BadReg() pseudo-code description.
  def rGPR : RegisterClass<"ARM", [i32], 32, (sub GPR, SP, PC)> {
-  let AltOrders = [(add LR, rGPR), (trunc rGPR, 8)];
+  let AltOrders = [(add LR, rGPR), (trunc rGPR, 8),
+                   (add (trunc rGPR, 8), R12, LR, (shl rGPR, 8))];
    let AltOrderSelect = [{
-      return 1 + MF.getSubtarget<ARMSubtarget>().isThumb1Only();
+      return MF.getSubtarget<ARMSubtarget>().getGPRAllocationOrder(MF);
    }];
    let DiagnosticType = "rGPR";
  }
diff --git a/lib/Target/ARM/ARMSubtarget.cpp b/lib/Target/ARM/ARMSubtarget.cpp

index 63f694199f44d69fc17532cc9998e9504904d356..978faed776b0e9a5aec8424a3468e3c0b4dfea75 100644 (file)
--- a/lib/Target/ARM/ARMSubtarget.cpp
+++ b/lib/Target/ARM/ARMSubtarget.cpp
@@ -413,3 +413,45 @@ bool ARMSubtarget::useFastISel() const {
           ((isTargetMachO() && !isThumb1Only()) ||
            (isTargetLinux() && !isThumb()) || (isTargetNaCl() && !isThumb()));
  }
+
+unsigned ARMSubtarget::getGPRAllocationOrder(const MachineFunction &MF) const {
+  // The GPR register class has multiple possible allocation orders, with
+  // tradeoffs preferred by different sub-architectures and optimisation goals.
+  // The allocation orders are:
+  // 0: (the default tablegen order, not used)
+  // 1: r14, r0-r13
+  // 2: r0-r7
+  // 3: r0-r7, r12, lr, r8-r11
+  // Note that the register allocator will change this order so that
+  // callee-saved registers are used later, as they require extra work in the
+  // prologue/epilogue (though we sometimes override that).
+
+  // For thumb1-only targets, only the low registers are allocatable.
+  if (isThumb1Only())
+    return 2;
+
+  // Allocate low registers first, so we can select more 16-bit instructions.
+  // We also (in ignoreCSRForAllocationOrder) override the default behaviour
+  // with regard to callee-saved registers, because pushing extra registers is
+  // much cheaper (in terms of code size) than using high registers. After
+  // that, we allocate r12 (doesn't need to be saved), lr (saving it means we
+  // can return with the pop, don't need an extra "bx lr") and then the rest of
+  // the high registers.
+  if (isThumb2() && MF.getFunction().hasMinSize())
+    return 3;
+
+  // Otherwise, allocate in the default order, using LR first because saving it
+  // allows a shorter epilogue sequence.
+  return 1;
+}
+
+bool ARMSubtarget::ignoreCSRForAllocationOrder(const MachineFunction &MF,
+                                               unsigned PhysReg) const {
+  // To minimize code size in Thumb2, we prefer the usage of low regs (lower
+  // cost per use) so we can use narrow encoding. By default, caller-saved
+  // registers (e.g. lr, r12) are always allocated first, regardless of
+  // their cost per use. When the function is minsize, we prefer the low regs
+  // even if they are CSR because usually push/pop can be folded into existing ones.
+  return isThumb2() && MF.getFunction().hasMinSize() &&
+         ARM::GPRRegClass.contains(PhysReg);
+}
diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h

index 83113efb8db765d645951c5147ca38081d805a43..c2b0f052b84351cb6c0c86669b539699a5a9ae11 100644 (file)
--- a/lib/Target/ARM/ARMSubtarget.h
+++ b/lib/Target/ARM/ARMSubtarget.h
@@ -856,6 +856,10 @@ public:
    unsigned getPrefLoopAlignment() const {
      return PrefLoopAlignment;
    }
+
+  bool ignoreCSRForAllocationOrder(const MachineFunction &MF,
+                                   unsigned PhysReg) const override;
+  unsigned getGPRAllocationOrder(const MachineFunction &MF) const;
  };
  
  } // end namespace llvm
diff --git a/test/CodeGen/ARM/avoid-cpsr-rmw.ll b/test/CodeGen/ARM/avoid-cpsr-rmw.ll

index 9373c5d44210f6883b7695d0c75c1ed5769d95db..5c8350af98f5a7580996cdc3a5b64e7bb8fbc6d5 100644 (file)
--- a/test/CodeGen/ARM/avoid-cpsr-rmw.ll
+++ b/test/CodeGen/ARM/avoid-cpsr-rmw.ll
@@ -60,7 +60,7 @@ entry:
  
  while.body:
  ; CHECK: while.body
-; CHECK: mul r{{[0-9]+}}
+; CHECK: muls r{{[0-9]+}}
  ; CHECK: muls
    %ptr1.addr.09 = phi i32* [ %add.ptr, %while.body ], [ %ptr1, %entry ]
    %ptr2.addr.08 = phi i32* [ %incdec.ptr, %while.body ], [ %ptr2, %entry ]
diff --git a/test/CodeGen/ARM/favor-low-reg-for-Osize.ll b/test/CodeGen/ARM/favor-low-reg-for-Osize.ll

new file mode 100644 (file)

index 0000000..0ebdca3
--- /dev/null
+++ b/test/CodeGen/ARM/favor-low-reg-for-Osize.ll
@@ -0,0 +1,29 @@
+; REQUIRES: asserts
+; RUN: llc -debug-only=regalloc < %s 2>%t | FileCheck %s --check-prefix=CHECK
+; RUN: FileCheck %s < %t --check-prefix=DEBUG
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n8:16:32-S64"
+target triple = "thumbv7m--linux-gnueabi"
+
+
+; DEBUG:         AllocationOrder(GPR) = [ $r0 $r1 $r2 $r3 $r4 $r5 $r6 $r7 $r12 $lr $r8 $r9 $r10 $r11 ]
+
+define i32 @test_minsize(i32 %x) optsize minsize {
+; CHECK-LABEL: test_minsize:
+entry:
+; CHECK: mov     r4, r0
+  tail call void asm sideeffect "", "~{r0},~{r1},~{r2},~{r3}"()
+; CHECK: mov     r0, r4
+  ret i32 %x
+}
+
+; DEBUG: AllocationOrder(GPR) = [ $r0 $r1 $r2 $r3 $r12 $lr $r4 $r5 $r6 $r7 $r8 $r9 $r10 $r11 ]
+
+define i32 @test_optsize(i32 %x) optsize {
+; CHECK-LABEL: test_optsize:
+entry:
+; CHECK: mov     r12, r0
+  tail call void asm sideeffect "", "~{r0},~{r1},~{r2},~{r3}"()
+; CHECK: mov     r0, r12
+  ret i32 %x
+}
author	Oliver Stannard <oliver.stannard@linaro.org>
	Wed, 3 Jul 2019 09:58:52 +0000 (09:58 +0000)
committer	Oliver Stannard <oliver.stannard@linaro.org>
	Wed, 3 Jul 2019 09:58:52 +0000 (09:58 +0000)
include/llvm/CodeGen/TargetSubtargetInfo.h		patch \| blob \| history
lib/CodeGen/RegisterClassInfo.cpp		patch \| blob \| history
lib/Target/ARM/ARMRegisterInfo.td		patch \| blob \| history
lib/Target/ARM/ARMSubtarget.cpp		patch \| blob \| history
lib/Target/ARM/ARMSubtarget.h		patch \| blob \| history
test/CodeGen/ARM/avoid-cpsr-rmw.ll		patch \| blob \| history
test/CodeGen/ARM/favor-low-reg-for-Osize.ll	[new file with mode: 0644]	patch \| blob