From 8babc52a2bee666616c8279a04f068f4f48e726c Mon Sep 17 00:00:00 2001
From: David Green <david.green@arm.com>
Date: Thu, 7 Mar 2019 13:44:40 +0000
Subject: [PATCH] [LSR] Attempt to increase the accuracy of LSR's setup cost

In some loops, we end up generating loop induction variables that look like:
  {(-1 * (zext i16 (%i0 * %i1) to i32))<nsw>,+,1}
As opposed to the simpler:
  {(zext i16 (%i0 * %i1) to i32),+,-1}
i.e. we count up from -limit to 0, not the simpler counting down from limit to
0. This is because the scores, as LSR calculates them, are the same and the
second is filtered in place of the first. We end up with a redundant SUB from 0
in the code.

This patch tries to make the calculation of the setup cost a little more
thorough, recursing into the SCEV members to better approximate the setup
required. The cost function for comparing LSR costs is:

return std::tie(C1.NumRegs, C1.AddRecCost, C1.NumIVMuls, C1.NumBaseAdds,
                C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
       std::tie(C2.NumRegs, C2.AddRecCost, C2.NumIVMuls, C2.NumBaseAdds,
                C2.ScaleCost, C2.ImmCost, C2.SetupCost);
So this will only alter results if none of the other variables turn out to be
different.

Differential Revision: https://reviews.llvm.org/D58770


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@355597 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Scalar/LoopStrengthReduce.cpp  |  31 ++++--
 test/CodeGen/ARM/lsr-setupcost.ll             | 100 ++++++++++++++++++
 test/CodeGen/Hexagon/swp-carried-1.ll         |   2 +-
 test/CodeGen/Hexagon/swp-epilog-phi5.ll       |   4 +-
 .../two-combinations-bug.ll                   |   2 +-
 5 files changed, 129 insertions(+), 10 deletions(-)
 create mode 100644 test/CodeGen/ARM/lsr-setupcost.ll
diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index a1533019128..77af68ef119 100644
--- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -115,6 +115,7 @@
 #include <cstdlib>
 #include <iterator>
 #include <limits>
+#include <numeric>
 #include <map>
 #include <utility>
 
@@ -163,6 +164,10 @@ static cl::opt<unsigned> ComplexityLimit(
   cl::init(std::numeric_limits<uint16_t>::max()),
   cl::desc("LSR search space complexity limit"));
 
+static cl::opt<bool> EnableRecursiveSetupCost(
+  "lsr-recursive-setupcost", cl::Hidden, cl::init(true),
+  cl::desc("Enable more thorough lsr setup cost calculation"));
+
 #ifndef NDEBUG
 // Stress test IV chain generation.
 static cl::opt<bool> StressIVChain(
@@ -1211,6 +1216,34 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
                                  bool HasBaseReg, int64_t Scale,
                                  Instruction *Fixup = nullptr);
 
+/// Estimate the setup cost of \p Reg by counting the SCEVUnknown and
+/// SCEVConstant leaves reachable from it.
+///
+/// A leaf always counts as 1. Unless -lsr-recursive-setupcost is disabled (in
+/// which case every non-leaf expression counts as 0), the walk follows the
+/// start value of an add recurrence, the operand of a cast, every operand of
+/// an n-ary expression and both sides of a udiv. Any other expression kind
+/// contributes 0.
+static unsigned getSetupCost(const SCEV *Reg) {
+  if (isa<SCEVUnknown>(Reg) || isa<SCEVConstant>(Reg))
+    return 1;
+  if (!EnableRecursiveSetupCost)
+    return 0;
+  if (const auto *S = dyn_cast<SCEVAddRecExpr>(Reg))
+    return getSetupCost(S->getStart());
+  if (const auto *S = dyn_cast<SCEVCastExpr>(Reg))
+    return getSetupCost(S->getOperand());
+  // Seed with 0u so the running total stays unsigned, matching the lambda.
+  if (const auto *S = dyn_cast<SCEVNAryExpr>(Reg))
+    return std::accumulate(S->op_begin(), S->op_end(), 0u,
+                           [](unsigned Sum, const SCEV *Op) {
+                             return Sum + getSetupCost(Op);
+                           });
+  if (const auto *S = dyn_cast<SCEVUDivExpr>(Reg))
+    return getSetupCost(S->getLHS()) + getSetupCost(S->getRHS());
+  return 0;
+}
+
 /// Tally up interesting quantities from the given register.
 void Cost::RateRegister(const Formula &F, const SCEV *Reg,
                         SmallPtrSetImpl<const SCEV *> &Regs,
@@ -1276,12 +1309,7 @@ void Cost::RateRegister(const Formula &F, const SCEV *Reg,
 
   // Rough heuristic; favor registers which don't require extra setup
   // instructions in the preheader.
-  if (!isa<SCEVUnknown>(Reg) &&
-      !isa<SCEVConstant>(Reg) &&
-      !(isa<SCEVAddRecExpr>(Reg) &&
-        (isa<SCEVUnknown>(cast<SCEVAddRecExpr>(Reg)->getStart()) ||
-         isa<SCEVConstant>(cast<SCEVAddRecExpr>(Reg)->getStart()))))
-    ++C.SetupCost;
+  C.SetupCost += getSetupCost(Reg);
 
   C.NumIVMuls += isa<SCEVMulExpr>(Reg) &&
                SE.hasComputableLoopEvolution(Reg, L);
diff --git a/test/CodeGen/ARM/lsr-setupcost.ll b/test/CodeGen/ARM/lsr-setupcost.ll
new file mode 100644
index 00000000000..a60f19afa24
--- /dev/null
+++ b/test/CodeGen/ARM/lsr-setupcost.ll
@@ -0,0 +1,100 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -mtriple=thumbv6m-none-eabi -loop-reduce %s -S -o - | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+
+%struct.arm_matrix_instance_q15 = type { i16, i16, i16* }
+
+define i32 @arm_mat_add_q15(%struct.arm_matrix_instance_q15* nocapture readonly %pSrcA, %struct.arm_matrix_instance_q15* nocapture readonly %pSrcB, %struct.arm_matrix_instance_q15* nocapture readonly %pDst)  {
+; CHECK-LABEL: @arm_mat_add_q15(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[NUMROWS:%.*]] = getelementptr inbounds [[STRUCT_ARM_MATRIX_INSTANCE_Q15:%.*]], %struct.arm_matrix_instance_q15* [[PSRCA:%.*]], i32 0, i32 0
+; CHECK-NEXT:    [[I0:%.*]] = load i16, i16* [[NUMROWS]], align 4
+; CHECK-NEXT:    [[NUMCOLS:%.*]] = getelementptr inbounds [[STRUCT_ARM_MATRIX_INSTANCE_Q15]], %struct.arm_matrix_instance_q15* [[PSRCA]], i32 0, i32 1
+; CHECK-NEXT:    [[I1:%.*]] = load i16, i16* [[NUMCOLS]], align 2
+; CHECK-NEXT:    [[MUL:%.*]] = mul i16 [[I1]], [[I0]]
+; CHECK-NEXT:    [[CMP22:%.*]] = icmp eq i16 [[MUL]], 0
+; CHECK-NEXT:    br i1 [[CMP22]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]]
+; CHECK:       while.body.preheader:
+; CHECK-NEXT:    [[CONV5:%.*]] = zext i16 [[MUL]] to i32
+; CHECK-NEXT:    [[PDATA2:%.*]] = getelementptr inbounds [[STRUCT_ARM_MATRIX_INSTANCE_Q15]], %struct.arm_matrix_instance_q15* [[PDST:%.*]], i32 0, i32 2
+; CHECK-NEXT:    [[I2:%.*]] = load i16*, i16** [[PDATA2]], align 4
+; CHECK-NEXT:    [[PDATA1:%.*]] = getelementptr inbounds [[STRUCT_ARM_MATRIX_INSTANCE_Q15]], %struct.arm_matrix_instance_q15* [[PSRCB:%.*]], i32 0, i32 2
+; CHECK-NEXT:    [[I3:%.*]] = load i16*, i16** [[PDATA1]], align 4
+; CHECK-NEXT:    [[PDATA:%.*]] = getelementptr inbounds [[STRUCT_ARM_MATRIX_INSTANCE_Q15]], %struct.arm_matrix_instance_q15* [[PSRCA]], i32 0, i32 2
+; CHECK-NEXT:    [[I4:%.*]] = load i16*, i16** [[PDATA]], align 4
+; CHECK-NEXT:    br label [[WHILE_BODY:%.*]]
+; CHECK:       while.body:
+; CHECK-NEXT:    [[PINA_026:%.*]] = phi i16* [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[I4]], [[WHILE_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[BLKCNT_025:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[CONV5]], [[WHILE_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[PINB_024:%.*]] = phi i16* [ [[INCDEC_PTR8:%.*]], [[WHILE_BODY]] ], [ [[I3]], [[WHILE_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[POUT_023:%.*]] = phi i16* [ [[INCDEC_PTR11:%.*]], [[WHILE_BODY]] ], [ [[I2]], [[WHILE_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[INCDEC_PTR]] = getelementptr inbounds i16, i16* [[PINA_026]], i32 1
+; CHECK-NEXT:    [[I5:%.*]] = load i16, i16* [[PINA_026]], align 2
+; CHECK-NEXT:    [[CONV7:%.*]] = sext i16 [[I5]] to i32
+; CHECK-NEXT:    [[INCDEC_PTR8]] = getelementptr inbounds i16, i16* [[PINB_024]], i32 1
+; CHECK-NEXT:    [[I6:%.*]] = load i16, i16* [[PINB_024]], align 2
+; CHECK-NEXT:    [[CONV9:%.*]] = sext i16 [[I6]] to i32
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[CONV9]], [[CONV7]]
+; CHECK-NEXT:    [[I7:%.*]] = icmp sgt i32 [[ADD]], -32768
+; CHECK-NEXT:    [[SPEC_SELECT_I:%.*]] = select i1 [[I7]], i32 [[ADD]], i32 -32768
+; CHECK-NEXT:    [[I8:%.*]] = icmp slt i32 [[SPEC_SELECT_I]], 32767
+; CHECK-NEXT:    [[CALL21:%.*]] = select i1 [[I8]], i32 [[SPEC_SELECT_I]], i32 32767
+; CHECK-NEXT:    [[CONV10:%.*]] = trunc i32 [[CALL21]] to i16
+; CHECK-NEXT:    [[INCDEC_PTR11]] = getelementptr inbounds i16, i16* [[POUT_023]], i32 1
+; CHECK-NEXT:    store i16 [[CONV10]], i16* [[POUT_023]], align 2
+; CHECK-NEXT:    [[DEC]] = add nsw i32 [[BLKCNT_025]], -1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[DEC]], 0
+; CHECK-NEXT:    br i1 [[CMP]], label [[WHILE_END_LOOPEXIT:%.*]], label [[WHILE_BODY]]
+; CHECK:       while.end.loopexit:
+; CHECK-NEXT:    br label [[WHILE_END]]
+; CHECK:       while.end:
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  %numRows = getelementptr inbounds %struct.arm_matrix_instance_q15, %struct.arm_matrix_instance_q15* %pSrcA, i32 0, i32 0
+  %i0 = load i16, i16* %numRows, align 4
+  %numCols = getelementptr inbounds %struct.arm_matrix_instance_q15, %struct.arm_matrix_instance_q15* %pSrcA, i32 0, i32 1
+  %i1 = load i16, i16* %numCols, align 2
+  %mul = mul i16 %i1, %i0
+  %cmp22 = icmp eq i16 %mul, 0
+  br i1 %cmp22, label %while.end, label %while.body.preheader
+
+while.body.preheader:                             ; preds = %entry
+  %conv5 = zext i16 %mul to i32
+  %pData2 = getelementptr inbounds %struct.arm_matrix_instance_q15, %struct.arm_matrix_instance_q15* %pDst, i32 0, i32 2
+  %i2 = load i16*, i16** %pData2, align 4
+  %pData1 = getelementptr inbounds %struct.arm_matrix_instance_q15, %struct.arm_matrix_instance_q15* %pSrcB, i32 0, i32 2
+  %i3 = load i16*, i16** %pData1, align 4
+  %pData = getelementptr inbounds %struct.arm_matrix_instance_q15, %struct.arm_matrix_instance_q15* %pSrcA, i32 0, i32 2
+  %i4 = load i16*, i16** %pData, align 4
+  br label %while.body
+
+while.body:                                       ; preds = %while.body.preheader, %while.body
+  %pInA.026 = phi i16* [ %incdec.ptr, %while.body ], [ %i4, %while.body.preheader ]
+  %blkCnt.025 = phi i32 [ %dec, %while.body ], [ %conv5, %while.body.preheader ]
+  %pInB.024 = phi i16* [ %incdec.ptr8, %while.body ], [ %i3, %while.body.preheader ]
+  %pOut.023 = phi i16* [ %incdec.ptr11, %while.body ], [ %i2, %while.body.preheader ]
+  %incdec.ptr = getelementptr inbounds i16, i16* %pInA.026, i32 1
+  %i5 = load i16, i16* %pInA.026, align 2
+  %conv7 = sext i16 %i5 to i32
+  %incdec.ptr8 = getelementptr inbounds i16, i16* %pInB.024, i32 1
+  %i6 = load i16, i16* %pInB.024, align 2
+  %conv9 = sext i16 %i6 to i32
+  %add = add nsw i32 %conv9, %conv7
+  %i7 = icmp sgt i32 %add, -32768
+  %spec.select.i = select i1 %i7, i32 %add, i32 -32768
+  %i8 = icmp slt i32 %spec.select.i, 32767
+  %call21 = select i1 %i8, i32 %spec.select.i, i32 32767
+  %conv10 = trunc i32 %call21 to i16
+  %incdec.ptr11 = getelementptr inbounds i16, i16* %pOut.023, i32 1
+  store i16 %conv10, i16* %pOut.023, align 2
+  %dec = add nsw i32 %blkCnt.025, -1
+  %cmp = icmp eq i32 %dec, 0
+  br i1 %cmp, label %while.end, label %while.body
+
+while.end:                                        ; preds = %while.body, %entry
+  ret i32 0
+}
+
+
diff --git a/test/CodeGen/Hexagon/swp-carried-1.ll b/test/CodeGen/Hexagon/swp-carried-1.ll
index 641c61d5e4b..740802787d2 100644
--- a/test/CodeGen/Hexagon/swp-carried-1.ll
+++ b/test/CodeGen/Hexagon/swp-carried-1.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=hexagon -rdf-opt=0 -disable-hexagon-misched -hexagon-initial-cfg-cleanup=0 < %s | FileCheck %s
+; RUN: llc -march=hexagon -rdf-opt=0 -disable-hexagon-misched -hexagon-initial-cfg-cleanup=0 -lsr-recursive-setupcost=0 < %s | FileCheck %s
 
 ; Test that we generate the correct code when a loop carried value
 ; is scheduled one stage earlier than it's use. The code in
diff --git a/test/CodeGen/Hexagon/swp-epilog-phi5.ll b/test/CodeGen/Hexagon/swp-epilog-phi5.ll
index 3dcecad5456..a524dc0d5be 100644
--- a/test/CodeGen/Hexagon/swp-epilog-phi5.ll
+++ b/test/CodeGen/Hexagon/swp-epilog-phi5.ll
@@ -7,10 +7,10 @@
 
 ; In this test case, the second loop is pipelined, block b5.
 
-; CHECK: loop0
+; CHECK: loop1
 ; CHECK: [[REG0:r([0-9]+)]] += mpyi
 ; CHECK: [[REG2:r([0-9]+)]] = add([[REG1:r([0-9]+)]],add([[REG0]],#8
-; CHECK: endloop0
+; CHECK: endloop1
 
 %s.0 = type { %s.1*, %s.4*, %s.7*, i8*, i8, i32, %s.8*, i32, i32, i32, i8, i8, i32, i32, double, i8, i8, i8, i8, i8, i8, i8, i8, i32, i8, i8, i8, i32, i32, i32, i32, i32, i32, i8**, i32, i32, i32, i32, i32, [64 x i32]*, [4 x %s.9*], [4 x %s.10*], [4 x %s.10*], i32, %s.23*, i8, i8, [16 x i8], [16 x i8], [16 x i8], i32, i8, i8, i8, i8, i16, i16, i8, i8, i8, %s.11*, i32, i32, i32, i32, i8*, i32, [4 x %s.23*], i32, i32, i32, [10 x i32], i32, i32, i32, i32, i32, %s.12*, %s.13*, %s.14*, %s.15*, %s.16*, %s.17*, %s.18*, %s.19*, %s.20*, %s.21*, %s.22* }
 %s.1 = type { void (%s.2*)*, void (%s.2*, i32)*, void (%s.2*)*, void (%s.2*, i8*)*, void (%s.2*)*, i32, %s.3, i32, i32, i8**, i32, i8**, i32, i32 }
diff --git a/test/Transforms/LoopStrengthReduce/two-combinations-bug.ll b/test/Transforms/LoopStrengthReduce/two-combinations-bug.ll
index 21917f5959c..ab6dd488319 100644
--- a/test/Transforms/LoopStrengthReduce/two-combinations-bug.ll
+++ b/test/Transforms/LoopStrengthReduce/two-combinations-bug.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -loop-reduce -S | FileCheck %s
+; RUN: opt < %s -loop-reduce -lsr-recursive-setupcost=0 -S | FileCheck %s
 
 ; This test is adapted from the n-body test of the LLVM test-suite: A bug in
 ; r345114 caused LSR to generate incorrect code. The test verifies that the
-- 
2.50.1