[Unroll] Do NOT unroll a loop with small runtime upperbound

author Zhaoshi Zheng <zhaoshiz@quicinc.com>

Thu, 26 Sep 2019 21:40:27 +0000 (21:40 +0000)

committer Zhaoshi Zheng <zhaoshiz@quicinc.com>

Thu, 26 Sep 2019 21:40:27 +0000 (21:40 +0000)
author Zhaoshi Zheng <zhaoshiz@quicinc.com>
Thu, 26 Sep 2019 21:40:27 +0000 (21:40 +0000)
committer Zhaoshi Zheng <zhaoshiz@quicinc.com>
Thu, 26 Sep 2019 21:40:27 +0000 (21:40 +0000)
diff --git a/include/llvm/Transforms/Utils/UnrollLoop.h b/include/llvm/Transforms/Utils/UnrollLoop.h

index 9012920745e1ba75bfa12b4141d02d8eeed4724b..02b81b4b7ee2650cc0bd37b1611cf3ad90aef301 100644 (file)
--- a/include/llvm/Transforms/Utils/UnrollLoop.h
+++ b/include/llvm/Transforms/Utils/UnrollLoop.h
@@ -114,8 +114,8 @@ bool computeUnrollCount(Loop *L, const TargetTransformInfo &TTI,
                          DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
                          const SmallPtrSetImpl<const Value *> &EphValues,
                          OptimizationRemarkEmitter *ORE, unsigned &TripCount,
-                        unsigned MaxTripCount, unsigned &TripMultiple,
-                        unsigned LoopSize,
+                        unsigned MaxTripCount, bool MaxOrZero,
+                        unsigned &TripMultiple, unsigned LoopSize,
                          TargetTransformInfo::UnrollingPreferences &UP,
                          bool &UseUpperBound);
  
diff --git a/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp b/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp

index d40ef6be9b226d748005452d4c5fba6bac191cfa..8d88be420314153b6ae018c77b390e866fddbea9 100644 (file)
--- a/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp
+++ b/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp
@@ -166,7 +166,7 @@ static bool computeUnrollAndJamCount(
    bool UseUpperBound = false;
    bool ExplicitUnroll = computeUnrollCount(
        L, TTI, DT, LI, SE, EphValues, ORE, OuterTripCount, MaxTripCount,
-      OuterTripMultiple, OuterLoopSize, UP, UseUpperBound);
+      /*MaxOrZero*/ false, OuterTripMultiple, OuterLoopSize, UP, UseUpperBound);
    if (ExplicitUnroll || UseUpperBound) {
      // If the user explicitly set the loop as unrolled, dont UnJ it. Leave it
      // for the unroller instead.
diff --git a/lib/Transforms/Scalar/LoopUnrollPass.cpp b/lib/Transforms/Scalar/LoopUnrollPass.cpp

index f801c2089031afb641616a6eb50ec90fa99f7ef1..a6d4164c364551736f4389fca583a03be1ea8a27 100644 (file)
--- a/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -737,7 +737,7 @@ bool llvm::computeUnrollCount(
      Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI,
      ScalarEvolution &SE, const SmallPtrSetImpl<const Value *> &EphValues,
      OptimizationRemarkEmitter *ORE, unsigned &TripCount, unsigned MaxTripCount,
-    unsigned &TripMultiple, unsigned LoopSize,
+    bool MaxOrZero, unsigned &TripMultiple, unsigned LoopSize,
      TargetTransformInfo::UnrollingPreferences &UP, bool &UseUpperBound) {
  
    // Check for explicit Count.
@@ -788,18 +788,34 @@ bool llvm::computeUnrollCount(
    // Also we need to check if we exceed FullUnrollMaxCount.
    // If using the upper bound to unroll, TripMultiple should be set to 1 because
    // we do not know when loop may exit.
-  // MaxTripCount and ExactTripCount cannot both be non zero since we only
+
+  // We can unroll by the upper bound amount if it's generally allowed or if
+  // we know that the loop is executed either the upper bound or zero times.
+  // (MaxOrZero unrolling keeps only the first loop test, so the number of
+  // loop tests remains the same compared to the non-unrolled version, whereas
+  // the generic upper bound unrolling keeps all but the last loop test so the
+  // number of loop tests goes up which may end up being worse on targets with
+  // constrained branch predictor resources so is controlled by an option.)
+  // In addition we only unroll small upper bounds.
+  unsigned FullUnrollMaxTripCount = MaxTripCount;
+  if (!(UP.UpperBound || MaxOrZero) ||
+      FullUnrollMaxTripCount > UnrollMaxUpperBound)
+    FullUnrollMaxTripCount = 0;
+
+  // UnrollByMaxCount and ExactTripCount cannot both be non zero since we only
    // compute the former when the latter is zero.
    unsigned ExactTripCount = TripCount;
-  assert((ExactTripCount == 0 || MaxTripCount == 0) &&
-         "ExtractTripCount and MaxTripCount cannot both be non zero.");
-  unsigned FullUnrollTripCount = ExactTripCount ? ExactTripCount : MaxTripCount;
+  assert((ExactTripCount == 0 || FullUnrollMaxTripCount == 0) &&
+         "ExtractTripCount and UnrollByMaxCount cannot both be non zero.");
+
+  unsigned FullUnrollTripCount =
+      ExactTripCount ? ExactTripCount : FullUnrollMaxTripCount;
    UP.Count = FullUnrollTripCount;
    if (FullUnrollTripCount && FullUnrollTripCount <= UP.FullUnrollMaxCount) {
      // When computing the unrolled size, note that BEInsns are not replicated
      // like the rest of the loop body.
      if (getUnrolledLoopSize(LoopSize, UP) < UP.Threshold) {
-      UseUpperBound = (MaxTripCount == FullUnrollTripCount);
+      UseUpperBound = (FullUnrollMaxTripCount == FullUnrollTripCount);
        TripCount = FullUnrollTripCount;
        TripMultiple = UP.UpperBound ? 1 : TripMultiple;
        return ExplicitUnroll;
@@ -813,7 +829,7 @@ bool llvm::computeUnrollCount(
          unsigned Boost =
              getFullUnrollBoostingFactor(*Cost, UP.MaxPercentThresholdBoost);
          if (Cost->UnrolledCost < UP.Threshold * Boost / 100) {
-          UseUpperBound = (MaxTripCount == FullUnrollTripCount);
+          UseUpperBound = (FullUnrollMaxTripCount == FullUnrollTripCount);
            TripCount = FullUnrollTripCount;
            TripMultiple = UP.UpperBound ? 1 : TripMultiple;
            return ExplicitUnroll;
@@ -889,6 +905,8 @@ bool llvm::computeUnrollCount(
                    "because "
                    "unrolled size is too large.";
        });
+    LLVM_DEBUG(dbgs() << "  partially unrolling with count: " << UP.Count
+                      << "\n");
      return ExplicitUnroll;
    }
    assert(TripCount == 0 &&
@@ -910,6 +928,12 @@ bool llvm::computeUnrollCount(
      return false;
    }
  
+  // Don't unroll a small upper bound loop unless user or TTI asked to do so.
+  if (MaxTripCount && !UP.Force && MaxTripCount < UnrollMaxUpperBound) {
+    UP.Count = 0;
+    return false;
+  }
+
    // Check if the runtime trip count is too small when profile is available.
    if (L->getHeader()->getParent()->hasProfileData()) {
      if (auto ProfileTripCount = getLoopEstimatedTripCount(L)) {
@@ -973,7 +997,11 @@ bool llvm::computeUnrollCount(
  
    if (UP.Count > UP.MaxCount)
      UP.Count = UP.MaxCount;
-  LLVM_DEBUG(dbgs() << "  partially unrolling with count: " << UP.Count
+
+  if (MaxTripCount && UP.Count > MaxTripCount)
+    UP.Count = MaxTripCount;
+
+  LLVM_DEBUG(dbgs() << "  runtime unrolling with count: " << UP.Count
                      << "\n");
    if (UP.Count < 2)
      UP.Count = 0;
@@ -1049,7 +1077,6 @@ static LoopUnrollResult tryToUnrollLoop(
  
    // Find trip count and trip multiple if count is not available
    unsigned TripCount = 0;
-  unsigned MaxTripCount = 0;
    unsigned TripMultiple = 1;
    // If there are multiple exiting blocks but one of them is the latch, use the
    // latch for the trip count estimation. Otherwise insist on a single exiting
@@ -1079,28 +1106,18 @@ static LoopUnrollResult tryToUnrollLoop(
  
    // Try to find the trip count upper bound if we cannot find the exact trip
    // count.
+  unsigned MaxTripCount = 0;
    bool MaxOrZero = false;
    if (!TripCount) {
      MaxTripCount = SE.getSmallConstantMaxTripCount(L);
      MaxOrZero = SE.isBackedgeTakenCountMaxOrZero(L);
-    // We can unroll by the upper bound amount if it's generally allowed or if
-    // we know that the loop is executed either the upper bound or zero times.
-    // (MaxOrZero unrolling keeps only the first loop test, so the number of
-    // loop tests remains the same compared to the non-unrolled version, whereas
-    // the generic upper bound unrolling keeps all but the last loop test so the
-    // number of loop tests goes up which may end up being worse on targets with
-    // constrained branch predictor resources so is controlled by an option.)
-    // In addition we only unroll small upper bounds.
-    if (!(UP.UpperBound || MaxOrZero) || MaxTripCount > UnrollMaxUpperBound) {
-      MaxTripCount = 0;
-    }
    }
  
    // computeUnrollCount() decides whether it is beneficial to use upper bound to
    // fully unroll the loop.
    bool UseUpperBound = false;
    bool IsCountSetExplicitly = computeUnrollCount(
-      L, TTI, DT, LI, SE, EphValues, &ORE, TripCount, MaxTripCount,
+      L, TTI, DT, LI, SE, EphValues, &ORE, TripCount, MaxTripCount, MaxOrZero,
        TripMultiple, LoopSize, UP, UseUpperBound);
    if (!UP.Count)
      return LoopUnrollResult::Unmodified;
diff --git a/test/Transforms/LoopUnroll/runtime-small-upperbound.ll b/test/Transforms/LoopUnroll/runtime-small-upperbound.ll

new file mode 100644 (file)

index 0000000..95632a5
--- /dev/null
+++ b/test/Transforms/LoopUnroll/runtime-small-upperbound.ll
@@ -0,0 +1,70 @@
+; RUN: opt -S -loop-unroll -unroll-runtime %s -o - | FileCheck %s
+; RUN: opt -S -loop-unroll -unroll-runtime -unroll-max-upperbound=6 %s -o - | FileCheck %s --check-prefix=UPPER
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+
+@global = dso_local local_unnamed_addr global i32 0, align 4
+@global.1 = dso_local local_unnamed_addr global i8* null, align 4
+
+; Check that loop in hoge_3, with a runtime upperbound of 3, is not unrolled.
+; CHECK-LABEL: hoge_3
+; CHECK: loop:
+; CHECK: store
+; CHECK-NOT: store
+; CHECK: br i1 %{{.*}}, label %loop
+; UPPER-LABEL: hoge_3
+; UPPER: loop:
+; UPPER: store
+; UPPER-NOT: store
+; UPPER: br i1 %{{.*}}, label %loop
+define dso_local void @hoge_3(i8 %arg) {
+entry:
+  %x = load i32, i32* @global, align 4
+  %y = load i8*, i8** @global.1, align 4
+  %0 = icmp ult i32 %x, 17
+  br i1 %0, label %loop, label %exit
+
+loop:
+  %iv = phi i32 [ %x, %entry ], [ %iv.next, %loop ]
+  %ptr = phi i8* [ %y, %entry ], [ %ptr.next, %loop ]
+  %iv.next = add nuw i32 %iv, 8
+  %ptr.next = getelementptr inbounds i8, i8* %ptr, i32 1
+  store i8 %arg, i8* %ptr.next, align 1
+  %1 = icmp ult i32 %iv.next, 17
+  br i1 %1, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+; Check that loop in hoge_5, with a runtime upperbound of 5, is unrolled when -unroll-max-upperbound=4
+; CHECK-LABEL: hoge_5
+; CHECK: loop:
+; CHECK: store
+; CHECK-NOT: store
+; CHECK: br i1 %{{.*}}, label %loop
+; UPPER-LABEL: hoge_5
+; UPPER: loop:
+; UPPER: store
+; UPPER: store
+; UPPER: store
+; UPPER: br i1 %{{.*}}, label %loop
+define dso_local void @hoge_5(i8 %arg) {
+entry:
+  %x = load i32, i32* @global, align 4
+  %y = load i8*, i8** @global.1, align 4
+  %0 = icmp ult i32 %x, 17
+  br i1 %0, label %loop, label %exit
+
+loop:
+  %iv = phi i32 [ %x, %entry ], [ %iv.next, %loop ]
+  %ptr = phi i8* [ %y, %entry ], [ %ptr.next, %loop ]
+  %iv.next = add nuw i32 %iv, 4
+  %ptr.next = getelementptr inbounds i8, i8* %ptr, i32 1
+  store i8 %arg, i8* %ptr.next, align 1
+  %1 = icmp ult i32 %iv.next, 17
+  br i1 %1, label %loop, label %exit
+
+exit:
+  ret void
+}
author	Zhaoshi Zheng <zhaoshiz@quicinc.com>
	Thu, 26 Sep 2019 21:40:27 +0000 (21:40 +0000)
committer	Zhaoshi Zheng <zhaoshiz@quicinc.com>
	Thu, 26 Sep 2019 21:40:27 +0000 (21:40 +0000)
include/llvm/Transforms/Utils/UnrollLoop.h		patch \| blob \| history
lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp		patch \| blob \| history
lib/Transforms/Scalar/LoopUnrollPass.cpp		patch \| blob \| history
test/Transforms/LoopUnroll/runtime-small-upperbound.ll	[new file with mode: 0644]	patch \| blob