From b8ce2355f664eed3c749cbcd7fc8963480c8cddf Mon Sep 17 00:00:00 2001 From: Zhaoshi Zheng Date: Thu, 26 Sep 2019 21:40:27 +0000 Subject: [PATCH] [Unroll] Do NOT unroll a loop with small runtime upperbound For a runtime loop if we can compute its trip count upperbound: Don't unroll if: 1. loop is not guaranteed to run either zero or upperbound iterations; and 2. trip count upperbound is less than UnrollMaxUpperBound Unless user or TTI asked to do so. If unrolling, limit unroll factor to loop's trip count upperbound. Differential Revision: https://reviews.llvm.org/D62989 Change-Id: I6083c46a9d98b2e22cd855e60523fdc5a4929c73 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@373017 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/Transforms/Utils/UnrollLoop.h | 4 +- .../Scalar/LoopUnrollAndJamPass.cpp | 2 +- lib/Transforms/Scalar/LoopUnrollPass.cpp | 59 ++++++++++------ .../LoopUnroll/runtime-small-upperbound.ll | 70 +++++++++++++++++++ 4 files changed, 111 insertions(+), 24 deletions(-) create mode 100644 test/Transforms/LoopUnroll/runtime-small-upperbound.ll diff --git a/include/llvm/Transforms/Utils/UnrollLoop.h b/include/llvm/Transforms/Utils/UnrollLoop.h index 9012920745e..02b81b4b7ee 100644 --- a/include/llvm/Transforms/Utils/UnrollLoop.h +++ b/include/llvm/Transforms/Utils/UnrollLoop.h @@ -114,8 +114,8 @@ bool computeUnrollCount(Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE, const SmallPtrSetImpl &EphValues, OptimizationRemarkEmitter *ORE, unsigned &TripCount, - unsigned MaxTripCount, unsigned &TripMultiple, - unsigned LoopSize, + unsigned MaxTripCount, bool MaxOrZero, + unsigned &TripMultiple, unsigned LoopSize, TargetTransformInfo::UnrollingPreferences &UP, bool &UseUpperBound); diff --git a/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp b/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp index d40ef6be9b2..8d88be42031 100644 --- a/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp +++ b/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp @@ -166,7 +166,7 @@ static bool computeUnrollAndJamCount( bool UseUpperBound = false; bool ExplicitUnroll = computeUnrollCount( L, TTI, DT, LI, SE, EphValues, ORE, OuterTripCount, MaxTripCount, - OuterTripMultiple, OuterLoopSize, UP, UseUpperBound); + /*MaxOrZero*/ false, OuterTripMultiple, OuterLoopSize, UP, UseUpperBound); if (ExplicitUnroll || UseUpperBound) { // If the user explicitly set the loop as unrolled, dont UnJ it. Leave it // for the unroller instead. diff --git a/lib/Transforms/Scalar/LoopUnrollPass.cpp b/lib/Transforms/Scalar/LoopUnrollPass.cpp index f801c208903..a6d4164c364 100644 --- a/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -737,7 +737,7 @@ bool llvm::computeUnrollCount( Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE, const SmallPtrSetImpl &EphValues, OptimizationRemarkEmitter *ORE, unsigned &TripCount, unsigned MaxTripCount, - unsigned &TripMultiple, unsigned LoopSize, + bool MaxOrZero, unsigned &TripMultiple, unsigned LoopSize, TargetTransformInfo::UnrollingPreferences &UP, bool &UseUpperBound) { // Check for explicit Count. @@ -788,18 +788,34 @@ bool llvm::computeUnrollCount( // Also we need to check if we exceed FullUnrollMaxCount. // If using the upper bound to unroll, TripMultiple should be set to 1 because // we do not know when loop may exit. - // MaxTripCount and ExactTripCount cannot both be non zero since we only + + // We can unroll by the upper bound amount if it's generally allowed or if + // we know that the loop is executed either the upper bound or zero times. + // (MaxOrZero unrolling keeps only the first loop test, so the number of + // loop tests remains the same compared to the non-unrolled version, whereas + // the generic upper bound unrolling keeps all but the last loop test so the + // number of loop tests goes up which may end up being worse on targets with + // constrained branch predictor resources so is controlled by an option.) + // In addition we only unroll small upper bounds. + unsigned FullUnrollMaxTripCount = MaxTripCount; + if (!(UP.UpperBound || MaxOrZero) || + FullUnrollMaxTripCount > UnrollMaxUpperBound) + FullUnrollMaxTripCount = 0; + + // UnrollByMaxCount and ExactTripCount cannot both be non zero since we only // compute the former when the latter is zero. unsigned ExactTripCount = TripCount; - assert((ExactTripCount == 0 || MaxTripCount == 0) && - "ExtractTripCount and MaxTripCount cannot both be non zero."); - unsigned FullUnrollTripCount = ExactTripCount ? ExactTripCount : MaxTripCount; + assert((ExactTripCount == 0 || FullUnrollMaxTripCount == 0) && + "ExtractTripCount and UnrollByMaxCount cannot both be non zero."); + + unsigned FullUnrollTripCount = + ExactTripCount ? ExactTripCount : FullUnrollMaxTripCount; UP.Count = FullUnrollTripCount; if (FullUnrollTripCount && FullUnrollTripCount <= UP.FullUnrollMaxCount) { // When computing the unrolled size, note that BEInsns are not replicated // like the rest of the loop body. if (getUnrolledLoopSize(LoopSize, UP) < UP.Threshold) { - UseUpperBound = (MaxTripCount == FullUnrollTripCount); + UseUpperBound = (FullUnrollMaxTripCount == FullUnrollTripCount); TripCount = FullUnrollTripCount; TripMultiple = UP.UpperBound ? 1 : TripMultiple; return ExplicitUnroll; @@ -813,7 +829,7 @@ bool llvm::computeUnrollCount( unsigned Boost = getFullUnrollBoostingFactor(*Cost, UP.MaxPercentThresholdBoost); if (Cost->UnrolledCost < UP.Threshold * Boost / 100) { - UseUpperBound = (MaxTripCount == FullUnrollTripCount); + UseUpperBound = (FullUnrollMaxTripCount == FullUnrollTripCount); TripCount = FullUnrollTripCount; TripMultiple = UP.UpperBound ? 1 : TripMultiple; return ExplicitUnroll; @@ -889,6 +905,8 @@ bool llvm::computeUnrollCount( "because " "unrolled size is too large."; }); + LLVM_DEBUG(dbgs() << " partially unrolling with count: " << UP.Count + << "\n"); return ExplicitUnroll; } assert(TripCount == 0 && @@ -910,6 +928,12 @@ bool llvm::computeUnrollCount( return false; } + // Don't unroll a small upper bound loop unless user or TTI asked to do so. + if (MaxTripCount && !UP.Force && MaxTripCount < UnrollMaxUpperBound) { + UP.Count = 0; + return false; + } + // Check if the runtime trip count is too small when profile is available. if (L->getHeader()->getParent()->hasProfileData()) { if (auto ProfileTripCount = getLoopEstimatedTripCount(L)) { @@ -973,7 +997,11 @@ bool llvm::computeUnrollCount( if (UP.Count > UP.MaxCount) UP.Count = UP.MaxCount; - LLVM_DEBUG(dbgs() << " partially unrolling with count: " << UP.Count + + if (MaxTripCount && UP.Count > MaxTripCount) + UP.Count = MaxTripCount; + + LLVM_DEBUG(dbgs() << " runtime unrolling with count: " << UP.Count << "\n"); if (UP.Count < 2) UP.Count = 0; @@ -1049,7 +1077,6 @@ static LoopUnrollResult tryToUnrollLoop( // Find trip count and trip multiple if count is not available unsigned TripCount = 0; - unsigned MaxTripCount = 0; unsigned TripMultiple = 1; // If there are multiple exiting blocks but one of them is the latch, use the // latch for the trip count estimation. Otherwise insist on a single exiting @@ -1079,28 +1106,18 @@ static LoopUnrollResult tryToUnrollLoop( // Try to find the trip count upper bound if we cannot find the exact trip // count. + unsigned MaxTripCount = 0; bool MaxOrZero = false; if (!TripCount) { MaxTripCount = SE.getSmallConstantMaxTripCount(L); MaxOrZero = SE.isBackedgeTakenCountMaxOrZero(L); - // We can unroll by the upper bound amount if it's generally allowed or if - // we know that the loop is executed either the upper bound or zero times. - // (MaxOrZero unrolling keeps only the first loop test, so the number of - // loop tests remains the same compared to the non-unrolled version, whereas - // the generic upper bound unrolling keeps all but the last loop test so the - // number of loop tests goes up which may end up being worse on targets with - // constrained branch predictor resources so is controlled by an option.) - // In addition we only unroll small upper bounds. - if (!(UP.UpperBound || MaxOrZero) || MaxTripCount > UnrollMaxUpperBound) { - MaxTripCount = 0; - } } // computeUnrollCount() decides whether it is beneficial to use upper bound to // fully unroll the loop. bool UseUpperBound = false; bool IsCountSetExplicitly = computeUnrollCount( - L, TTI, DT, LI, SE, EphValues, &ORE, TripCount, MaxTripCount, + L, TTI, DT, LI, SE, EphValues, &ORE, TripCount, MaxTripCount, MaxOrZero, TripMultiple, LoopSize, UP, UseUpperBound); if (!UP.Count) return LoopUnrollResult::Unmodified; diff --git a/test/Transforms/LoopUnroll/runtime-small-upperbound.ll b/test/Transforms/LoopUnroll/runtime-small-upperbound.ll new file mode 100644 index 00000000000..95632a5a3be --- /dev/null +++ b/test/Transforms/LoopUnroll/runtime-small-upperbound.ll @@ -0,0 +1,70 @@ +; RUN: opt -S -loop-unroll -unroll-runtime %s -o - | FileCheck %s +; RUN: opt -S -loop-unroll -unroll-runtime -unroll-max-upperbound=6 %s -o - | FileCheck %s --check-prefix=UPPER + +target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" + +@global = dso_local local_unnamed_addr global i32 0, align 4 +@global.1 = dso_local local_unnamed_addr global i8* null, align 4 + +; Check that loop in hoge_3, with a runtime upperbound of 3, is not unrolled. +; CHECK-LABEL: hoge_3 +; CHECK: loop: +; CHECK: store +; CHECK-NOT: store +; CHECK: br i1 %{{.*}}, label %loop +; UPPER-LABEL: hoge_3 +; UPPER: loop: +; UPPER: store +; UPPER-NOT: store +; UPPER: br i1 %{{.*}}, label %loop +define dso_local void @hoge_3(i8 %arg) { +entry: + %x = load i32, i32* @global, align 4 + %y = load i8*, i8** @global.1, align 4 + %0 = icmp ult i32 %x, 17 + br i1 %0, label %loop, label %exit + +loop: + %iv = phi i32 [ %x, %entry ], [ %iv.next, %loop ] + %ptr = phi i8* [ %y, %entry ], [ %ptr.next, %loop ] + %iv.next = add nuw i32 %iv, 8 + %ptr.next = getelementptr inbounds i8, i8* %ptr, i32 1 + store i8 %arg, i8* %ptr.next, align 1 + %1 = icmp ult i32 %iv.next, 17 + br i1 %1, label %loop, label %exit + +exit: + ret void +} + +; Check that loop in hoge_5, with a runtime upperbound of 5, is unrolled when -unroll-max-upperbound=4 +; CHECK-LABEL: hoge_5 +; CHECK: loop: +; CHECK: store +; CHECK-NOT: store +; CHECK: br i1 %{{.*}}, label %loop +; UPPER-LABEL: hoge_5 +; UPPER: loop: +; UPPER: store +; UPPER: store +; UPPER: store +; UPPER: br i1 %{{.*}}, label %loop +define dso_local void @hoge_5(i8 %arg) { +entry: + %x = load i32, i32* @global, align 4 + %y = load i8*, i8** @global.1, align 4 + %0 = icmp ult i32 %x, 17 + br i1 %0, label %loop, label %exit + +loop: + %iv = phi i32 [ %x, %entry ], [ %iv.next, %loop ] + %ptr = phi i8* [ %y, %entry ], [ %ptr.next, %loop ] + %iv.next = add nuw i32 %iv, 4 + %ptr.next = getelementptr inbounds i8, i8* %ptr, i32 1 + store i8 %arg, i8* %ptr.next, align 1 + %1 = icmp ult i32 %iv.next, 17 + br i1 %1, label %loop, label %exit + +exit: + ret void +} -- 2.40.0