From 818cdb5df7fe6e14a94a9f069d0bb95db8a8a0a5 Mon Sep 17 00:00:00 2001
From: Lei Huang
Date: Thu, 12 Oct 2017 16:43:33 +0000
Subject: [PATCH] [PowerPC] Add profitability check for conversion to mtctr loops

Add profitability checks for modifying counted loops to use the mtctr
instruction.

The latency of mtctr is only justified if there are more than 4 comparisons
that will be removed as a result. Counted loops are usually formed relatively
early, before unrolling, so most low trip count loops do not survive that far.
However, we want to ensure that if they do, we do not mistakenly convert them
to mtctr loops. Use CodeMetrics to ensure we only do this for small loops with
small trip counts.

Differential Revision: https://reviews.llvm.org/D38212

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@315592 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/PowerPC/PPCCTRLoops.cpp         |  33 +++++-
 test/CodeGen/PowerPC/ctr-minmaxnum.ll      |  47 ++++++++-
 test/CodeGen/PowerPC/ctrloop-shortLoops.ll | 116 +++++++++++++++++++++
 3 files changed, 190 insertions(+), 6 deletions(-)
 create mode 100644 test/CodeGen/PowerPC/ctrloop-shortLoops.ll

diff --git a/lib/Target/PowerPC/PPCCTRLoops.cpp b/lib/Target/PowerPC/PPCCTRLoops.cpp
index 8d61e81b1fc..8784a831902 100644
--- a/lib/Target/PowerPC/PPCCTRLoops.cpp
+++ b/lib/Target/PowerPC/PPCCTRLoops.cpp
@@ -26,12 +26,17 @@
 #include "PPC.h"
 #include "PPCSubtarget.h"
 #include "PPCTargetMachine.h"
+#include "PPCTargetTransformInfo.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/CodeMetrics.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/ScalarEvolutionExpander.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/CodeGen/TargetSchedule.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Dominators.h"
@@ -64,6 +69,13 @@ using namespace llvm;
 static cl::opt<int> CTRLoopLimit("ppc-max-ctrloop", cl::Hidden, cl::init(-1));
 #endif
 
+// The latency of mtctr is only justified if there are more than 4
+// comparisons that will be removed as a result.
+static cl::opt<unsigned>
+SmallCTRLoopThreshold("min-ctr-loop-threshold", cl::init(4), cl::Hidden,
+                      cl::desc("Loops with a constant trip count smaller than "
+                               "this value will not use the count register."));
+
 STATISTIC(NumCTRLoops, "Number of loops converted to CTR loops");
 
 namespace llvm {
@@ -95,6 +107,8 @@ namespace {
       AU.addRequired<DominatorTreeWrapperPass>();
       AU.addPreserved<DominatorTreeWrapperPass>();
       AU.addRequired<ScalarEvolutionWrapperPass>();
+      AU.addRequired<AssumptionCacheTracker>();
+      AU.addRequired<TargetTransformInfoWrapperPass>();
     }
 
   private:
@@ -107,10 +121,12 @@ namespace {
     const PPCTargetLowering *TLI;
     const DataLayout *DL;
     const TargetLibraryInfo *LibInfo;
+    const TargetTransformInfo *TTI;
     LoopInfo *LI;
     ScalarEvolution *SE;
     DominatorTree *DT;
     bool PreserveLCSSA;
+    TargetSchedModel SchedModel;
   };
 
   char PPCCTRLoops::ID = 0;
@@ -179,6 +195,7 @@ bool PPCCTRLoops::runOnFunction(Function &F) {
   LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
   SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
   DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
   DL = &F.getParent()->getDataLayout();
   auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
   LibInfo = TLIP ? &TLIP->getTLI() : nullptr;
@@ -462,10 +479,24 @@ bool PPCCTRLoops::mightUseCTR(BasicBlock *BB) {
   return false;
 }
 
-
 bool PPCCTRLoops::convertToCTRLoop(Loop *L) {
   bool MadeChange = false;
 
+  // Do not convert small short loops to CTR loop.
+  unsigned ConstTripCount = SE->getSmallConstantTripCount(L);
+  if (ConstTripCount && ConstTripCount < SmallCTRLoopThreshold) {
+    SmallPtrSet<const Value *, 4> EphValues;
+    auto AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
+        *L->getHeader()->getParent());
+    CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
+    CodeMetrics Metrics;
+    for (BasicBlock *BB : L->blocks())
+      Metrics.analyzeBasicBlock(BB, *TTI, EphValues);
+    // 6 is an approximate latency for the mtctr instruction.
+    if (Metrics.NumInsts <= (6 * SchedModel.getIssueWidth()))
+      return false;
+  }
+
   // Process nested loops first.
   for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I) {
     MadeChange |= convertToCTRLoop(*I);
diff --git a/test/CodeGen/PowerPC/ctr-minmaxnum.ll b/test/CodeGen/PowerPC/ctr-minmaxnum.ll
index 2b0a7cceb68..e38f851620b 100644
--- a/test/CodeGen/PowerPC/ctr-minmaxnum.ll
+++ b/test/CodeGen/PowerPC/ctr-minmaxnum.ll
@@ -1,6 +1,5 @@
-; RUN: llc -verify-machineinstrs -mcpu=pwr7 < %s | FileCheck %s
-; RUN: llc -verify-machineinstrs -mcpu=a2q < %s | FileCheck %s --check-prefix=QPX
-target triple = "powerpc64-unknown-linux-gnu"
+; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -verify-machineinstrs -mcpu=pwr7 < %s | FileCheck %s
+; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -verify-machineinstrs -mcpu=a2q < %s | FileCheck %s --check-prefix=QPX
 
 declare float @fabsf(float)
 
@@ -38,6 +37,9 @@ loop_exit:
 ; CHECK-LABEL: test1:
 ; CHECK-NOT: mtctr
 ; CHECK: bl fminf
+; CHECK-NOT: bl fminf
+; CHECK-NOT: mtctr
+; CHECK: blr
 
 define void @test1v(<4 x float> %f, <4 x float>* %fp) {
 entry:
@@ -48,7 +50,7 @@ loop_body:
   %0 = call <4 x float> @llvm.minnum.v4f32(<4 x float> %f, <4 x float> )
   store <4 x float> %0, <4 x float>* %fp, align 16
   %1 = add i64 %invar_address.dim.0.01, 1
-  %2 = icmp eq i64 %1, 2
+  %2 = icmp eq i64 %1, 4
   br i1 %2, label %loop_exit, label %loop_body
 
 loop_exit:
@@ -56,8 +58,15 @@ loop_exit:
 }
 
 ; CHECK-LABEL: test1v:
+; CHECK: bl fminf
+; CHECK-NOT: mtctr
+; CHECK: bl fminf
 ; CHECK-NOT: mtctr
 ; CHECK: bl fminf
+; CHECK-NOT: mtctr
+; CHECK: bl fminf
+; CHECK-NOT: bl fminf
+; CHECK: blr
 
 ; QPX-LABEL: test1v:
 ; QPX: mtctr
@@ -83,6 +92,9 @@ loop_exit:
 ; CHECK-LABEL: test1a:
 ; CHECK-NOT: mtctr
 ; CHECK: bl fminf
+; CHECK-NOT: bl fminf
+; CHECK-NOT: mtctr
+; CHECK: blr
 
 define void @test2(float %f, float* %fp) {
 entry:
@@ -103,6 +115,9 @@ loop_exit:
 ; CHECK-LABEL: test2:
 ; CHECK-NOT: mtctr
 ; CHECK: bl fmaxf
+; CHECK-NOT: bl fmaxf
+; CHECK-NOT: mtctr
+; CHECK: blr
 
 define void @test2v(<4 x double> %f, <4 x double>* %fp) {
 entry:
@@ -113,7 +128,7 @@ loop_body:
   %0 = call <4 x double> @llvm.maxnum.v4f64(<4 x double> %f, <4 x double> )
   store <4 x double> %0, <4 x double>* %fp, align 16
   %1 = add i64 %invar_address.dim.0.01, 1
-  %2 = icmp eq i64 %1, 2
+  %2 = icmp eq i64 %1, 4
   br i1 %2, label %loop_exit, label %loop_body
 
 loop_exit:
@@ -121,8 +136,15 @@ loop_exit:
 }
 
 ; CHECK-LABEL: test2v:
+; CHECK: bl fmax
+; CHECK-NOT: mtctr
+; CHECK: bl fmax
+; CHECK-NOT: mtctr
+; CHECK: bl fmax
 ; CHECK-NOT: mtctr
 ; CHECK: bl fmax
+; CHECK-NOT: bl fmax
+; CHECK: blr
 
 ; QPX-LABEL: test2v:
 ; QPX: mtctr
@@ -148,6 +170,9 @@ loop_exit:
 ; CHECK-LABEL: test2a:
 ; CHECK-NOT: mtctr
 ; CHECK: bl fmaxf
+; CHECK-NOT: bl fmaxf
+; CHECK-NOT: mtctr
+; CHECK: blr
 
 define void @test3(double %f, double* %fp) {
 entry:
@@ -168,6 +193,9 @@ loop_exit:
 ; CHECK-LABEL: test3:
 ; CHECK-NOT: mtctr
 ; CHECK: bl fmin
+; CHECK-NOT: bl fmin
+; CHECK-NOT: mtctr
+; CHECK: blr
 
 define void @test3a(double %f, double* %fp) {
 entry:
@@ -188,6 +216,9 @@ loop_exit:
 ; CHECK-LABEL: test3a:
 ; CHECK-NOT: mtctr
 ; CHECK: bl fmin
+; CHECK-NOT: bl fmin
+; CHECK-NOT: mtctr
+; CHECK: blr
 
 define void @test4(double %f, double* %fp) {
 entry:
@@ -208,6 +239,9 @@ loop_exit:
 ; CHECK-LABEL: test4:
 ; CHECK-NOT: mtctr
 ; CHECK: bl fmax
+; CHECK-NOT: bl fmax
+; CHECK-NOT: mtctr
+; CHECK: blr
 
 define void @test4a(double %f, double* %fp) {
 entry:
@@ -228,4 +262,7 @@ loop_exit:
 ; CHECK-LABEL: test4a:
 ; CHECK-NOT: mtctr
 ; CHECK: bl fmax
+; CHECK-NOT: bl fmax
+; CHECK-NOT: mtctr
+; CHECK: blr
 
diff --git a/test/CodeGen/PowerPC/ctrloop-shortLoops.ll b/test/CodeGen/PowerPC/ctrloop-shortLoops.ll
new file mode 100644
index 00000000000..481ec54e79a
--- /dev/null
+++ b/test/CodeGen/PowerPC/ctrloop-shortLoops.ll
@@ -0,0 +1,116 @@
+; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -verify-machineinstrs -mcpu=pwr8 | FileCheck %s
+; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -verify-machineinstrs -mcpu=a2q | FileCheck %s
+
+; Verify that we do NOT generate the mtctr instruction for loop trip counts < 4.
+; The latency of the mtctr is only justified if there are more than 4 comparisons that are removed as a result.
+
+@a = common local_unnamed_addr global i32 0, align 4
+@arr = common local_unnamed_addr global [5 x i32] zeroinitializer, align 4
+
+; Function Attrs: norecurse nounwind readonly
+define signext i32 @testTripCount2(i32 signext %a) {
+
+; CHECK-LABEL: testTripCount2:
+; CHECK-NOT: mtctr
+; CHECK: blr
+
+entry:
+  br label %for.body
+
+for.cond.cleanup:                       ; preds = %for.body
+  ret i32 %add
+
+for.body:                               ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ 1, %entry ], [ %indvars.iv.next, %for.body ]
+  %Sum.05 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds [5 x i32], [5 x i32]* @arr, i64 0, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %add = add nsw i32 %0, %Sum.05
+  %indvars.iv.next = add nsw i64 %indvars.iv, -1
+  %tobool = icmp eq i64 %indvars.iv, 0
+  br i1 %tobool, label %for.cond.cleanup, label %for.body
+}
+
+; Function Attrs: norecurse nounwind readonly
+define signext i32 @testTripCount3(i32 signext %a) {
+
+; CHECK-LABEL: testTripCount3:
+; CHECK-NOT: mtctr
+; CHECK: blr
+
+entry:
+  br label %for.body
+
+for.cond.cleanup:                       ; preds = %for.body
+  ret i32 %add
+
+for.body:                               ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ 2, %entry ], [ %indvars.iv.next, %for.body ]
+  %Sum.05 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds [5 x i32], [5 x i32]* @arr, i64 0, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %add = add nsw i32 %0, %Sum.05
+  %indvars.iv.next = add nsw i64 %indvars.iv, -1
+  %tobool = icmp eq i64 %indvars.iv, 0
+  br i1 %tobool, label %for.cond.cleanup, label %for.body
+}
+
+; Function Attrs: norecurse nounwind readonly
+
+define signext i32 @testTripCount4(i32 signext %a) {
+
+; CHECK-LABEL: testTripCount4:
+; CHECK: mtctr
+; CHECK: bdnz
+
+entry:
+  br label %for.body
+
+for.cond.cleanup:                       ; preds = %for.body
+  ret i32 %add
+
+for.body:                               ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ 3, %entry ], [ %indvars.iv.next, %for.body ]
+  %Sum.05 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds [5 x i32], [5 x i32]* @arr, i64 0, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %add = add nsw i32 %0, %Sum.05
+  %indvars.iv.next = add nsw i64 %indvars.iv, -1
+  %tobool = icmp eq i64 %indvars.iv, 0
+  br i1 %tobool, label %for.cond.cleanup, label %for.body
+}
+
+; Function Attrs: norecurse nounwind
+define signext i32 @testTripCount2NonSmallLoop() {
+
+; CHECK-LABEL: testTripCount2NonSmallLoop:
+; CHECK: mtctr
+; CHECK: blr
+
+entry:
+  %.pre = load i32, i32* @a, align 4
+  br label %for.body
+
+for.body:                               ; preds = %entry, %if.end
+  %0 = phi i32 [ %.pre, %entry ], [ %1, %if.end ]
+  %dec4 = phi i32 [ 1, %entry ], [ %dec, %if.end ]
+  %b.03 = phi i8 [ 0, %entry ], [ %b.1, %if.end ]
+  %tobool1 = icmp eq i32 %0, 0
+  br i1 %tobool1, label %if.end, label %if.then
+
+if.then:                                ; preds = %for.body
+  store i32 2, i32* @a, align 4
+  br label %if.end
+
+if.end:                                 ; preds = %for.body, %if.then
+  %1 = phi i32 [ 2, %if.then ], [ 0, %for.body ]
+  %b.1 = phi i8 [ 2, %if.then ], [ %b.03, %for.body ]
+  %dec = add nsw i32 %dec4, -1
+  %tobool = icmp eq i32 %dec4, 0
+  br i1 %tobool, label %for.end, label %for.body
+
+for.end:                                ; preds = %if.end
+  %conv = zext i8 %b.1 to i32
+  ret i32 %conv
+}
+
-- 
2.50.1
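For reference, the profitability rule added to convertToCTRLoop() boils down to
the standalone sketch below. This is an illustration only, not the pass code:
the names (isProfitableToUseCTR, MtctrLatency) and the plain-integer parameters
are hypothetical stand-ins for what the pass actually derives from
ScalarEvolution (constant trip count), CodeMetrics (instruction count), and
TargetSchedModel (issue width).

  #include <cstdio>

  // Approximate mtctr latency used by the patch, and the default value of the
  // -min-ctr-loop-threshold option it introduces.
  static const unsigned MtctrLatency = 6;
  static const unsigned SmallCTRLoopThreshold = 4;

  // Returns true if turning the loop into an mtctr/bdnz loop is worthwhile.
  // ConstTripCount == 0 means the trip count is not a compile-time constant.
  static bool isProfitableToUseCTR(unsigned ConstTripCount, unsigned LoopInsts,
                                   unsigned IssueWidth) {
    if (ConstTripCount && ConstTripCount < SmallCTRLoopThreshold &&
        LoopInsts <= MtctrLatency * IssueWidth)
      return false; // Small, short loop: removed compares don't pay for mtctr.
    return true;
  }

  int main() {
    // Trip count 2 with a tiny body: keep the compare-and-branch loop.
    std::printf("%d\n", isProfitableToUseCTR(2, 10, 8)); // prints 0
    // Trip count 4 (at the threshold): converted to a CTR loop.
    std::printf("%d\n", isProfitableToUseCTR(4, 10, 8)); // prints 1
    return 0;
  }

The new testTripCount2 and testTripCount3 tests exercise the first case, while
testTripCount4 and testTripCount2NonSmallLoop cover loops that still convert.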