#include "PPC.h"
#include "PPCSubtarget.h"
#include "PPCTargetMachine.h"
+#include "PPCTargetTransformInfo.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
static cl::opt<int> CTRLoopLimit("ppc-max-ctrloop", cl::Hidden, cl::init(-1));
#endif
+// The latency of mtctr is only justified if at least 4 comparisons will be
+// removed as a result (i.e. the constant trip count reaches the threshold).
+static cl::opt<unsigned>
+SmallCTRLoopThreshold("min-ctr-loop-threshold", cl::init(4), cl::Hidden,
+ cl::desc("Loops with a constant trip count smaller than "
+ "this value will not use the count register."));
+
STATISTIC(NumCTRLoops, "Number of loops converted to CTR loops");
namespace llvm {
AU.addRequired<DominatorTreeWrapperPass>();
AU.addPreserved<DominatorTreeWrapperPass>();
AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
}
private:
const PPCTargetLowering *TLI;
const DataLayout *DL;
const TargetLibraryInfo *LibInfo;
+ const TargetTransformInfo *TTI;
LoopInfo *LI;
ScalarEvolution *SE;
DominatorTree *DT;
bool PreserveLCSSA;
+ TargetSchedModel SchedModel;
};
char PPCCTRLoops::ID = 0;
LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
DL = &F.getParent()->getDataLayout();
auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
LibInfo = TLIP ? &TLIP->getTLI() : nullptr;
return false;
}
-
bool PPCCTRLoops::convertToCTRLoop(Loop *L) {
bool MadeChange = false;
+ // Do not convert small short loops to CTR loop.
+ unsigned ConstTripCount = SE->getSmallConstantTripCount(L);
+ if (ConstTripCount && ConstTripCount < SmallCTRLoopThreshold) {
+ SmallPtrSet<const Value *, 32> EphValues;
+ auto AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
+ *L->getHeader()->getParent());
+ CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
+ CodeMetrics Metrics;
+ for (BasicBlock *BB : L->blocks())
+ Metrics.analyzeBasicBlock(BB, *TTI, EphValues);
+ // 6 is an approximate latency for the mtctr instruction.
+ if (Metrics.NumInsts <= (6 * SchedModel.getIssueWidth()))
+ return false;
+ }
+
// Process nested loops first.
for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I) {
MadeChange |= convertToCTRLoop(*I);
-; RUN: llc -verify-machineinstrs -mcpu=pwr7 < %s | FileCheck %s
-; RUN: llc -verify-machineinstrs -mcpu=a2q < %s | FileCheck %s --check-prefix=QPX
-target triple = "powerpc64-unknown-linux-gnu"
+; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -verify-machineinstrs -mcpu=pwr7 < %s | FileCheck %s
+; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -verify-machineinstrs -mcpu=a2q < %s | FileCheck %s --check-prefix=QPX
declare float @fabsf(float)
; CHECK-LABEL: test1:
; CHECK-NOT: mtctr
; CHECK: bl fminf
+; CHECK-NOT: bl fminf
+; CHECK-NOT: mtctr
+; CHECK: blr
define void @test1v(<4 x float> %f, <4 x float>* %fp) {
entry:
%0 = call <4 x float> @llvm.minnum.v4f32(<4 x float> %f, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
store <4 x float> %0, <4 x float>* %fp, align 16
%1 = add i64 %invar_address.dim.0.01, 1
- %2 = icmp eq i64 %1, 2
+ %2 = icmp eq i64 %1, 4
br i1 %2, label %loop_exit, label %loop_body
loop_exit:
}
; CHECK-LABEL: test1v:
+; CHECK: bl fminf
+; CHECK-NOT: mtctr
+; CHECK: bl fminf
; CHECK-NOT: mtctr
; CHECK: bl fminf
+; CHECK-NOT: mtctr
+; CHECK: bl fminf
+; CHECK-NOT: bl fminf
+; CHECK: blr
; QPX-LABEL: test1v:
; QPX: mtctr
; CHECK-LABEL: test1a:
; CHECK-NOT: mtctr
; CHECK: bl fminf
+; CHECK-NOT: bl fminf
+; CHECK-NOT: mtctr
+; CHECK: blr
define void @test2(float %f, float* %fp) {
entry:
; CHECK-LABEL: test2:
; CHECK-NOT: mtctr
; CHECK: bl fmaxf
+; CHECK-NOT: bl fmaxf
+; CHECK-NOT: mtctr
+; CHECK: blr
define void @test2v(<4 x double> %f, <4 x double>* %fp) {
entry:
%0 = call <4 x double> @llvm.maxnum.v4f64(<4 x double> %f, <4 x double> <double 1.0, double 1.0, double 1.0, double 1.0>)
store <4 x double> %0, <4 x double>* %fp, align 16
%1 = add i64 %invar_address.dim.0.01, 1
- %2 = icmp eq i64 %1, 2
+ %2 = icmp eq i64 %1, 4
br i1 %2, label %loop_exit, label %loop_body
loop_exit:
}
; CHECK-LABEL: test2v:
+; CHECK: bl fmax
+; CHECK-NOT: mtctr
+; CHECK: bl fmax
+; CHECK-NOT: mtctr
+; CHECK: bl fmax
; CHECK-NOT: mtctr
; CHECK: bl fmax
+; CHECK-NOT: bl fmax
+; CHECK: blr
; QPX-LABEL: test2v:
; QPX: mtctr
; CHECK-LABEL: test2a:
; CHECK-NOT: mtctr
; CHECK: bl fmaxf
+; CHECK-NOT: bl fmaxf
+; CHECK-NOT: mtctr
+; CHECK: blr
define void @test3(double %f, double* %fp) {
entry:
; CHECK-LABEL: test3:
; CHECK-NOT: mtctr
; CHECK: bl fmin
+; CHECK-NOT: bl fmin
+; CHECK-NOT: mtctr
+; CHECK: blr
define void @test3a(double %f, double* %fp) {
entry:
; CHECK-LABEL: test3a:
; CHECK-NOT: mtctr
; CHECK: bl fmin
+; CHECK-NOT: bl fmin
+; CHECK-NOT: mtctr
+; CHECK: blr
define void @test4(double %f, double* %fp) {
entry:
; CHECK-LABEL: test4:
; CHECK-NOT: mtctr
; CHECK: bl fmax
+; CHECK-NOT: bl fmax
+; CHECK-NOT: mtctr
+; CHECK: blr
define void @test4a(double %f, double* %fp) {
entry:
; CHECK-LABEL: test4a:
; CHECK-NOT: mtctr
; CHECK: bl fmax
+; CHECK-NOT: bl fmax
+; CHECK-NOT: mtctr
+; CHECK: blr
--- /dev/null
+; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -verify-machineinstrs -mcpu=pwr8 | FileCheck %s
+; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -verify-machineinstrs -mcpu=a2q | FileCheck %s
+
+; Verify that we do NOT generate the mtctr instruction for constant loop trip
+; counts < 4, and that we DO generate it once the trip count reaches 4. The
+; latency of the mtctr is only justified if at least 4 comparisons are removed
+; as a result.
+
+@a = common local_unnamed_addr global i32 0, align 4
+@arr = common local_unnamed_addr global [5 x i32] zeroinitializer, align 4
+
+; Function Attrs: norecurse nounwind readonly
+define signext i32 @testTripCount2(i32 signext %a) {
+
+; CHECK-LABEL: testTripCount2:
+; CHECK-NOT: mtctr
+; CHECK: blr
+
+; %indvars.iv steps 1 -> 0 and the loop exits after the iteration in which it
+; reaches 0, so the body runs exactly 2 times. Trip count 2 is below the
+; small-CTR-loop threshold (default 4), so no mtctr should be emitted.
+entry:
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret i32 %add
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ 1, %entry ], [ %indvars.iv.next, %for.body ]
+  %Sum.05 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds [5 x i32], [5 x i32]* @arr, i64 0, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %add = add nsw i32 %0, %Sum.05
+  %indvars.iv.next = add nsw i64 %indvars.iv, -1
+  %tobool = icmp eq i64 %indvars.iv, 0
+  br i1 %tobool, label %for.cond.cleanup, label %for.body
+}
+
+; Function Attrs: norecurse nounwind readonly
+define signext i32 @testTripCount3(i32 signext %a) {
+
+; CHECK-LABEL: testTripCount3:
+; CHECK-NOT: mtctr
+; CHECK: blr
+
+; %indvars.iv steps 2 -> 0, so the body runs 3 times. Trip count 3 is still
+; below the small-CTR-loop threshold (default 4), so no mtctr is expected.
+entry:
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret i32 %add
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ 2, %entry ], [ %indvars.iv.next, %for.body ]
+  %Sum.05 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds [5 x i32], [5 x i32]* @arr, i64 0, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %add = add nsw i32 %0, %Sum.05
+  %indvars.iv.next = add nsw i64 %indvars.iv, -1
+  %tobool = icmp eq i64 %indvars.iv, 0
+  br i1 %tobool, label %for.cond.cleanup, label %for.body
+}
+
+; Function Attrs: norecurse nounwind readonly
+
+define signext i32 @testTripCount4(i32 signext %a) {
+
+; CHECK-LABEL: testTripCount4:
+; CHECK: mtctr
+; CHECK: bdnz
+
+; %indvars.iv steps 3 -> 0, so the body runs 4 times. Trip count 4 reaches
+; the small-CTR-loop threshold (default 4), so the loop SHOULD be converted:
+; mtctr sets up the count register and bdnz forms the counted-loop branch.
+entry:
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret i32 %add
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ 3, %entry ], [ %indvars.iv.next, %for.body ]
+  %Sum.05 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds [5 x i32], [5 x i32]* @arr, i64 0, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %add = add nsw i32 %0, %Sum.05
+  %indvars.iv.next = add nsw i64 %indvars.iv, -1
+  %tobool = icmp eq i64 %indvars.iv, 0
+  br i1 %tobool, label %for.cond.cleanup, label %for.body
+}
+
+; Function Attrs: norecurse nounwind
+define signext i32 @testTripCount2NonSmallLoop() {
+
+; CHECK-LABEL: testTripCount2NonSmallLoop:
+; CHECK: mtctr
+; CHECK: blr
+
+; %dec4 steps 1 -> 0, so the trip count is only 2, yet mtctr IS expected:
+; per the test name this multi-block loop does not qualify as "small".
+; NOTE(review): presumably its CodeMetrics instruction count exceeds the
+; 6 * issue-width cutoff in the pass -- confirm against PPCCTRLoops.
+entry:
+  %.pre = load i32, i32* @a, align 4
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %if.end
+  %0 = phi i32 [ %.pre, %entry ], [ %1, %if.end ]
+  %dec4 = phi i32 [ 1, %entry ], [ %dec, %if.end ]
+  %b.03 = phi i8 [ 0, %entry ], [ %b.1, %if.end ]
+  %tobool1 = icmp eq i32 %0, 0
+  br i1 %tobool1, label %if.end, label %if.then
+
+if.then:                                          ; preds = %for.body
+  store i32 2, i32* @a, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %for.body, %if.then
+  %1 = phi i32 [ 2, %if.then ], [ 0, %for.body ]
+  %b.1 = phi i8 [ 2, %if.then ], [ %b.03, %for.body ]
+  %dec = add nsw i32 %dec4, -1
+  %tobool = icmp eq i32 %dec4, 0
+  br i1 %tobool, label %for.end, label %for.body
+
+for.end:                                          ; preds = %if.end
+  %conv = zext i8 %b.1 to i32
+  ret i32 %conv
+}
+