[PowerPC] Add profitability check for conversion to mtctr loops

author Lei Huang <lei@ca.ibm.com>

Thu, 12 Oct 2017 16:43:33 +0000 (16:43 +0000)

committer Lei Huang <lei@ca.ibm.com>

Thu, 12 Oct 2017 16:43:33 +0000 (16:43 +0000)
author Lei Huang <lei@ca.ibm.com>
Thu, 12 Oct 2017 16:43:33 +0000 (16:43 +0000)
committer Lei Huang <lei@ca.ibm.com>
Thu, 12 Oct 2017 16:43:33 +0000 (16:43 +0000)
diff --git a/lib/Target/PowerPC/PPCCTRLoops.cpp b/lib/Target/PowerPC/PPCCTRLoops.cpp

index 8d61e81b1fc7ca2934040c2e119bf8cd854b863d..8784a83190292d5d92c95f8e90eff87c57f3e3ed 100644 (file)
--- a/lib/Target/PowerPC/PPCCTRLoops.cpp
+++ b/lib/Target/PowerPC/PPCCTRLoops.cpp
@@ -26,12 +26,17 @@
  #include "PPC.h"
  #include "PPCSubtarget.h"
  #include "PPCTargetMachine.h"
+#include "PPCTargetTransformInfo.h"
  #include "llvm/ADT/STLExtras.h"
  #include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/CodeMetrics.h"
  #include "llvm/Analysis/LoopInfo.h"
  #include "llvm/Analysis/ScalarEvolutionExpander.h"
  #include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
  #include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/CodeGen/TargetSchedule.h"
  #include "llvm/IR/Constants.h"
  #include "llvm/IR/DerivedTypes.h"
  #include "llvm/IR/Dominators.h"
@@ -64,6 +69,13 @@ using namespace llvm;
  static cl::opt<int> CTRLoopLimit("ppc-max-ctrloop", cl::Hidden, cl::init(-1));
  #endif
  
+// The latency of mtctr is only justified if there are at least 4
+// comparisons that will be removed as a result.
+static cl::opt<unsigned>
+SmallCTRLoopThreshold("min-ctr-loop-threshold", cl::init(4), cl::Hidden,
+                      cl::desc("Loops with a constant trip count smaller than "
+                               "this value will not use the count register."));
+
  STATISTIC(NumCTRLoops, "Number of loops converted to CTR loops");
  
  namespace llvm {
@@ -95,6 +107,8 @@ namespace {
        AU.addRequired<DominatorTreeWrapperPass>();
        AU.addPreserved<DominatorTreeWrapperPass>();
        AU.addRequired<ScalarEvolutionWrapperPass>();
+      AU.addRequired<AssumptionCacheTracker>();
+      AU.addRequired<TargetTransformInfoWrapperPass>();
      }
  
    private:
@@ -107,10 +121,12 @@ namespace {
      const PPCTargetLowering *TLI;
      const DataLayout *DL;
      const TargetLibraryInfo *LibInfo;
+    const TargetTransformInfo *TTI;
      LoopInfo *LI;
      ScalarEvolution *SE;
      DominatorTree *DT;
      bool PreserveLCSSA;
+    TargetSchedModel SchedModel;
    };
  
    char PPCCTRLoops::ID = 0;
@@ -179,6 +195,7 @@ bool PPCCTRLoops::runOnFunction(Function &F) {
    LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
    SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
    DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
    DL = &F.getParent()->getDataLayout();
    auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
    LibInfo = TLIP ? &TLIP->getTLI() : nullptr;
@@ -462,10 +479,24 @@ bool PPCCTRLoops::mightUseCTR(BasicBlock *BB) {
  
    return false;
  }
-
  bool PPCCTRLoops::convertToCTRLoop(Loop *L) {
    bool MadeChange = false;
  
+  // Do not convert small loops with a short constant trip count to CTR loops.
+  unsigned ConstTripCount = SE->getSmallConstantTripCount(L);
+  if (ConstTripCount && ConstTripCount < SmallCTRLoopThreshold) {
+    SmallPtrSet<const Value *, 32> EphValues;
+    auto AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
+        *L->getHeader()->getParent());
+    CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
+    CodeMetrics Metrics;
+    for (BasicBlock *BB : L->blocks())
+      Metrics.analyzeBasicBlock(BB, *TTI, EphValues);
+    // 6 is an approximate latency for the mtctr instruction.
+    if (Metrics.NumInsts <= (6 * SchedModel.getIssueWidth()))
+      return false;
+  }
+
    // Process nested loops first.
    for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I) {
      MadeChange |= convertToCTRLoop(*I);
diff --git a/test/CodeGen/PowerPC/ctr-minmaxnum.ll b/test/CodeGen/PowerPC/ctr-minmaxnum.ll

index 2b0a7cceb68d3fc97d7279b5f02007190e63cb0a..e38f851620b7397a68801268435c6c3736abf423 100644 (file)
--- a/test/CodeGen/PowerPC/ctr-minmaxnum.ll
+++ b/test/CodeGen/PowerPC/ctr-minmaxnum.ll
@@ -1,6 +1,5 @@
-; RUN: llc -verify-machineinstrs -mcpu=pwr7 < %s | FileCheck %s
-; RUN: llc -verify-machineinstrs -mcpu=a2q < %s | FileCheck %s --check-prefix=QPX
-target triple = "powerpc64-unknown-linux-gnu"
+; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -verify-machineinstrs -mcpu=pwr7 < %s | FileCheck %s
+; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -verify-machineinstrs -mcpu=a2q < %s | FileCheck %s --check-prefix=QPX
  
  declare float @fabsf(float)
  
@@ -38,6 +37,9 @@ loop_exit:
  ; CHECK-LABEL: test1:
  ; CHECK-NOT: mtctr
  ; CHECK: bl fminf
+; CHECK-NOT: bl fminf
+; CHECK-NOT: mtctr
+; CHECK: blr
  
  define void @test1v(<4 x float> %f, <4 x float>* %fp) {
  entry:
@@ -48,7 +50,7 @@ loop_body:
    %0 = call <4 x float> @llvm.minnum.v4f32(<4 x float> %f, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
    store <4 x float> %0, <4 x float>* %fp, align 16
    %1 = add i64 %invar_address.dim.0.01, 1
-  %2 = icmp eq i64 %1, 2
+  %2 = icmp eq i64 %1, 4
    br i1 %2, label %loop_exit, label %loop_body
  
  loop_exit:
@@ -56,8 +58,15 @@ loop_exit:
  }
  
  ; CHECK-LABEL: test1v:
+; CHECK: bl fminf
+; CHECK-NOT: mtctr
+; CHECK: bl fminf
  ; CHECK-NOT: mtctr
  ; CHECK: bl fminf
+; CHECK-NOT: mtctr
+; CHECK: bl fminf
+; CHECK-NOT: bl fminf
+; CHECK: blr
  
  ; QPX-LABEL: test1v:
  ; QPX: mtctr
@@ -83,6 +92,9 @@ loop_exit:
  ; CHECK-LABEL: test1a:
  ; CHECK-NOT: mtctr
  ; CHECK: bl fminf
+; CHECK-NOT: bl fminf
+; CHECK-NOT: mtctr
+; CHECK: blr
  
  define void @test2(float %f, float* %fp) {
  entry:
@@ -103,6 +115,9 @@ loop_exit:
  ; CHECK-LABEL: test2:
  ; CHECK-NOT: mtctr
  ; CHECK: bl fmaxf
+; CHECK-NOT: bl fmaxf
+; CHECK-NOT: mtctr
+; CHECK: blr
  
  define void @test2v(<4 x double> %f, <4 x double>* %fp) {
  entry:
@@ -113,7 +128,7 @@ loop_body:
    %0 = call <4 x double> @llvm.maxnum.v4f64(<4 x double> %f, <4 x double> <double 1.0, double 1.0, double 1.0, double 1.0>)
    store <4 x double> %0, <4 x double>* %fp, align 16
    %1 = add i64 %invar_address.dim.0.01, 1
-  %2 = icmp eq i64 %1, 2
+  %2 = icmp eq i64 %1, 4
    br i1 %2, label %loop_exit, label %loop_body
  
  loop_exit:
@@ -121,8 +136,15 @@ loop_exit:
  }
  
  ; CHECK-LABEL: test2v:
+; CHECK: bl fmax
+; CHECK-NOT: mtctr
+; CHECK: bl fmax
+; CHECK-NOT: mtctr
+; CHECK: bl fmax
  ; CHECK-NOT: mtctr
  ; CHECK: bl fmax
+; CHECK-NOT: bl fmax
+; CHECK: blr
  
  ; QPX-LABEL: test2v:
  ; QPX: mtctr
@@ -148,6 +170,9 @@ loop_exit:
  ; CHECK-LABEL: test2a:
  ; CHECK-NOT: mtctr
  ; CHECK: bl fmaxf
+; CHECK-NOT: bl fmaxf
+; CHECK-NOT: mtctr
+; CHECK: blr
  
  define void @test3(double %f, double* %fp) {
  entry:
@@ -168,6 +193,9 @@ loop_exit:
  ; CHECK-LABEL: test3:
  ; CHECK-NOT: mtctr
  ; CHECK: bl fmin
+; CHECK-NOT: bl fmin
+; CHECK-NOT: mtctr
+; CHECK: blr
  
  define void @test3a(double %f, double* %fp) {
  entry:
@@ -188,6 +216,9 @@ loop_exit:
  ; CHECK-LABEL: test3a:
  ; CHECK-NOT: mtctr
  ; CHECK: bl fmin
+; CHECK-NOT: bl fmin
+; CHECK-NOT: mtctr
+; CHECK: blr
  
  define void @test4(double %f, double* %fp) {
  entry:
@@ -208,6 +239,9 @@ loop_exit:
  ; CHECK-LABEL: test4:
  ; CHECK-NOT: mtctr
  ; CHECK: bl fmax
+; CHECK-NOT: bl fmax
+; CHECK-NOT: mtctr
+; CHECK: blr
  
  define void @test4a(double %f, double* %fp) {
  entry:
@@ -228,4 +262,7 @@ loop_exit:
  ; CHECK-LABEL: test4a:
  ; CHECK-NOT: mtctr
  ; CHECK: bl fmax
+; CHECK-NOT: bl fmax
+; CHECK-NOT: mtctr
+; CHECK: blr
  
diff --git a/test/CodeGen/PowerPC/ctrloop-shortLoops.ll b/test/CodeGen/PowerPC/ctrloop-shortLoops.ll

new file mode 100644 (file)

index 0000000..481ec54
--- /dev/null
+++ b/test/CodeGen/PowerPC/ctrloop-shortLoops.ll
@@ -0,0 +1,116 @@
+; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -verify-machineinstrs -mcpu=pwr8 | FileCheck %s
+; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -verify-machineinstrs -mcpu=a2q | FileCheck %s
+
+; Verify that we do NOT generate the mtctr instruction for loop trip counts < 4.
+; The latency of the mtctr is only justified if at least 4 comparisons are removed as a result.
+
+@a = common local_unnamed_addr global i32 0, align 4
+@arr = common local_unnamed_addr global [5 x i32] zeroinitializer, align 4
+
+; Function Attrs: norecurse nounwind readonly
+define signext i32 @testTripCount2(i32 signext %a) {
+
+; CHECK-LABEL: testTripCount2:
+; CHECK-NOT: mtctr
+; CHECK: blr
+
+entry:
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret i32 %add
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ 1, %entry ], [ %indvars.iv.next, %for.body ]
+  %Sum.05 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds [5 x i32], [5 x i32]* @arr, i64 0, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %add = add nsw i32 %0, %Sum.05
+  %indvars.iv.next = add nsw i64 %indvars.iv, -1
+  %tobool = icmp eq i64 %indvars.iv, 0
+  br i1 %tobool, label %for.cond.cleanup, label %for.body
+}
+
+; Function Attrs: norecurse nounwind readonly
+define signext i32 @testTripCount3(i32 signext %a) {
+
+; CHECK-LABEL: testTripCount3:
+; CHECK-NOT: mtctr
+; CHECK: blr
+
+entry:
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret i32 %add
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ 2, %entry ], [ %indvars.iv.next, %for.body ]
+  %Sum.05 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds [5 x i32], [5 x i32]* @arr, i64 0, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %add = add nsw i32 %0, %Sum.05
+  %indvars.iv.next = add nsw i64 %indvars.iv, -1
+  %tobool = icmp eq i64 %indvars.iv, 0
+  br i1 %tobool, label %for.cond.cleanup, label %for.body
+}
+
+; Function Attrs: norecurse nounwind readonly
+
+define signext i32 @testTripCount4(i32 signext %a) {
+
+; CHECK-LABEL: testTripCount4:
+; CHECK: mtctr
+; CHECK: bdnz
+
+entry:
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret i32 %add
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ 3, %entry ], [ %indvars.iv.next, %for.body ]
+  %Sum.05 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds [5 x i32], [5 x i32]* @arr, i64 0, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %add = add nsw i32 %0, %Sum.05
+  %indvars.iv.next = add nsw i64 %indvars.iv, -1
+  %tobool = icmp eq i64 %indvars.iv, 0
+  br i1 %tobool, label %for.cond.cleanup, label %for.body
+}
+
+; Function Attrs: norecurse nounwind
+define signext i32 @testTripCount2NonSmallLoop() {
+
+; CHECK-LABEL: testTripCount2NonSmallLoop:
+; CHECK: mtctr
+; CHECK: blr
+
+entry:
+  %.pre = load i32, i32* @a, align 4
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %if.end
+  %0 = phi i32 [ %.pre, %entry ], [ %1, %if.end ]
+  %dec4 = phi i32 [ 1, %entry ], [ %dec, %if.end ]
+  %b.03 = phi i8 [ 0, %entry ], [ %b.1, %if.end ]
+  %tobool1 = icmp eq i32 %0, 0
+  br i1 %tobool1, label %if.end, label %if.then
+
+if.then:                                          ; preds = %for.body
+  store i32 2, i32* @a, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %for.body, %if.then
+  %1 = phi i32 [ 2, %if.then ], [ 0, %for.body ]
+  %b.1 = phi i8 [ 2, %if.then ], [ %b.03, %for.body ]
+  %dec = add nsw i32 %dec4, -1
+  %tobool = icmp eq i32 %dec4, 0
+  br i1 %tobool, label %for.end, label %for.body
+
+for.end:                                          ; preds = %if.end
+  %conv = zext i8 %b.1 to i32
+  ret i32 %conv
+}
+
author	Lei Huang <lei@ca.ibm.com>
	Thu, 12 Oct 2017 16:43:33 +0000 (16:43 +0000)
committer	Lei Huang <lei@ca.ibm.com>
	Thu, 12 Oct 2017 16:43:33 +0000 (16:43 +0000)
lib/Target/PowerPC/PPCCTRLoops.cpp		patch \| blob \| history
test/CodeGen/PowerPC/ctr-minmaxnum.ll		patch \| blob \| history
test/CodeGen/PowerPC/ctrloop-shortLoops.ll	[new file with mode: 0644]	patch \| blob