Improve profile-guided heuristics to use estimated trip count.

author Taewook Oh <twoh@fb.com>

Mon, 19 Jun 2017 18:48:58 +0000 (18:48 +0000)

committer Taewook Oh <twoh@fb.com>

Mon, 19 Jun 2017 18:48:58 +0000 (18:48 +0000)
author Taewook Oh <twoh@fb.com>
Mon, 19 Jun 2017 18:48:58 +0000 (18:48 +0000)
committer Taewook Oh <twoh@fb.com>
Mon, 19 Jun 2017 18:48:58 +0000 (18:48 +0000)
diff --git a/include/llvm/Transforms/Vectorize/LoopVectorize.h b/include/llvm/Transforms/Vectorize/LoopVectorize.h

index 73d1f264c37b53ca4e2c263c8a71028c5639f4be..57d10c4c74734502feb561ff48c0ded659c6d15b 100644 (file)
--- a/include/llvm/Transforms/Vectorize/LoopVectorize.h
+++ b/include/llvm/Transforms/Vectorize/LoopVectorize.h
@@ -87,8 +87,6 @@ struct LoopVectorizePass : public PassInfoMixin<LoopVectorizePass> {
    std::function<const LoopAccessInfo &(Loop &)> *GetLAA;
    OptimizationRemarkEmitter *ORE;
  
-  BlockFrequency ColdEntryFreq;
-
    PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
  
    // Shim for old PM.
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp

index 1abdb2484850656486147667972988380e249268..eac2867233bc0360b10c1949078d571b919c65d1 100644 (file)
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5702,14 +5702,14 @@ bool LoopVectorizationLegality::memoryInstructionCanBeWidened(Instruction *I,
  void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
  
    // We should not collect Uniforms more than once per VF. Right now,
-  // this function is called from collectUniformsAndScalars(), which 
+  // this function is called from collectUniformsAndScalars(), which
    // already does this check. Collecting Uniforms for VF=1 does not make any
    // sense.
  
    assert(VF >= 2 && !Uniforms.count(VF) &&
           "This function should not be visited twice for the same VF");
  
-  // Visit the list of Uniforms. If we'll not find any uniform value, we'll 
+  // Visit the list of Uniforms. If we'll not find any uniform value, we'll
    // not analyze again.  Uniforms.count(VF) will return 1.
    Uniforms[VF].clear();
  
@@ -5988,10 +5988,10 @@ void InterleavedAccessInfo::collectConstStrideAccesses(
          continue;
  
        Value *Ptr = getPointerOperand(&I);
-      // We don't check wrapping here because we don't know yet if Ptr will be 
-      // part of a full group or a group with gaps. Checking wrapping for all 
+      // We don't check wrapping here because we don't know yet if Ptr will be
+      // part of a full group or a group with gaps. Checking wrapping for all
        // pointers (even those that end up in groups with no gaps) will be overly
-      // conservative. For full groups, wrapping should be ok since if we would 
+      // conservative. For full groups, wrapping should be ok since if we would
        // wrap around the address space we would do a memory access at nullptr
        // even without the transformation. The wrapping checks are therefore
        // deferred until after we've formed the interleaved groups.
@@ -6244,7 +6244,7 @@ void InterleavedAccessInfo::analyzeInterleaving(
      Instruction *LastMember = Group->getMember(Group->getFactor() - 1);
      if (LastMember) {
        Value *LastMemberPtr = getPointerOperand(LastMember);
-      if (!getPtrStride(PSE, LastMemberPtr, TheLoop, Strides, /*Assume=*/false, 
+      if (!getPtrStride(PSE, LastMemberPtr, TheLoop, Strides, /*Assume=*/false,
                          /*ShouldCheckWrap=*/true)) {
          DEBUG(dbgs() << "LV: Invalidate candidate interleaved group due to "
                          "last group member potentially pointer-wrapping.\n");
@@ -6252,9 +6252,9 @@ void InterleavedAccessInfo::analyzeInterleaving(
        }
      } else {
        // Case 3: A non-reversed interleaved load group with gaps: We need
-      // to execute at least one scalar epilogue iteration. This will ensure 
+      // to execute at least one scalar epilogue iteration. This will ensure
        // we don't speculatively access memory out-of-bounds. We only need
-      // to look for a member at index factor - 1, since every group must have 
+      // to look for a member at index factor - 1, since every group must have
        // a member at index zero.
        if (Group->isReverse()) {
          releaseGroup(Group);
@@ -7789,8 +7789,18 @@ bool LoopVectorizePass::processLoop(Loop *L) {
  
    // Check the loop for a trip count threshold:
    // do not vectorize loops with a tiny trip count.
-  const unsigned MaxTC = SE->getSmallConstantMaxTripCount(L);
-  if (MaxTC > 0u && MaxTC < TinyTripCountVectorThreshold) {
+  unsigned ExpectedTC = SE->getSmallConstantMaxTripCount(L);
+  bool HasExpectedTC = (ExpectedTC > 0);
+
+  if (!HasExpectedTC && LoopVectorizeWithBlockFrequency) {
+    auto EstimatedTC = getLoopEstimatedTripCount(L);
+    if (EstimatedTC) {
+      ExpectedTC = *EstimatedTC;
+      HasExpectedTC = true;
+    }
+  }
+
+  if (HasExpectedTC && ExpectedTC < TinyTripCountVectorThreshold) {
      DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
                   << "This loop is not worth vectorizing.");
      if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
@@ -7822,18 +7832,6 @@ bool LoopVectorizePass::processLoop(Loop *L) {
    bool OptForSize =
        Hints.getForce() != LoopVectorizeHints::FK_Enabled && F->optForSize();
  
-  // Compute the weighted frequency of this loop being executed and see if it
-  // is less than 20% of the function entry baseline frequency. Note that we
-  // always have a canonical loop here because we think we *can* vectorize.
-  // FIXME: This is hidden behind a flag due to pervasive problems with
-  // exactly what block frequency models.
-  if (LoopVectorizeWithBlockFrequency) {
-    BlockFrequency LoopEntryFreq = BFI->getBlockFreq(L->getLoopPreheader());
-    if (Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
-        LoopEntryFreq < ColdEntryFreq)
-      OptForSize = true;
-  }
-
    // Check the function attributes to see if implicit floats are allowed.
    // FIXME: This check doesn't seem possibly correct -- what if the loop is
    // an integer loop and the vector instructions selected are purely integer
@@ -8015,11 +8013,6 @@ bool LoopVectorizePass::runImpl(
    DB = &DB_;
    ORE = &ORE_;
  
-  // Compute some weights outside of the loop over the loops. Compute this
-  // using a BranchProbability to re-use its scaling math.
-  const BranchProbability ColdProb(1, 5); // 20%
-  ColdEntryFreq = BlockFrequency(BFI->getEntryFreq()) * ColdProb;
-
    // Don't attempt if
    // 1. the target claims to have no vector registers, and
    // 2. interleaving won't help ILP.
diff --git a/test/Transforms/LoopVectorize/X86/small-size.ll b/test/Transforms/LoopVectorize/X86/small-size.ll

index 47c262b11b4633916043f273932c71d36056f0fa..89d69e232f5b524042e885075682bc594a7ce5d6 100644 (file)
--- a/test/Transforms/LoopVectorize/X86/small-size.ll
+++ b/test/Transforms/LoopVectorize/X86/small-size.ll
@@ -115,32 +115,6 @@ define void @example3(i32 %n, i32* noalias nocapture %p, i32* noalias nocapture
    ret void
  }
  
-; N is unknown, we need a tail. Can't vectorize because the loop is cold.
-;CHECK-LABEL: @example4(
-;CHECK-NOT: <4 x i32>
-;CHECK: ret void
-define void @example4(i32 %n, i32* noalias nocapture %p, i32* noalias nocapture %q) {
-  %1 = icmp eq i32 %n, 0
-  br i1 %1, label %._crit_edge, label %.lr.ph, !prof !0
-
-.lr.ph:                                           ; preds = %0, %.lr.ph
-  %.05 = phi i32 [ %2, %.lr.ph ], [ %n, %0 ]
-  %.014 = phi i32* [ %5, %.lr.ph ], [ %p, %0 ]
-  %.023 = phi i32* [ %3, %.lr.ph ], [ %q, %0 ]
-  %2 = add nsw i32 %.05, -1
-  %3 = getelementptr inbounds i32, i32* %.023, i64 1
-  %4 = load i32, i32* %.023, align 16
-  %5 = getelementptr inbounds i32, i32* %.014, i64 1
-  store i32 %4, i32* %.014, align 16
-  %6 = icmp eq i32 %2, 0
-  br i1 %6, label %._crit_edge, label %.lr.ph
-
-._crit_edge:                                      ; preds = %.lr.ph, %0
-  ret void
-}
-
-!0 = !{!"branch_weights", i32 64, i32 4}
-
  ; We can't vectorize this one because we need a runtime ptr check.
  ;CHECK-LABEL: @example23(
  ;CHECK-NOT: <4 x i32>
diff --git a/test/Transforms/LoopVectorize/tripcount.ll b/test/Transforms/LoopVectorize/tripcount.ll

new file mode 100644 (file)

index 0000000..03b3aa1
--- /dev/null
+++ b/test/Transforms/LoopVectorize/tripcount.ll
@@ -0,0 +1,91 @@
+; This test verifies that the loop vectorizer will not vectorizes low trip count
+; loops that require runtime checks (Trip count is computed with profile info).
+; REQUIRES: asserts
+; RUN: opt < %s -loop-vectorize -loop-vectorize-with-block-frequency -S | FileCheck %s
+
+target datalayout = "E-m:e-p:32:32-i64:32-f64:32:64-a:0:32-n32-S128"
+
+@tab = common global [32 x i8] zeroinitializer, align 1
+
+define i32 @foo_low_trip_count1(i32 %bound) {
+; Simple loop with low tripcount. Should not be vectorized.
+
+; CHECK-LABEL: @foo_low_trip_count1(
+; CHECK-NOT: <{{[0-9]+}} x i8>
+
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08
+  %0 = load i8, i8* %arrayidx, align 1
+  %cmp1 = icmp eq i8 %0, 0
+  %. = select i1 %cmp1, i8 2, i8 1
+  store i8 %., i8* %arrayidx, align 1
+  %inc = add nsw i32 %i.08, 1
+  %exitcond = icmp eq i32 %i.08, %bound
+  br i1 %exitcond, label %for.end, label %for.body, !prof !1
+
+for.end:                                          ; preds = %for.body
+  ret i32 0
+}
+
+define i32 @foo_low_trip_count2(i32 %bound) !prof !0 {
+; The loop has a same invocation count with the function, but has a low
+; trip_count per invocation and not worth to vectorize.
+
+; CHECK-LABEL: @foo_low_trip_count2(
+; CHECK-NOT: <{{[0-9]+}} x i8>
+
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08
+  %0 = load i8, i8* %arrayidx, align 1
+  %cmp1 = icmp eq i8 %0, 0
+  %. = select i1 %cmp1, i8 2, i8 1
+  store i8 %., i8* %arrayidx, align 1
+  %inc = add nsw i32 %i.08, 1
+  %exitcond = icmp eq i32 %i.08, %bound
+  br i1 %exitcond, label %for.end, label %for.body, !prof !1
+
+for.end:                                          ; preds = %for.body
+  ret i32 0
+}
+
+define i32 @foo_low_trip_count3(i1 %cond, i32 %bound) !prof !0 {
+; The loop has low invocation count compare to the function invocation count, 
+; but has a high trip count per invocation. Vectorize it.
+
+; CHECK-LABEL: @foo_low_trip_count3(
+; CHECK: vector.body:
+
+entry:
+  br i1 %cond, label %for.preheader, label %for.end, !prof !2
+
+for.preheader:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.08 = phi i32 [ 0, %for.preheader ], [ %inc, %for.body ]
+  %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08
+  %0 = load i8, i8* %arrayidx, align 1
+  %cmp1 = icmp eq i8 %0, 0
+  %. = select i1 %cmp1, i8 2, i8 1
+  store i8 %., i8* %arrayidx, align 1
+  %inc = add nsw i32 %i.08, 1
+  %exitcond = icmp eq i32 %i.08, %bound
+  br i1 %exitcond, label %for.end, label %for.body, !prof !3
+
+for.end:                                          ; preds = %for.body
+  ret i32 0
+}
+
+
+!0 = !{!"function_entry_count", i64 100}
+!1 = !{!"branch_weights", i32 100, i32 0}
+!2 = !{!"branch_weights", i32 10, i32 90}
+!3 = !{!"branch_weights", i32 10, i32 10000}
author	Taewook Oh <twoh@fb.com>
	Mon, 19 Jun 2017 18:48:58 +0000 (18:48 +0000)
committer	Taewook Oh <twoh@fb.com>
	Mon, 19 Jun 2017 18:48:58 +0000 (18:48 +0000)
include/llvm/Transforms/Vectorize/LoopVectorize.h		patch \| blob \| history
lib/Transforms/Vectorize/LoopVectorize.cpp		patch \| blob \| history
test/Transforms/LoopVectorize/X86/small-size.ll		patch \| blob \| history
test/Transforms/LoopVectorize/tripcount.ll	[new file with mode: 0644]	patch \| blob