[LV] Optimize for size when vectorizing loops with tiny trip count

author Ayal Zaks <ayal.zaks@intel.com>

Fri, 30 Jun 2017 08:02:35 +0000 (08:02 +0000)

committer Ayal Zaks <ayal.zaks@intel.com>

Fri, 30 Jun 2017 08:02:35 +0000 (08:02 +0000)
author Ayal Zaks <ayal.zaks@intel.com>
Fri, 30 Jun 2017 08:02:35 +0000 (08:02 +0000)
committer Ayal Zaks <ayal.zaks@intel.com>
Fri, 30 Jun 2017 08:02:35 +0000 (08:02 +0000)
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp

index d1ac444bb5bf1c48832361e9b35b13fbd7de7f3d..0cf2b9f15a844a5ae314ac71cdd44f1cc05a3600 100644 (file)
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -114,12 +114,13 @@ static cl::opt<bool>
      EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden,
                         cl::desc("Enable if-conversion during vectorization."));
  
-/// We don't vectorize loops with a known constant trip count below this number.
+/// Loops with a known constant trip count below this number are vectorized only
+/// if no scalar iteration overheads are incurred.
  static cl::opt<unsigned> TinyTripCountVectorThreshold(
      "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
-    cl::desc("Don't vectorize loops with a constant "
-             "trip count that is smaller than this "
-             "value."));
+    cl::desc("Loops with a constant trip count that is smaller than this "
+             "value are vectorized only if no scalar iteration overheads "
+             "are incurred."));
  
  static cl::opt<bool> MaximizeBandwidth(
      "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
@@ -7801,8 +7802,25 @@ bool LoopVectorizePass::processLoop(Loop *L) {
      return false;
    }
  
-  // Check the loop for a trip count threshold:
-  // do not vectorize loops with a tiny trip count.
+  PredicatedScalarEvolution PSE(*SE, *L);
+
+  // Check if it is legal to vectorize the loop.
+  LoopVectorizationRequirements Requirements(*ORE);
+  LoopVectorizationLegality LVL(L, PSE, DT, TLI, AA, F, TTI, GetLAA, LI, ORE,
+                                &Requirements, &Hints);
+  if (!LVL.canVectorize()) {
+    DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
+    emitMissedWarning(F, L, Hints, ORE);
+    return false;
+  }
+
+  // Check the function attributes to find out if this function should be
+  // optimized for size.
+  bool OptForSize =
+      Hints.getForce() != LoopVectorizeHints::FK_Enabled && F->optForSize();
+
+  // Check the loop for a trip count threshold: vectorize loops with a tiny trip
+  // count by optimizing for size, to minimize overheads.
    unsigned ExpectedTC = SE->getSmallConstantMaxTripCount(L);
    bool HasExpectedTC = (ExpectedTC > 0);
  
@@ -7816,36 +7834,19 @@ bool LoopVectorizePass::processLoop(Loop *L) {
  
    if (HasExpectedTC && ExpectedTC < TinyTripCountVectorThreshold) {
      DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
-                 << "This loop is not worth vectorizing.");
+                 << "This loop is worth vectorizing only if no scalar "
+                 << "iteration overheads are incurred.");
      if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
        DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
      else {
        DEBUG(dbgs() << "\n");
-      ORE->emit(createMissedAnalysis(Hints.vectorizeAnalysisPassName(),
-                                     "NotBeneficial", L)
-                << "vectorization is not beneficial "
-                   "and is not explicitly forced");
-      return false;
+      // Loops with a very small trip count are considered for vectorization
+      // under OptForSize, thereby making sure the cost of their loop body is
+      // dominant, free of runtime guards and scalar iteration overheads.
+      OptForSize = true;
      }
    }
  
-  PredicatedScalarEvolution PSE(*SE, *L);
-
-  // Check if it is legal to vectorize the loop.
-  LoopVectorizationRequirements Requirements(*ORE);
-  LoopVectorizationLegality LVL(L, PSE, DT, TLI, AA, F, TTI, GetLAA, LI, ORE,
-                                &Requirements, &Hints);
-  if (!LVL.canVectorize()) {
-    DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
-    emitMissedWarning(F, L, Hints, ORE);
-    return false;
-  }
-
-  // Check the function attributes to find out if this function should be
-  // optimized for size.
-  bool OptForSize =
-      Hints.getForce() != LoopVectorizeHints::FK_Enabled && F->optForSize();
-
    // Check the function attributes to see if implicit floats are allowed.
    // FIXME: This check doesn't seem possibly correct -- what if the loop is
    // an integer loop and the vector instructions selected are purely integer
diff --git a/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll b/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll

index 8d139ac7e5af010457c71e0302fcfa1d1f99f0da..46fd022af6653cf162448528527f5bc3139ccef0 100644 (file)
--- a/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll
+++ b/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll
@@ -3,10 +3,11 @@
  
  ; CHECK: LV: Loop hints: force=enabled
  ; CHECK: LV: Loop hints: force=?
+; CHECK: LV: Loop hints: force=?
  ; No more loops in the module
  ; CHECK-NOT: LV: Loop hints: force=
-; CHECK: 2 loop-vectorize               - Number of loops analyzed for vectorization
-; CHECK: 1 loop-vectorize               - Number of loops vectorized
+; CHECK: 3 loop-vectorize               - Number of loops analyzed for vectorization
+; CHECK: 2 loop-vectorize               - Number of loops vectorized
  
  target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
  target triple = "x86_64-apple-macosx10.8.0"
@@ -71,3 +72,29 @@ for.end:
  
  !3 = !{!3}
  
+;
+; This loop is vectorized although its trip count is below the threshold,
+; because no scalar iterations are needed.
+;
+define void @vectorized2(float* noalias nocapture %A, float* noalias nocapture readonly %B) {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %B, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4, !llvm.mem.parallel_loop_access !3
+  %arrayidx2 = getelementptr inbounds float, float* %A, i64 %indvars.iv
+  %1 = load float, float* %arrayidx2, align 4, !llvm.mem.parallel_loop_access !3
+  %add = fadd fast float %0, %1
+  store float %add, float* %arrayidx2, align 4, !llvm.mem.parallel_loop_access !3
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 16
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !4
+
+for.end:
+  ret void
+}
+
+!4 = !{!4}
+
diff --git a/test/Transforms/LoopVectorize/small-loop.ll b/test/Transforms/LoopVectorize/small-loop.ll

index 9a5dc4aa1b7465574d7dbe6a68a61e6e6ad8216c..378283b464b9903f187d3a0605e52ff961eaefad 100644 (file)
--- a/test/Transforms/LoopVectorize/small-loop.ll
+++ b/test/Transforms/LoopVectorize/small-loop.ll
@@ -7,7 +7,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
  @c = common global [2048 x i32] zeroinitializer, align 16
  
  ;CHECK-LABEL: @example1(
-;CHECK-NOT: load <4 x i32>
+;CHECK: load <4 x i32>
  ;CHECK: ret void
  define void @example1() nounwind uwtable ssp {
    br label %1
@@ -23,8 +23,8 @@ define void @example1() nounwind uwtable ssp {
    store i32 %6, i32* %7, align 4
    %indvars.iv.next = add i64 %indvars.iv, 1
    %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, 8  ;   <-----  A really small trip count.
-  br i1 %exitcond, label %8, label %1
+  %exitcond = icmp eq i32 %lftr.wideiv, 8  ;   <-----  A really small trip count
+  br i1 %exitcond, label %8, label %1      ;           w/o scalar iteration overhead.
  
  ; <label>:8                                       ; preds = %1
    ret void
author	Ayal Zaks <ayal.zaks@intel.com>
	Fri, 30 Jun 2017 08:02:35 +0000 (08:02 +0000)
committer	Ayal Zaks <ayal.zaks@intel.com>
	Fri, 30 Jun 2017 08:02:35 +0000 (08:02 +0000)
lib/Transforms/Vectorize/LoopVectorize.cpp		patch \| blob \| history
test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll		patch \| blob \| history
test/Transforms/LoopVectorize/small-loop.ll		patch \| blob \| history