[LV] Run loop-simplify and LCSSA explicitly instead of "requiring" them

author Michael Kuperstein <mkuper@google.com>

Thu, 19 Jan 2017 00:42:28 +0000 (00:42 +0000)

committer Michael Kuperstein <mkuper@google.com>

Thu, 19 Jan 2017 00:42:28 +0000 (00:42 +0000)
author Michael Kuperstein <mkuper@google.com>
Thu, 19 Jan 2017 00:42:28 +0000 (00:42 +0000)
committer Michael Kuperstein <mkuper@google.com>
Thu, 19 Jan 2017 00:42:28 +0000 (00:42 +0000)
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp

index 18cfef026fb0a8fa3da8c1b2ef607e3d934185e4..71424b2bac207235a68353b469c04cd38e1338ef 100644 (file)
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -92,6 +92,7 @@
  #include "llvm/Transforms/Scalar.h"
  #include "llvm/Transforms/Utils/BasicBlockUtils.h"
  #include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopSimplify.h"
  #include "llvm/Transforms/Utils/LoopUtils.h"
  #include "llvm/Transforms/Utils/LoopVersioning.h"
  #include "llvm/Transforms/Vectorize.h"
@@ -2134,8 +2135,6 @@ struct LoopVectorize : public FunctionPass {
  
    void getAnalysisUsage(AnalysisUsage &AU) const override {
      AU.addRequired<AssumptionCacheTracker>();
-    AU.addRequiredID(LoopSimplifyID);
-    AU.addRequiredID(LCSSAID);
      AU.addRequired<BlockFrequencyInfoWrapperPass>();
      AU.addRequired<DominatorTreeWrapperPass>();
      AU.addRequired<LoopInfoWrapperPass>();
@@ -7169,9 +7168,7 @@ INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
  INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
  INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
  INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LCSSAWrapperPass)
  INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
  INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
  INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
  INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
@@ -7543,6 +7540,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
      DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
    }
  
+  formLCSSARecursively(*L, *DT, LI, SE);
+
    using namespace ore;
    if (!VectorizeLoop) {
      assert(IC > 1 && "interleave count should not be 1 or 0");
@@ -7618,6 +7617,16 @@ bool LoopVectorizePass::runImpl(
    if (!TTI->getNumberOfRegisters(true) && TTI->getMaxInterleaveFactor(1) < 2)
      return false;
  
+  bool Changed = false;
+
+  // The vectorizer requires loops to be in simplified form.
+  // Since simplification may add new inner loops, it has to run before the
+  // legality and profitability checks. This means running the loop vectorizer
+  // will simplify all loops, regardless of whether anything ends up being
+  // vectorized.
+  for (auto &L : *LI)
+    Changed |= simplifyLoop(L, DT, LI, SE, AC, false /* PreserveLCSSA */);
+
    // Build up a worklist of inner-loops to vectorize. This is necessary as
    // the act of vectorizing or partially unrolling a loop creates new loops
    // and can invalidate iterators across the loops.
@@ -7629,7 +7638,6 @@ bool LoopVectorizePass::runImpl(
    LoopsAnalyzed += Worklist.size();
  
    // Now walk the identified inner loops.
-  bool Changed = false;
    while (!Worklist.empty())
      Changed |= processLoop(Worklist.pop_back_val());
  
diff --git a/test/Transforms/LoopVectorize/partial-lcssa.ll b/test/Transforms/LoopVectorize/partial-lcssa.ll

new file mode 100644 (file)

index 0000000..1306ed9
--- /dev/null
+++ b/test/Transforms/LoopVectorize/partial-lcssa.ll
@@ -0,0 +1,54 @@
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S | FileCheck %s
+; We vectorize the inner loop, so we have to put it in LCSSA form.
+; However, there's no reason to touch the outer loop.
+
+; CHECK-LABEL: @foo
+; CHECK-LABEL: for.end.inner.loopexit:
+; CHECK: %[[LCSSAPHI:.*]] = phi i64 [ %indvars.iv, %for.body.inner ], [ %{{.*}}, %middle.block ]
+; CHECK: store i64 %[[LCSSAPHI]], i64* %O1, align 4
+; CHECK-LABEL: for.end.outer.loopexit
+; CHECK: store i64 %indvars.outer, i64* %O2, align 4
+
+
+define i64 @foo(i32* nocapture %A, i32* nocapture %B, i64 %n, i64 %m, i64* %O1, i64* %O2) {
+entry:
+  %cmp = icmp sgt i64 %n, 0
+  br i1 %cmp, label %for.body.outer.preheader, label %for.end.outer
+
+for.body.outer.preheader:                         ; preds = %entry
+  br label %for.body.outer
+
+for.body.outer:                                   ; preds = %for.body.outer.preheader, %for.end.inner
+  %indvars.outer = phi i64 [ %indvars.outer.next, %for.end.inner ], [ 0, %for.body.outer.preheader ]
+  %cmp2 = icmp sgt i64 %m, 0
+  br i1 %cmp2, label %for.body.inner.preheader, label %for.end.inner
+
+for.body.inner.preheader:                         ; preds = %for.body.outer
+  br label %for.body.inner
+
+for.body.inner:                                   ; preds = %for.body.inner.preheader, %for.body.inner
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body.inner ], [ 0, %for.body.inner.preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  %v = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
+  store i32 %v, i32* %arrayidx2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv, %n
+  br i1 %exitcond, label %for.end.inner.loopexit, label %for.body.inner
+
+for.end.inner.loopexit:                           ; preds = %for.body.inner
+  store i64 %indvars.iv, i64 *%O1, align 4
+  br label %for.end.inner
+
+for.end.inner:                                    ; preds = %for.end.inner.loopexit, %for.body.outer
+  %indvars.outer.next = add i64 %indvars.outer, 1
+  %exitcond.outer = icmp eq i64 %indvars.outer, %m
+  br i1 %exitcond.outer, label %for.end.outer.loopexit, label %for.body.outer
+
+for.end.outer.loopexit:                           ; preds = %for.end.inner
+  store i64 %indvars.outer, i64 *%O2, align 4
+  br label %for.end.outer
+
+for.end.outer:                                    ; preds = %for.end.outer.loopexit, %entry
+  ret i64 undef
+}
diff --git a/test/Transforms/LoopVectorize/pr31190.ll b/test/Transforms/LoopVectorize/pr31190.ll

index afb1754983cd3efe832e7f158ee68cb599e9796f..1ff8b2ba7ce4ba68159b8a47d85eb0fc4e2b0deb 100644 (file)
--- a/test/Transforms/LoopVectorize/pr31190.ll
+++ b/test/Transforms/LoopVectorize/pr31190.ll
@@ -9,13 +9,6 @@
  ; Since %inc54 is the IV of the outer loop, and %0 equivalent to it,
  ; we get the situation described above.
  
-; This test uses the new PM, because with the old PM, running loop-vectorize
-; would explicitly run loop-simplify. Even though this loop is already in
-; simplified form, loop-simplify would still clean up the phi.
-; The reason this matters is that in a real optimizer pipeline, LICM can create
-; such PHIs, and since it preserves loop simplified form, the cleanup has
-; no chance to run.
-
  ; Code that leads to this situation can look something like:
  ;
  ; int a, b[1], c;
@@ -28,11 +21,14 @@
  ;
  ; The PHI is an artifact of the register promotion of c.
  
+; Note that we can no longer get the vectorizer to actually see such PHIs,
+; because LV now simplifies the loop internally, but the test is still
+; useful as a regression test, and in case loop-simplify behavior changes.
+
  @c = external global i32, align 4
  @a = external global i32, align 4
  @b = external global [1 x i32], align 4
  
-; CHECK: LV: PHI is a recurrence with respect to an outer loop.
  ; CHECK: LV: Not vectorizing: Cannot prove legality.
  ; CHECK-LABEL: @test
  define void @test() {
author	Michael Kuperstein <mkuper@google.com>
	Thu, 19 Jan 2017 00:42:28 +0000 (00:42 +0000)
committer	Michael Kuperstein <mkuper@google.com>
	Thu, 19 Jan 2017 00:42:28 +0000 (00:42 +0000)
lib/Transforms/Vectorize/LoopVectorize.cpp		patch \| blob \| history
test/Transforms/LoopVectorize/partial-lcssa.ll	[new file with mode: 0644]	patch \| blob
test/Transforms/LoopVectorize/pr31190.ll		patch \| blob \| history