#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Vectorize.h"
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<AssumptionCacheTracker>();
- AU.addRequiredID(LoopSimplifyID);
- AU.addRequiredID(LCSSAID);
AU.addRequired<BlockFrequencyInfoWrapperPass>();
AU.addRequired<DominatorTreeWrapperPass>();
AU.addRequired<LoopInfoWrapperPass>();
INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LCSSAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
}
+ formLCSSARecursively(*L, *DT, LI, SE);
+
using namespace ore;
if (!VectorizeLoop) {
assert(IC > 1 && "interleave count should not be 1 or 0");
if (!TTI->getNumberOfRegisters(true) && TTI->getMaxInterleaveFactor(1) < 2)
return false;
+ bool Changed = false;
+
+ // The vectorizer requires loops to be in simplified form.
+ // Since simplification may add new inner loops, it has to run before the
+ // legality and profitability checks. This means running the loop vectorizer
+ // will simplify all loops, regardless of whether anything ends up being
+ // vectorized.
+ for (auto &L : *LI)
+ Changed |= simplifyLoop(L, DT, LI, SE, AC, false /* PreserveLCSSA */);
+
// Build up a worklist of inner-loops to vectorize. This is necessary as
// the act of vectorizing or partially unrolling a loop creates new loops
// and can invalidate iterators across the loops.
LoopsAnalyzed += Worklist.size();
// Now walk the identified inner loops.
- bool Changed = false;
while (!Worklist.empty())
Changed |= processLoop(Worklist.pop_back_val());
--- /dev/null
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S | FileCheck %s
+; We vectorize the inner loop, so we have to put it in LCSSA form.
+; However, there's no reason to touch the outer loop.
+
+; CHECK-LABEL: @foo
+; CHECK-LABEL: for.end.inner.loopexit:
+; CHECK: %[[LCSSAPHI:.*]] = phi i64 [ %indvars.iv, %for.body.inner ], [ %{{.*}}, %middle.block ]
+; CHECK: store i64 %[[LCSSAPHI]], i64* %O1, align 4
+; CHECK-LABEL: for.end.outer.loopexit
+; CHECK: store i64 %indvars.outer, i64* %O2, align 4
+
+
+define i64 @foo(i32* nocapture %A, i32* nocapture %B, i64 %n, i64 %m, i64* %O1, i64* %O2) { ; two-level loop nest; each induction variable is stored after its own loop exits
+entry:
+ %cmp = icmp sgt i64 %n, 0 ; the whole nest is skipped unless %n > 0
+ br i1 %cmp, label %for.body.outer.preheader, label %for.end.outer
+
+for.body.outer.preheader: ; preds = %entry
+ br label %for.body.outer
+
+for.body.outer: ; preds = %for.body.outer.preheader, %for.end.inner
+ %indvars.outer = phi i64 [ %indvars.outer.next, %for.end.inner ], [ 0, %for.body.outer.preheader ]
+ %cmp2 = icmp sgt i64 %m, 0 ; the inner loop is entered only if %m > 0
+ br i1 %cmp2, label %for.body.inner.preheader, label %for.end.inner
+
+for.body.inner.preheader: ; preds = %for.body.outer
+ br label %for.body.inner
+
+for.body.inner: ; preds = %for.body.inner.preheader, %for.body.inner
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body.inner ], [ 0, %for.body.inner.preheader ]
+ %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+ %v = load i32, i32* %arrayidx, align 4 ; read A[iv]
+ %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
+ store i32 %v, i32* %arrayidx2, align 4 ; write it to B[iv]
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv, %n ; compares the pre-increment IV against %n
+ br i1 %exitcond, label %for.end.inner.loopexit, label %for.body.inner
+
+for.end.inner.loopexit: ; preds = %for.body.inner
+ store i64 %indvars.iv, i64 *%O1, align 4 ; inner IV is used here, outside the inner loop
+ br label %for.end.inner
+
+for.end.inner: ; preds = %for.end.inner.loopexit, %for.body.outer
+ %indvars.outer.next = add i64 %indvars.outer, 1
+ %exitcond.outer = icmp eq i64 %indvars.outer, %m ; compares the pre-increment outer IV against %m
+ br i1 %exitcond.outer, label %for.end.outer.loopexit, label %for.body.outer
+
+for.end.outer.loopexit: ; preds = %for.end.inner
+ store i64 %indvars.outer, i64 *%O2, align 4 ; outer IV is used here, outside the outer loop
+ br label %for.end.outer
+
+for.end.outer: ; preds = %for.end.outer.loopexit, %entry
+ ret i64 undef ; the return value is undef; only the stores above are observable
+}
; Since %inc54 is the IV of the outer loop, and %0 is equivalent to it,
; we get the situation described above.
-; This test uses the new PM, because with the old PM, running loop-vectorize
-; would explicitly run loop-simplify. Even though this loop is already in
-; simplified form, loop-simplify would still clean up the phi.
-; The reason this matters is that in a real optimizer pipeline, LICM can create
-; such PHIs, and since it preserves loop simplified form, the cleanup has
-; no chance to run.
-
; Code that leads to this situation can look something like:
;
; int a, b[1], c;
;
; The PHI is an artifact of the register promotion of c.
+; Note that we can no longer get the vectorizer to actually see such PHIs,
+; because LV now simplifies the loop internally, but the test is still
+; useful as a regression test, and in case loop-simplify behavior changes.
+
@c = external global i32, align 4
@a = external global i32, align 4
@b = external global [1 x i32], align 4
-; CHECK: LV: PHI is a recurrence with respect to an outer loop.
; CHECK: LV: Not vectorizing: Cannot prove legality.
; CHECK-LABEL: @test
define void @test() {