/// respective conditions.
void predicateInstructions();
+ /// Collect the instructions from the original loop that would be trivially
+ /// dead in the vectorized loop if generated.
+ void collectTriviallyDeadInstructions();
+
/// Shrinks vector element sizes to the smallest bitwidth they can be legally
/// represented as.
void truncateToMinimalBitwidths();
// Record whether runtime checks are added.
bool AddedSafetyChecks;
+
+ // Holds instructions from the original loop whose counterparts in the
+ // vectorized loop would be trivially dead if generated. For example,
+ // original induction update instructions can become dead because we
+ // separately emit induction "steps" when generating code for the new loop.
+ // Similarly, we create a new latch condition when setting up the structure
+ // of the new loop, so the old one can become dead.
+ SmallPtrSet<Instruction *, 4> DeadInstructions;
};
class InnerLoopUnroller : public InnerLoopVectorizer {
// are vectorized, so we can use them to construct the PHI.
PhiVector PHIsToFix;
+ // Collect instructions from the original loop that will become trivially
+ // dead in the vectorized loop. We don't need to vectorize these
+ // instructions.
+ collectTriviallyDeadInstructions();
+
// Scan the loop in a topological order to ensure that defs are vectorized
// before users.
LoopBlocksDFS DFS(OrigLoop);
}
}
+void InnerLoopVectorizer::collectTriviallyDeadInstructions() {
+ BasicBlock *Latch = OrigLoop->getLoopLatch();
+
+ // We create new control-flow for the vectorized loop, so the original
+ // condition will be dead after vectorization if it's only used by the
+ // branch.
+ auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
+ if (Cmp && Cmp->hasOneUse())
+ DeadInstructions.insert(Cmp);
+
+ // We create new "steps" for induction variable updates to which the original
+ // induction variables map. An original update instruction will be dead if
+ // all its users except the induction variable are dead.
+ for (auto &Induction : *Legal->getInductionVars()) {
+ PHINode *Ind = Induction.first;
+ auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
+ if (all_of(IndUpdate->users(), [&](User *U) -> bool {
+ return U == Ind || DeadInstructions.count(cast<Instruction>(U));
+ }))
+ DeadInstructions.insert(IndUpdate);
+ }
+}
+
void InnerLoopVectorizer::predicateInstructions() {
// For each instruction I marked for predication on value C, split I into its
// For each instruction in the old loop.
for (Instruction &I : *BB) {
+ // If the instruction will become trivially dead when vectorized, we don't
+ // need to generate it.
+ if (DeadInstructions.count(&I))
+ continue;
+
// Scalarize instructions that should remain scalar after vectorization.
if (!(isa<BranchInst>(&I) || isa<PHINode>(&I) ||
isa<DbgInfoIntrinsic>(&I)) &&
--- /dev/null
+; RUN: opt < %s -force-vector-width=2 -force-vector-interleave=2 -loop-vectorize -S | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+
+; CHECK-LABEL: @dead_instructions_01
+;
+; This test ensures that we don't generate trivially dead instructions prior to
+; instruction simplification. We don't need to generate instructions
+; corresponding to the original induction variable update or branch condition,
+; since we rewrite the loop structure.
+;
+; CHECK: vector.body:
+; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECK: %[[I0:.+]] = add i64 %index, 0
+; CHECK: %[[I2:.+]] = add i64 %index, 2
+; CHECK: getelementptr inbounds i64, i64* %a, i64 %[[I0]]
+; CHECK: getelementptr inbounds i64, i64* %a, i64 %[[I2]]
+; CHECK-NOT: add nuw nsw i64 %[[I0]], 1
+; CHECK-NOT: add nuw nsw i64 %[[I2]], 1
+; CHECK-NOT: icmp slt i64 {{.*}}, %n
+; CHECK: %index.next = add i64 %index, 4
+; CHECK: %[[CMP:.+]] = icmp eq i64 %index.next, %n.vec
+; CHECK: br i1 %[[CMP]], label %middle.block, label %vector.body
+;
+define i64 @dead_instructions_01(i64 *%a, i64 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+ %r = phi i64 [ %tmp2, %for.body ], [ 0, %entry ]
+ %tmp0 = getelementptr inbounds i64, i64* %a, i64 %i
+ %tmp1 = load i64, i64* %tmp0, align 8
+ %tmp2 = add i64 %tmp1, %r
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ %tmp3 = phi i64 [ %tmp2, %for.body ]
+ ret i64 %tmp3
+}