/// See PR14725.
void fixLCSSAPHIs();
+ /// Iteratively sink the scalarized operands of a predicated instruction into
+ /// the block that was created for it. An operand is sunk only when all of
+ /// its uses occur in that block; the operands of every sunk instruction are
+ /// then considered for sinking in turn.
+ void sinkScalarOperands(Instruction *PredInst);
+
/// Predicate conditional instructions that require predication on their
/// respective conditions.
void predicateInstructions();
}
}
+void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
+
+  // The basic block and loop containing the predicated instruction.
+  auto *PredBB = PredInst->getParent();
+  auto *VectorLoop = LI->getLoopFor(PredBB);
+
+  // Initialize a worklist with the operands of the predicated instruction.
+  SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
+
+  // Holds instructions that we need to analyze again. An instruction may be
+  // reanalyzed if we don't yet know if we can sink it or not.
+  SmallVector<Instruction *, 8> InstsToReanalyze;
+
+  // Returns true if a given use occurs in the predicated block. Phi nodes use
+  // their operands in their corresponding predecessor blocks.
+  auto isBlockOfUsePredicated = [&](Use &U) -> bool {
+    auto *I = cast<Instruction>(U.getUser());
+    BasicBlock *BB = I->getParent();
+    if (auto *Phi = dyn_cast<PHINode>(I))
+      BB = Phi->getIncomingBlock(
+          PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
+    return BB == PredBB;
+  };
+
+  // Iteratively sink the scalarized operands of the predicated instruction
+  // into the block we created for it. When an instruction is sunk, its
+  // operands are then added to the worklist. The algorithm ends after one pass
+  // through the worklist doesn't sink a single instruction.
+  bool Changed;
+  do {
+
+    // Add the instructions that need to be reanalyzed to the worklist, and
+    // reset the changed indicator.
+    Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
+    InstsToReanalyze.clear();
+    Changed = false;
+
+    while (!Worklist.empty()) {
+      auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
+
+      // We can't sink an instruction if it is a phi node, is already in the
+      // predicated block, is not in the loop, may have side effects, or may
+      // read from memory. No alias analysis is performed here, so moving a
+      // memory read into the predicated block could carry it across an
+      // intervening write; conservatively leave such instructions in place.
+      if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
+          !VectorLoop->contains(I) || I->mayHaveSideEffects() ||
+          I->mayReadFromMemory())
+        continue;
+
+      // It's legal to sink the instruction if all its uses occur in the
+      // predicated block. Otherwise, there's nothing to do yet, and we may
+      // need to reanalyze the instruction.
+      if (!all_of(I->uses(), isBlockOfUsePredicated)) {
+        InstsToReanalyze.push_back(I);
+        continue;
+      }
+
+      // Move the instruction to the beginning of the predicated block, and add
+      // its operands to the worklist.
+      I->moveBefore(&*PredBB->getFirstInsertionPt());
+      Worklist.insert(I->op_begin(), I->op_end());
+
+      // The sinking may have enabled other instructions to be sunk, so we will
+      // need to iterate.
+      Changed = true;
+    }
+  } while (Changed);
+}
+
void InnerLoopVectorizer::predicateInstructions() {
// For each instruction I marked for predication on value C, split I into its
- // own basic block to form an if-then construct over C.
- // Since I may be fed by extractelement and/or be feeding an insertelement
- // generated during scalarization we try to move such instructions into the
- // predicated basic block as well. For the insertelement this also means that
- // the PHI will be created for the resulting vector rather than for the
- // scalar instruction.
+ // own basic block to form an if-then construct over C. Since I may be fed by
+ // an extractelement instruction or other scalar operand, we try to
+ // iteratively sink its scalar operands into the predicated block. If I feeds
+ // an insertelement instruction, we try to move this instruction into the
+ // predicated block as well. For non-void types, a phi node will be created
+ // for the resulting value (either vector or scalar).
+ //
// So for some predicated instruction, e.g. the conditional sdiv in:
//
// for.body:
auto *T = SplitBlockAndInsertIfThen(KV.second, &*I, /*Unreachable=*/false,
/*BranchWeights=*/nullptr, DT, LI);
I->moveBefore(T);
- // Try to move any extractelement we may have created for the predicated
- // instruction into the Then block.
- for (Use &Op : I->operands()) {
- auto *OpInst = dyn_cast<ExtractElementInst>(&*Op);
- if (OpInst && OpInst->hasOneUse()) // TODO: more accurately - hasOneUser()
- OpInst->moveBefore(&*I);
- }
+ sinkScalarOperands(&*I);
I->getParent()->setName(Twine("pred.") + I->getOpcodeName() + ".if");
BB->setName(Twine("pred.") + I->getOpcodeName() + ".continue");
; INTER-NOT: LV: Found uniform instruction: %tmp0 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 0
; INTER: vector.body
; INTER: %index = phi i64 [ 0, %vector.ph ], [ %index.next, {{.*}} ]
-; INTER: %[[I1:.+]] = or i64 %index, 1
-; INTER: %[[I2:.+]] = or i64 %index, 2
-; INTER: %[[I3:.+]] = or i64 %index, 3
; INTER: %[[G0:.+]] = getelementptr inbounds %pair, %pair* %p, i64 %index, i32 0
+; INTER: %[[B0:.+]] = bitcast i32* %[[G0]] to <8 x i32>*
+; INTER: %wide.vec = load <8 x i32>, <8 x i32>* %[[B0]], align 8
+; INTER: %[[I1:.+]] = or i64 %index, 1
; INTER: getelementptr inbounds %pair, %pair* %p, i64 %[[I1]], i32 0
+; INTER: %[[I2:.+]] = or i64 %index, 2
; INTER: getelementptr inbounds %pair, %pair* %p, i64 %[[I2]], i32 0
+; INTER: %[[I3:.+]] = or i64 %index, 3
; INTER: getelementptr inbounds %pair, %pair* %p, i64 %[[I3]], i32 0
-; INTER: %[[B0:.+]] = bitcast i32* %[[G0]] to <8 x i32>*
-; INTER: %wide.vec = load <8 x i32>, <8 x i32>* %[[B0]], align 8
; INTER: br i1 {{.*}}, label %middle.block, label %vector.body
;
define void @predicated_store(%pair *%p, i32 %x, i64 %n) {
; VEC-LABEL: test
; VEC: %[[v0:.+]] = add i64 %index, 0
-; VEC: %[[v1:.+]] = add i64 %index, 1
-; VEC: %[[v2:.+]] = getelementptr inbounds i32, i32* %f, i64 %[[v0]]
-; VEC: %[[v4:.+]] = getelementptr inbounds i32, i32* %f, i64 %[[v1]]
; VEC: %[[v8:.+]] = icmp sgt <2 x i32> %{{.*}}, <i32 100, i32 100>
; VEC: %[[v9:.+]] = add nsw <2 x i32> %{{.*}}, <i32 20, i32 20>
; VEC: %[[v10:.+]] = and <2 x i1> %[[v8]], <i1 true, i1 true>
;
; VEC: [[cond]]:
; VEC: %[[v13:.+]] = extractelement <2 x i32> %[[v9]], i32 0
+; VEC: %[[v2:.+]] = getelementptr inbounds i32, i32* %f, i64 %[[v0]]
; VEC: store i32 %[[v13]], i32* %[[v2]], align 4
; VEC: br label %[[else:.+]]
;
;
; VEC: [[cond2]]:
; VEC: %[[v17:.+]] = extractelement <2 x i32> %[[v9]], i32 1
+; VEC: %[[v1:.+]] = add i64 %index, 1
+; VEC: %[[v4:.+]] = getelementptr inbounds i32, i32* %f, i64 %[[v1]]
; VEC: store i32 %[[v17]], i32* %[[v4]], align 4
; VEC: br label %[[else2:.+]]
;
; UNROLL: %[[v3:[a-zA-Z0-9]+]] = load i32, i32* %[[v1]], align 4
; UNROLL: %[[v4:[a-zA-Z0-9]+]] = icmp sgt i32 %[[v2]], 100
; UNROLL: %[[v5:[a-zA-Z0-9]+]] = icmp sgt i32 %[[v3]], 100
-; UNROLL: %[[v6:[a-zA-Z0-9]+]] = add nsw i32 %[[v2]], 20
-; UNROLL: %[[v7:[a-zA-Z0-9]+]] = add nsw i32 %[[v3]], 20
; UNROLL: %[[o1:[a-zA-Z0-9]+]] = or i1 false, %[[v4]]
; UNROLL: %[[o2:[a-zA-Z0-9]+]] = or i1 false, %[[v5]]
; UNROLL: %[[v8:[a-zA-Z0-9]+]] = icmp eq i1 %[[o1]], true
; UNROLL: br i1 %[[v8]], label %[[cond:[a-zA-Z0-9.]+]], label %[[else:[a-zA-Z0-9.]+]]
;
; UNROLL: [[cond]]:
+; UNROLL: %[[v6:[a-zA-Z0-9]+]] = add nsw i32 %[[v2]], 20
; UNROLL: store i32 %[[v6]], i32* %[[v0]], align 4
; UNROLL: br label %[[else]]
;
; UNROLL: br i1 %[[v9]], label %[[cond2:[a-zA-Z0-9.]+]], label %[[else2:[a-zA-Z0-9.]+]]
;
; UNROLL: [[cond2]]:
+; UNROLL: %[[v7:[a-zA-Z0-9]+]] = add nsw i32 %[[v3]], 20
; UNROLL: store i32 %[[v7]], i32* %[[v1]], align 4
; UNROLL: br label %[[else2]]
;
; CHECK: vector.body:
; CHECK: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %pred.udiv.continue2 ]
; CHECK: %[[I0:.+]] = add i32 %index, 0
-; CHECK: %[[I1:.+]] = add i32 %index, 1
; CHECK: getelementptr inbounds i32, i32* %a, i32 %[[I0]]
; CHECK: pred.udiv.if:
; CHECK: udiv i32 {{.*}}, %[[I0]]
; CHECK: pred.udiv.if1:
+; CHECK: %[[I1:.+]] = add i32 %index, 1
; CHECK: udiv i32 {{.*}}, %[[I1]]
;
; UNROLL-NO_IC-LABEL: @scalarize_induction_variable_05(
; UNROLL-NO-IC: vector.body:
; UNROLL-NO-IC: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %pred.udiv.continue11 ]
; UNROLL-NO-IC: %[[I0:.+]] = add i32 %index, 0
-; UNROLL-NO-IC: %[[I1:.+]] = add i32 %index, 1
; UNROLL-NO-IC: %[[I2:.+]] = add i32 %index, 2
-; UNROLL-NO-IC: %[[I3:.+]] = add i32 %index, 3
; UNROLL-NO-IC: getelementptr inbounds i32, i32* %a, i32 %[[I0]]
; UNROLL-NO-IC: getelementptr inbounds i32, i32* %a, i32 %[[I2]]
; UNROLL-NO-IC: pred.udiv.if:
; UNROLL-NO-IC: udiv i32 {{.*}}, %[[I0]]
; UNROLL-NO-IC: pred.udiv.if6:
+; UNROLL-NO-IC: %[[I1:.+]] = add i32 %index, 1
; UNROLL-NO-IC: udiv i32 {{.*}}, %[[I1]]
; UNROLL-NO-IC: pred.udiv.if8:
; UNROLL-NO-IC: udiv i32 {{.*}}, %[[I2]]
; UNROLL-NO-IC: pred.udiv.if10:
+; UNROLL-NO-IC: %[[I3:.+]] = add i32 %index, 3
; UNROLL-NO-IC: udiv i32 {{.*}}, %[[I3]]
;
; IND-LABEL: @scalarize_induction_variable_05(
; IND: vector.body:
; IND: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %pred.udiv.continue2 ]
-; IND: %[[I1:.+]] = or i32 %index, 1
; IND: %[[E0:.+]] = sext i32 %index to i64
; IND: getelementptr inbounds i32, i32* %a, i64 %[[E0]]
; IND: pred.udiv.if:
; IND: udiv i32 {{.*}}, %index
; IND: pred.udiv.if1:
+; IND: %[[I1:.+]] = or i32 %index, 1
; IND: udiv i32 {{.*}}, %[[I1]]
;
; UNROLL-LABEL: @scalarize_induction_variable_05(
; UNROLL: vector.body:
; UNROLL: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %pred.udiv.continue11 ]
-; UNROLL: %[[I1:.+]] = or i32 %index, 1
; UNROLL: %[[I2:.+]] = or i32 %index, 2
-; UNROLL: %[[I3:.+]] = or i32 %index, 3
; UNROLL: %[[E0:.+]] = sext i32 %index to i64
; UNROLL: %[[G0:.+]] = getelementptr inbounds i32, i32* %a, i64 %[[E0]]
; UNROLL: getelementptr i32, i32* %[[G0]], i64 2
; UNROLL: pred.udiv.if:
; UNROLL: udiv i32 {{.*}}, %index
; UNROLL: pred.udiv.if6:
+; UNROLL: %[[I1:.+]] = or i32 %index, 1
; UNROLL: udiv i32 {{.*}}, %[[I1]]
; UNROLL: pred.udiv.if8:
; UNROLL: udiv i32 {{.*}}, %[[I2]]
; UNROLL: pred.udiv.if10:
+; UNROLL: %[[I3:.+]] = or i32 %index, 3
; UNROLL: udiv i32 {{.*}}, %[[I3]]
define i32 @scalarize_induction_variable_05(i32* %a, i32 %x, i1 %c, i32 %n) {