addMVEVectorTypes(Subtarget->hasMVEFloatOps());
// Combine low-overhead loop intrinsics so that we can lower i1 types.
- if (Subtarget->hasLOB())
+ if (Subtarget->hasLOB()) {
setTargetDAGCombine(ISD::BRCOND);
+ setTargetDAGCombine(ISD::BR_CC);
+ }
if (Subtarget->hasNEON()) {
addDRTypeForNEON(MVT::v2f32);
case ARMISD::VST3LN_UPD: return "ARMISD::VST3LN_UPD";
case ARMISD::VST4LN_UPD: return "ARMISD::VST4LN_UPD";
case ARMISD::WLS: return "ARMISD::WLS";
+ case ARMISD::LE: return "ARMISD::LE";
+ case ARMISD::LOOP_DEC: return "ARMISD::LOOP_DEC";
}
return nullptr;
}
return V;
}
+// Given N, the value controlling the conditional branch, search for the loop
+// intrinsic and return it, or an empty SDValue if none is found. CC, Imm and
+// Negate are updated to describe how the value is used. Patterns handled:
+// (brcond (xor (setcc (loop.decrement), 0, ne), 1), exit)
+// (brcond (setcc (loop.decrement), 0, eq), exit)
+// (brcond (setcc (loop.decrement), 0, ne), header)
+static SDValue SearchLoopIntrinsic(SDValue N, ISD::CondCode &CC, int &Imm,
+ bool &Negate) {
+ switch (N->getOpcode()) {
+ default:
+ break;
+ case ISD::XOR: {
+ if (!isa<ConstantSDNode>(N.getOperand(1))) // only xor with a constant
+ return SDValue();
+ if (!cast<ConstantSDNode>(N.getOperand(1))->isOne()) // ... equal to 1
+ return SDValue();
+ Negate = !Negate; // each matched xor flips the recorded sense
+ return SearchLoopIntrinsic(N.getOperand(0), CC, Imm, Negate);
+ }
+ case ISD::SETCC: {
+ auto *Const = dyn_cast<ConstantSDNode>(N.getOperand(1)); // compared value
+ if (!Const)
+ return SDValue();
+ if (Const->isNullValue())
+ Imm = 0;
+ else if (Const->isOne())
+ Imm = 1;
+ else
+ return SDValue(); // only comparisons against 0 or 1 are handled
+ CC = cast<CondCodeSDNode>(N.getOperand(2))->get(); // overwrite caller's CC
+ return SearchLoopIntrinsic(N->getOperand(0), CC, Imm, Negate);
+ }
+ case ISD::INTRINSIC_W_CHAIN: {
+ unsigned IntOp = cast<ConstantSDNode>(N.getOperand(1))->getZExtValue();
+ if (IntOp != Intrinsic::test_set_loop_iterations &&
+ IntOp != Intrinsic::loop_decrement_reg)
+ return SDValue(); // some other intrinsic: not a match
+ return N; // found one of the two loop intrinsics
+ }
+ }
+ return SDValue(); // any other opcode: no loop intrinsic found
+}
+
static SDValue PerformHWLoopCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
const ARMSubtarget *ST) {
- // Look for (brcond (xor test.set.loop.iterations, -1)
- SDValue CC = N->getOperand(1);
- unsigned Opc = CC->getOpcode();
- SDValue Int;
- if ((Opc == ISD::XOR || Opc == ISD::SETCC) &&
- (CC->getOperand(0)->getOpcode() == ISD::INTRINSIC_W_CHAIN)) {
+ // The hwloop intrinsics that we're interested in are used for control-flow,
+ // either for entering or exiting the loop:
+ // - test.set.loop.iterations will test whether its operand is zero. If it
+ // is zero, the following branch should not enter the loop.
+ // - loop.decrement.reg also tests whether its operand is zero. If it is
+ // zero, the following branch should not branch back to the beginning of
+ // the loop.
+ // So here, we need to check how the conditional branch is using the result
+ // of each of the intrinsics to ensure that we're branching to the right
+ // place at the right time.
+
+ ISD::CondCode CC;
+ SDValue Cond;
+ int Imm = 1;
+ bool Negate = false;
+ SDValue Chain = N->getOperand(0);
+ SDValue Dest;
- assert((isa<ConstantSDNode>(CC->getOperand(1)) &&
- cast<ConstantSDNode>(CC->getOperand(1))->isOne()) &&
- "Expected to compare against 1");
+ if (N->getOpcode() == ISD::BRCOND) {
+ CC = ISD::SETEQ;
+ Cond = N->getOperand(1);
+ Dest = N->getOperand(2);
+ } else {
+ assert(N->getOpcode() == ISD::BR_CC && "Expected BRCOND or BR_CC!");
+ CC = cast<CondCodeSDNode>(N->getOperand(1))->get();
+ Cond = N->getOperand(2);
+ Dest = N->getOperand(4);
+ if (auto *Const = dyn_cast<ConstantSDNode>(N->getOperand(3))) {
+ if (!Const->isOne() && !Const->isNullValue())
+ return SDValue();
+ Imm = Const->getZExtValue();
+ } else
+ return SDValue();
+ }
- Int = CC->getOperand(0);
- } else if (CC->getOpcode() == ISD::INTRINSIC_W_CHAIN)
- Int = CC;
- else
+ SDValue Int = SearchLoopIntrinsic(Cond, CC, Imm, Negate);
+ if (!Int)
return SDValue();
- unsigned IntOp = cast<ConstantSDNode>(Int.getOperand(1))->getZExtValue();
- if (IntOp != Intrinsic::test_set_loop_iterations)
- return SDValue();
+ if (Negate)
+ CC = ISD::getSetCCInverse(CC, true);
+
+ auto IsTrueIfZero = [](ISD::CondCode CC, int Imm) {
+ return (CC == ISD::SETEQ && Imm == 0) ||
+ (CC == ISD::SETNE && Imm == 1) ||
+ (CC == ISD::SETLT && Imm == 1) ||
+ (CC == ISD::SETULT && Imm == 1);
+ };
+
+ auto IsFalseIfZero = [](ISD::CondCode CC, int Imm) {
+ return (CC == ISD::SETEQ && Imm == 1) ||
+ (CC == ISD::SETNE && Imm == 0) ||
+ (CC == ISD::SETGT && Imm == 0) ||
+ (CC == ISD::SETUGT && Imm == 0) ||
+ (CC == ISD::SETGE && Imm == 1) ||
+ (CC == ISD::SETUGE && Imm == 1);
+ };
+
+ assert((IsTrueIfZero(CC, Imm) || IsFalseIfZero(CC, Imm)) &&
+ "unsupported condition");
SDLoc dl(Int);
- SDValue Chain = N->getOperand(0);
+ SelectionDAG &DAG = DCI.DAG;
SDValue Elements = Int.getOperand(2);
- SDValue ExitBlock = N->getOperand(2);
+ unsigned IntOp = cast<ConstantSDNode>(Int->getOperand(1))->getZExtValue();
+ assert((N->hasOneUse() && N->use_begin()->getOpcode() == ISD::BR)
+ && "expected single br user");
+ SDNode *Br = *N->use_begin();
+ SDValue OtherTarget = Br->getOperand(1);
+
+ // Update the unconditional branch to branch to the given Dest.
+ auto UpdateUncondBr = [](SDNode *Br, SDValue Dest, SelectionDAG &DAG) {
+ SDValue NewBrOps[] = { Br->getOperand(0), Dest };
+ SDValue NewBr = DAG.getNode(ISD::BR, SDLoc(Br), MVT::Other, NewBrOps);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Br, 0), NewBr);
+ };
- // TODO: Once we start supporting tail predication, we can add another
- // operand to WLS for the number of elements processed in a vector loop.
+ if (IntOp == Intrinsic::test_set_loop_iterations) {
+ SDValue Res;
+ // We expect this 'instruction' to branch when the counter is zero.
+ if (IsTrueIfZero(CC, Imm)) {
+ SDValue Ops[] = { Chain, Elements, Dest };
+ Res = DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops);
+ } else {
+ // The logic is the reverse of what we need for WLS, so find the other
+ // basic block target: the target of the following br.
+ UpdateUncondBr(Br, Dest, DAG);
- SDValue Ops[] = { Chain, Elements, ExitBlock };
- SDValue Res = DCI.DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops);
- DCI.DAG.ReplaceAllUsesOfValueWith(Int.getValue(1), Int.getOperand(0));
- return Res;
+ SDValue Ops[] = { Chain, Elements, OtherTarget };
+ Res = DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops);
+ }
+ DAG.ReplaceAllUsesOfValueWith(Int.getValue(1), Int.getOperand(0));
+ return Res;
+ } else {
+ SDValue Size = DAG.getTargetConstant(
+ cast<ConstantSDNode>(Int.getOperand(3))->getZExtValue(), dl, MVT::i32);
+ SDValue Args[] = { Int.getOperand(0), Elements, Size, };
+ SDValue LoopDec = DAG.getNode(ARMISD::LOOP_DEC, dl,
+ DAG.getVTList(MVT::i32, MVT::Other), Args);
+ DAG.ReplaceAllUsesWith(Int.getNode(), LoopDec.getNode());
+
+ // We expect this instruction to branch when the count is not zero.
+ SDValue Target = IsFalseIfZero(CC, Imm) ? Dest : OtherTarget;
+
+ // Update the unconditional branch to target Dest, the original destination
+ // of the conditional branch, if we've found the condition has been reversed.
+ if (Target == OtherTarget)
+ UpdateUncondBr(Br, Dest, DAG);
+
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ SDValue(LoopDec.getNode(), 1), Chain);
+
+ SDValue EndArgs[] = { Chain, SDValue(LoopDec.getNode(), 0), Target };
+ return DAG.getNode(ARMISD::LE, dl, MVT::Other, EndArgs);
+ }
+ return SDValue();
}
/// PerformBRCONDCombine - Target-specific DAG combining for ARMISD::BRCOND.
case ISD::OR: return PerformORCombine(N, DCI, Subtarget);
case ISD::XOR: return PerformXORCombine(N, DCI, Subtarget);
case ISD::AND: return PerformANDCombine(N, DCI, Subtarget);
- case ISD::BRCOND: return PerformHWLoopCombine(N, DCI, Subtarget);
+ case ISD::BRCOND:
+ case ISD::BR_CC: return PerformHWLoopCombine(N, DCI, Subtarget);
case ARMISD::ADDC:
case ARMISD::SUBC: return PerformAddcSubcCombine(N, DCI, Subtarget);
case ARMISD::SUBE: return PerformAddeSubeCombine(N, DCI, Subtarget);
--- /dev/null
+; RUN: llc -mtriple=thumbv8.1m.main -O0 -mattr=+lob -disable-arm-loloops=false -stop-before=arm-low-overhead-loops %s -o - | FileCheck %s --check-prefix=CHECK-MID
+; RUN: llc -mtriple=thumbv8.1m.main -O0 -mattr=+lob -disable-arm-loloops=false -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK-END
+
+; Test that the branch targets are correct after isel, even though the loop
+; will sometimes be reverted anyway.
+
+; CHECK-MID: name: check_loop_dec_brcond_combine
+; CHECK-MID: bb.2.for.body:
+; CHECK-MID: renamable $lr = t2LoopDec killed renamable $lr, 1
+; CHECK-MID: t2LoopEnd killed renamable $lr, %bb.3
+; CHECK-MID: bb.3.for.header:
+; CHECK-MID: tB %bb.2
+
+; CHECK-END: .LBB0_1:
+; CHECK-END: b .LBB0_3
+; CHECK-END: .LBB0_2:
+; CHECK-END: sub.w lr, lr, #1
+; CHECK-END: cmp.w lr, #0
+; CHECK-END: bne.w .LBB0_3
+; CHECK-END: b .LBB0_4
+; CHECK-END: .LBB0_3:
+; CHECK-END: b .LBB0_2
+define void @check_loop_dec_brcond_combine(i32* nocapture %a, i32* nocapture readonly %b, i32* nocapture readonly %c, i32 %N) {
+entry:
+ call void @llvm.set.loop.iterations.i32(i32 %N)
+ br label %for.body.preheader
+
+for.body.preheader:
+ %scevgep = getelementptr i32, i32* %a, i32 -1
+ %scevgep4 = getelementptr i32, i32* %c, i32 -1
+ %scevgep8 = getelementptr i32, i32* %b, i32 -1
+ br label %for.header
+
+for.body:
+ %scevgep11 = getelementptr i32, i32* %lsr.iv9, i32 1
+ %ld1 = load i32, i32* %scevgep11, align 4
+ %scevgep7 = getelementptr i32, i32* %lsr.iv5, i32 1
+ %ld2 = load i32, i32* %scevgep7, align 4
+ %mul = mul nsw i32 %ld2, %ld1
+ %scevgep3 = getelementptr i32, i32* %lsr.iv1, i32 1
+ store i32 %mul, i32* %scevgep3, align 4
+ %scevgep2 = getelementptr i32, i32* %lsr.iv1, i32 1
+ %scevgep6 = getelementptr i32, i32* %lsr.iv5, i32 1
+ %scevgep10 = getelementptr i32, i32* %lsr.iv9, i32 1
+ %count.next = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %count, i32 1)
+ %cmp = icmp ne i32 %count.next, 0
+ br i1 %cmp, label %for.header, label %for.cond.cleanup
+
+for.header:
+ %lsr.iv9 = phi i32* [ %scevgep8, %for.body.preheader ], [ %scevgep10, %for.body ]
+ %lsr.iv5 = phi i32* [ %scevgep4, %for.body.preheader ], [ %scevgep6, %for.body ]
+ %lsr.iv1 = phi i32* [ %scevgep, %for.body.preheader ], [ %scevgep2, %for.body ]
+ %count = phi i32 [ %N, %for.body.preheader ], [ %count.next, %for.body ]
+ br label %for.body
+
+for.cond.cleanup:
+ ret void
+}
+
+; CHECK-MID: name: check_loop_dec_ugt_brcond_combine
+; CHECK-MID: bb.2.for.body:
+; CHECK-MID: renamable $lr = t2LoopDec killed renamable $lr, 1
+; CHECK-MID: t2LoopEnd killed renamable $lr, %bb.3
+; CHECK-MID: bb.3.for.header:
+; CHECK-MID: tB %bb.2
+define void @check_loop_dec_ugt_brcond_combine(i32* nocapture %a, i32* nocapture readonly %b, i32* nocapture readonly %c, i32 %N) {
+entry:
+ call void @llvm.set.loop.iterations.i32(i32 %N)
+ br label %for.body.preheader
+
+for.body.preheader:
+ %scevgep = getelementptr i32, i32* %a, i32 -1
+ %scevgep4 = getelementptr i32, i32* %c, i32 -1
+ %scevgep8 = getelementptr i32, i32* %b, i32 -1
+ br label %for.header
+
+for.body:
+ %scevgep11 = getelementptr i32, i32* %lsr.iv9, i32 1
+ %ld1 = load i32, i32* %scevgep11, align 4
+ %scevgep7 = getelementptr i32, i32* %lsr.iv5, i32 1
+ %ld2 = load i32, i32* %scevgep7, align 4
+ %mul = mul nsw i32 %ld2, %ld1
+ %scevgep3 = getelementptr i32, i32* %lsr.iv1, i32 1
+ store i32 %mul, i32* %scevgep3, align 4
+ %scevgep2 = getelementptr i32, i32* %lsr.iv1, i32 1
+ %scevgep6 = getelementptr i32, i32* %lsr.iv5, i32 1
+ %scevgep10 = getelementptr i32, i32* %lsr.iv9, i32 1
+ %count.next = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %count, i32 1)
+ %cmp = icmp ugt i32 %count.next, 0
+ br i1 %cmp, label %for.header, label %for.cond.cleanup
+
+for.header:
+ %lsr.iv9 = phi i32* [ %scevgep8, %for.body.preheader ], [ %scevgep10, %for.body ]
+ %lsr.iv5 = phi i32* [ %scevgep4, %for.body.preheader ], [ %scevgep6, %for.body ]
+ %lsr.iv1 = phi i32* [ %scevgep, %for.body.preheader ], [ %scevgep2, %for.body ]
+ %count = phi i32 [ %N, %for.body.preheader ], [ %count.next, %for.body ]
+ br label %for.body
+
+for.cond.cleanup:
+ ret void
+}
+
+; CHECK-MID: name: check_loop_dec_ult_brcond_combine
+; CHECK-MID: bb.2.for.body:
+; CHECK-MID: renamable $lr = t2LoopDec killed renamable $lr, 1
+; CHECK-MID: t2LoopEnd killed renamable $lr, %bb.3
+; CHECK-MID: bb.3.for.header:
+; CHECK-MID: tB %bb.2
+define void @check_loop_dec_ult_brcond_combine(i32* nocapture %a, i32* nocapture readonly %b, i32* nocapture readonly %c, i32 %N) {
+entry:
+ call void @llvm.set.loop.iterations.i32(i32 %N)
+ br label %for.body.preheader
+
+for.body.preheader:
+ %scevgep = getelementptr i32, i32* %a, i32 -1
+ %scevgep4 = getelementptr i32, i32* %c, i32 -1
+ %scevgep8 = getelementptr i32, i32* %b, i32 -1
+ br label %for.header
+
+for.body:
+ %scevgep11 = getelementptr i32, i32* %lsr.iv9, i32 1
+ %ld1 = load i32, i32* %scevgep11, align 4
+ %scevgep7 = getelementptr i32, i32* %lsr.iv5, i32 1
+ %ld2 = load i32, i32* %scevgep7, align 4
+ %mul = mul nsw i32 %ld2, %ld1
+ %scevgep3 = getelementptr i32, i32* %lsr.iv1, i32 1
+ store i32 %mul, i32* %scevgep3, align 4
+ %scevgep2 = getelementptr i32, i32* %lsr.iv1, i32 1
+ %scevgep6 = getelementptr i32, i32* %lsr.iv5, i32 1
+ %scevgep10 = getelementptr i32, i32* %lsr.iv9, i32 1
+ %count.next = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %count, i32 1)
+ %cmp = icmp ult i32 %count.next, 1
+ br i1 %cmp, label %for.cond.cleanup, label %for.header
+
+for.header:
+ %lsr.iv9 = phi i32* [ %scevgep8, %for.body.preheader ], [ %scevgep10, %for.body ]
+ %lsr.iv5 = phi i32* [ %scevgep4, %for.body.preheader ], [ %scevgep6, %for.body ]
+ %lsr.iv1 = phi i32* [ %scevgep, %for.body.preheader ], [ %scevgep2, %for.body ]
+ %count = phi i32 [ %N, %for.body.preheader ], [ %count.next, %for.body ]
+ br label %for.body
+
+for.cond.cleanup:
+ ret void
+}
+
+; CHECK-MID: name: check_loop_dec_ult_xor_brcond_combine
+; CHECK-MID: bb.2.for.body:
+; CHECK-MID: t2LoopEnd killed renamable $lr, %bb.3
+; CHECK-MID: tB %bb.4, 14
+; CHECK-MID: bb.3.for.header:
+; CHECK-MID: tB %bb.2
+define void @check_loop_dec_ult_xor_brcond_combine(i32* nocapture %a, i32* nocapture readonly %b, i32* nocapture readonly %c, i32 %N) {
+entry:
+ call void @llvm.set.loop.iterations.i32(i32 %N)
+ br label %for.body.preheader
+
+for.body.preheader:
+ %scevgep = getelementptr i32, i32* %a, i32 -1
+ %scevgep4 = getelementptr i32, i32* %c, i32 -1
+ %scevgep8 = getelementptr i32, i32* %b, i32 -1
+ br label %for.header
+
+for.body:
+ %scevgep11 = getelementptr i32, i32* %lsr.iv9, i32 1
+ %ld1 = load i32, i32* %scevgep11, align 4
+ %scevgep7 = getelementptr i32, i32* %lsr.iv5, i32 1
+ %ld2 = load i32, i32* %scevgep7, align 4
+ %mul = mul nsw i32 %ld2, %ld1
+ %scevgep3 = getelementptr i32, i32* %lsr.iv1, i32 1
+ store i32 %mul, i32* %scevgep3, align 4
+ %scevgep2 = getelementptr i32, i32* %lsr.iv1, i32 1
+ %scevgep6 = getelementptr i32, i32* %lsr.iv5, i32 1
+ %scevgep10 = getelementptr i32, i32* %lsr.iv9, i32 1
+ %count.next = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %count, i32 1)
+ %cmp = icmp ult i32 %count.next, 1
+ %negate = xor i1 %cmp, 1
+ br i1 %negate, label %for.header, label %for.cond.cleanup
+
+for.header:
+ %lsr.iv9 = phi i32* [ %scevgep8, %for.body.preheader ], [ %scevgep10, %for.body ]
+ %lsr.iv5 = phi i32* [ %scevgep4, %for.body.preheader ], [ %scevgep6, %for.body ]
+ %lsr.iv1 = phi i32* [ %scevgep, %for.body.preheader ], [ %scevgep2, %for.body ]
+ %count = phi i32 [ %N, %for.body.preheader ], [ %count.next, %for.body ]
+ br label %for.body
+
+for.cond.cleanup:
+ ret void
+}
+
+; CHECK-MID: name: check_loop_dec_sgt_brcond_combine
+; CHECK-MID: bb.2.for.body:
+; CHECK-MID: t2LoopEnd killed renamable $lr, %bb.3
+; CHECK-MID: tB %bb.4, 14
+; CHECK-MID: bb.3.for.header:
+; CHECK-MID: tB %bb.2
+define void @check_loop_dec_sgt_brcond_combine(i32* nocapture %a, i32* nocapture readonly %b, i32* nocapture readonly %c, i32 %N) {
+entry:
+ call void @llvm.set.loop.iterations.i32(i32 %N)
+ br label %for.body.preheader
+
+for.body.preheader:
+ %scevgep = getelementptr i32, i32* %a, i32 -1
+ %scevgep4 = getelementptr i32, i32* %c, i32 -1
+ %scevgep8 = getelementptr i32, i32* %b, i32 -1
+ br label %for.header
+
+for.body:
+ %scevgep11 = getelementptr i32, i32* %lsr.iv9, i32 1
+ %ld1 = load i32, i32* %scevgep11, align 4
+ %scevgep7 = getelementptr i32, i32* %lsr.iv5, i32 1
+ %ld2 = load i32, i32* %scevgep7, align 4
+ %mul = mul nsw i32 %ld2, %ld1
+ %scevgep3 = getelementptr i32, i32* %lsr.iv1, i32 1
+ store i32 %mul, i32* %scevgep3, align 4
+ %scevgep2 = getelementptr i32, i32* %lsr.iv1, i32 1
+ %scevgep6 = getelementptr i32, i32* %lsr.iv5, i32 1
+ %scevgep10 = getelementptr i32, i32* %lsr.iv9, i32 1
+ %count.next = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %count, i32 1)
+ %cmp = icmp sgt i32 %count.next, 0
+ br i1 %cmp, label %for.header, label %for.cond.cleanup
+
+for.header:
+ %lsr.iv9 = phi i32* [ %scevgep8, %for.body.preheader ], [ %scevgep10, %for.body ]
+ %lsr.iv5 = phi i32* [ %scevgep4, %for.body.preheader ], [ %scevgep6, %for.body ]
+ %lsr.iv1 = phi i32* [ %scevgep, %for.body.preheader ], [ %scevgep2, %for.body ]
+ %count = phi i32 [ %N, %for.body.preheader ], [ %count.next, %for.body ]
+ br label %for.body
+
+for.cond.cleanup:
+ ret void
+}
+
+; CHECK-MID: name: check_loop_dec_sge_brcond_combine
+; CHECK-MID: bb.2.for.body:
+; CHECK-MID: t2LoopEnd killed renamable $lr, %bb.3
+; CHECK-MID: tB %bb.4, 14
+; CHECK-MID: bb.3.for.header:
+; CHECK-MID: tB %bb.2
+define void @check_loop_dec_sge_brcond_combine(i32* nocapture %a, i32* nocapture readonly %b, i32* nocapture readonly %c, i32 %N) {
+entry:
+ call void @llvm.set.loop.iterations.i32(i32 %N)
+ br label %for.body.preheader
+
+for.body.preheader:
+ %scevgep = getelementptr i32, i32* %a, i32 -1
+ %scevgep4 = getelementptr i32, i32* %c, i32 -1
+ %scevgep8 = getelementptr i32, i32* %b, i32 -1
+ br label %for.header
+
+for.body:
+ %scevgep11 = getelementptr i32, i32* %lsr.iv9, i32 1
+ %ld1 = load i32, i32* %scevgep11, align 4
+ %scevgep7 = getelementptr i32, i32* %lsr.iv5, i32 1
+ %ld2 = load i32, i32* %scevgep7, align 4
+ %mul = mul nsw i32 %ld2, %ld1
+ %scevgep3 = getelementptr i32, i32* %lsr.iv1, i32 1
+ store i32 %mul, i32* %scevgep3, align 4
+ %scevgep2 = getelementptr i32, i32* %lsr.iv1, i32 1
+ %scevgep6 = getelementptr i32, i32* %lsr.iv5, i32 1
+ %scevgep10 = getelementptr i32, i32* %lsr.iv9, i32 1
+ %count.next = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %count, i32 1)
+ %cmp = icmp sge i32 %count.next, 1
+ br i1 %cmp, label %for.header, label %for.cond.cleanup
+
+for.header:
+ %lsr.iv9 = phi i32* [ %scevgep8, %for.body.preheader ], [ %scevgep10, %for.body ]
+ %lsr.iv5 = phi i32* [ %scevgep4, %for.body.preheader ], [ %scevgep6, %for.body ]
+ %lsr.iv1 = phi i32* [ %scevgep, %for.body.preheader ], [ %scevgep2, %for.body ]
+ %count = phi i32 [ %N, %for.body.preheader ], [ %count.next, %for.body ]
+ br label %for.body
+
+for.cond.cleanup:
+ ret void
+}
+
+; CHECK-MID: name: check_loop_dec_sge_xor_brcond_combine
+; CHECK-MID: bb.2.for.body:
+; CHECK-MID: t2LoopEnd killed renamable $lr, %bb.3
+; CHECK-MID: tB %bb.4, 14
+; CHECK-MID: bb.3.for.header:
+; CHECK-MID: tB %bb.2
+define void @check_loop_dec_sge_xor_brcond_combine(i32* nocapture %a, i32* nocapture readonly %b, i32* nocapture readonly %c, i32 %N) {
+entry:
+ call void @llvm.set.loop.iterations.i32(i32 %N)
+ br label %for.body.preheader
+
+for.body.preheader:
+ %scevgep = getelementptr i32, i32* %a, i32 -1
+ %scevgep4 = getelementptr i32, i32* %c, i32 -1
+ %scevgep8 = getelementptr i32, i32* %b, i32 -1
+ br label %for.header
+
+for.body:
+ %scevgep11 = getelementptr i32, i32* %lsr.iv9, i32 1
+ %ld1 = load i32, i32* %scevgep11, align 4
+ %scevgep7 = getelementptr i32, i32* %lsr.iv5, i32 1
+ %ld2 = load i32, i32* %scevgep7, align 4
+ %mul = mul nsw i32 %ld2, %ld1
+ %scevgep3 = getelementptr i32, i32* %lsr.iv1, i32 1
+ store i32 %mul, i32* %scevgep3, align 4
+ %scevgep2 = getelementptr i32, i32* %lsr.iv1, i32 1
+ %scevgep6 = getelementptr i32, i32* %lsr.iv5, i32 1
+ %scevgep10 = getelementptr i32, i32* %lsr.iv9, i32 1
+ %count.next = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %count, i32 1)
+ %cmp = icmp sge i32 %count.next, 1
+ %negated = xor i1 %cmp, 1
+ br i1 %negated, label %for.cond.cleanup, label %for.header
+
+for.header:
+ %lsr.iv9 = phi i32* [ %scevgep8, %for.body.preheader ], [ %scevgep10, %for.body ]
+ %lsr.iv5 = phi i32* [ %scevgep4, %for.body.preheader ], [ %scevgep6, %for.body ]
+ %lsr.iv1 = phi i32* [ %scevgep, %for.body.preheader ], [ %scevgep2, %for.body ]
+ %count = phi i32 [ %N, %for.body.preheader ], [ %count.next, %for.body ]
+ br label %for.body
+
+for.cond.cleanup:
+ ret void
+}
+
+; CHECK-MID: name: check_loop_dec_uge_brcond_combine
+; CHECK-MID: bb.2.for.body:
+; CHECK-MID: t2LoopEnd killed renamable $lr, %bb.3
+; CHECK-MID: tB %bb.4, 14
+; CHECK-MID: bb.3.for.header:
+; CHECK-MID: tB %bb.2
+define void @check_loop_dec_uge_brcond_combine(i32* nocapture %a, i32* nocapture readonly %b, i32* nocapture readonly %c, i32 %N) {
+entry:
+ call void @llvm.set.loop.iterations.i32(i32 %N)
+ br label %for.body.preheader
+
+for.body.preheader:
+ %scevgep = getelementptr i32, i32* %a, i32 -1
+ %scevgep4 = getelementptr i32, i32* %c, i32 -1
+ %scevgep8 = getelementptr i32, i32* %b, i32 -1
+ br label %for.header
+
+for.body:
+ %scevgep11 = getelementptr i32, i32* %lsr.iv9, i32 1
+ %ld1 = load i32, i32* %scevgep11, align 4
+ %scevgep7 = getelementptr i32, i32* %lsr.iv5, i32 1
+ %ld2 = load i32, i32* %scevgep7, align 4
+ %mul = mul nsw i32 %ld2, %ld1
+ %scevgep3 = getelementptr i32, i32* %lsr.iv1, i32 1
+ store i32 %mul, i32* %scevgep3, align 4
+ %scevgep2 = getelementptr i32, i32* %lsr.iv1, i32 1
+ %scevgep6 = getelementptr i32, i32* %lsr.iv5, i32 1
+ %scevgep10 = getelementptr i32, i32* %lsr.iv9, i32 1
+ %count.next = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %count, i32 1)
+ %cmp = icmp uge i32 %count.next, 1
+ br i1 %cmp, label %for.header, label %for.cond.cleanup
+
+for.header:
+ %lsr.iv9 = phi i32* [ %scevgep8, %for.body.preheader ], [ %scevgep10, %for.body ]
+ %lsr.iv5 = phi i32* [ %scevgep4, %for.body.preheader ], [ %scevgep6, %for.body ]
+ %lsr.iv1 = phi i32* [ %scevgep, %for.body.preheader ], [ %scevgep2, %for.body ]
+ %count = phi i32 [ %N, %for.body.preheader ], [ %count.next, %for.body ]
+ br label %for.body
+
+for.cond.cleanup:
+ ret void
+}
+
+; CHECK-MID: name: check_loop_dec_uge_xor_brcond_combine
+; CHECK-MID: bb.2.for.body:
+; CHECK-MID: t2LoopEnd killed renamable $lr, %bb.3
+; CHECK-MID: tB %bb.4, 14
+; CHECK-MID: bb.3.for.header:
+; CHECK-MID: tB %bb.2
+define void @check_loop_dec_uge_xor_brcond_combine(i32* nocapture %a, i32* nocapture readonly %b, i32* nocapture readonly %c, i32 %N) {
+entry:
+ call void @llvm.set.loop.iterations.i32(i32 %N)
+ br label %for.body.preheader
+
+for.body.preheader:
+ %scevgep = getelementptr i32, i32* %a, i32 -1
+ %scevgep4 = getelementptr i32, i32* %c, i32 -1
+ %scevgep8 = getelementptr i32, i32* %b, i32 -1
+ br label %for.header
+
+for.body:
+ %scevgep11 = getelementptr i32, i32* %lsr.iv9, i32 1
+ %ld1 = load i32, i32* %scevgep11, align 4
+ %scevgep7 = getelementptr i32, i32* %lsr.iv5, i32 1
+ %ld2 = load i32, i32* %scevgep7, align 4
+ %mul = mul nsw i32 %ld2, %ld1
+ %scevgep3 = getelementptr i32, i32* %lsr.iv1, i32 1
+ store i32 %mul, i32* %scevgep3, align 4
+ %scevgep2 = getelementptr i32, i32* %lsr.iv1, i32 1
+ %scevgep6 = getelementptr i32, i32* %lsr.iv5, i32 1
+ %scevgep10 = getelementptr i32, i32* %lsr.iv9, i32 1
+ %count.next = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %count, i32 1)
+ %cmp = icmp uge i32 %count.next, 1
+ %negated = xor i1 %cmp, 1
+ br i1 %negated, label %for.cond.cleanup, label %for.header
+
+for.header:
+ %lsr.iv9 = phi i32* [ %scevgep8, %for.body.preheader ], [ %scevgep10, %for.body ]
+ %lsr.iv5 = phi i32* [ %scevgep4, %for.body.preheader ], [ %scevgep6, %for.body ]
+ %lsr.iv1 = phi i32* [ %scevgep, %for.body.preheader ], [ %scevgep2, %for.body ]
+ %count = phi i32 [ %N, %for.body.preheader ], [ %count.next, %for.body ]
+ br label %for.body
+
+for.cond.cleanup:
+ ret void
+}
+
+; CHECK-MID: check_negated_xor_wls
+; CHECK-MID: t2WhileLoopStart killed renamable $r2, %bb.3
+; CHECK-MID: tB %bb.1
+; CHECK-MID: bb.1.while.body.preheader:
+; CHECK-MID: $lr = t2LoopDec killed renamable $lr, 1
+; CHECK-MID: t2LoopEnd killed renamable $lr, %bb.2
+; CHECK-MID: tB %bb.3
+; CHECK-MID: bb.3.while.end:
+define void @check_negated_xor_wls(i16* nocapture %a, i16* nocapture readonly %b, i32 %N) {
+entry:
+ %wls = call i1 @llvm.test.set.loop.iterations.i32(i32 %N)
+ %xor = xor i1 %wls, 1
+ br i1 %xor, label %while.end, label %while.body.preheader
+
+while.body.preheader:
+ br label %while.body
+
+while.body:
+ %a.addr.06 = phi i16* [ %incdec.ptr1, %while.body ], [ %a, %while.body.preheader ]
+ %b.addr.05 = phi i16* [ %incdec.ptr, %while.body ], [ %b, %while.body.preheader ]
+ %count = phi i32 [ %N, %while.body.preheader ], [ %count.next, %while.body ]
+ %incdec.ptr = getelementptr inbounds i16, i16* %b.addr.05, i32 1
+ %ld.b = load i16, i16* %b.addr.05, align 2
+ %incdec.ptr1 = getelementptr inbounds i16, i16* %a.addr.06, i32 1
+ store i16 %ld.b, i16* %a.addr.06, align 2
+ %count.next = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %count, i32 1)
+ %cmp = icmp ne i32 %count.next, 0
+ br i1 %cmp, label %while.body, label %while.end
+
+while.end:
+ ret void
+}
+
+; CHECK-MID: check_negated_cmp_wls
+; CHECK-MID: t2WhileLoopStart killed renamable $r2, %bb.3
+; CHECK-MID: tB %bb.1
+; CHECK-MID: bb.1.while.body.preheader:
+; CHECK-MID: $lr = t2LoopDec killed renamable $lr, 1
+; CHECK-MID: t2LoopEnd killed renamable $lr, %bb.2
+; CHECK-MID: tB %bb.3
+; CHECK-MID: bb.3.while.end:
+define void @check_negated_cmp_wls(i16* nocapture %a, i16* nocapture readonly %b, i32 %N) {
+entry:
+ %wls = call i1 @llvm.test.set.loop.iterations.i32(i32 %N)
+ %cmp = icmp ne i1 %wls, 1
+ br i1 %cmp, label %while.end, label %while.body.preheader
+
+while.body.preheader:
+ br label %while.body
+
+while.body:
+ %a.addr.06 = phi i16* [ %incdec.ptr1, %while.body ], [ %a, %while.body.preheader ]
+ %b.addr.05 = phi i16* [ %incdec.ptr, %while.body ], [ %b, %while.body.preheader ]
+ %count = phi i32 [ %N, %while.body.preheader ], [ %count.next, %while.body ]
+ %incdec.ptr = getelementptr inbounds i16, i16* %b.addr.05, i32 1
+ %ld.b = load i16, i16* %b.addr.05, align 2
+ %incdec.ptr1 = getelementptr inbounds i16, i16* %a.addr.06, i32 1
+ store i16 %ld.b, i16* %a.addr.06, align 2
+ %count.next = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %count, i32 1)
+ %cmp.1 = icmp ne i32 %count.next, 0
+ br i1 %cmp.1, label %while.body, label %while.end
+
+while.end:
+ ret void
+}
+
+; CHECK-MID: check_negated_reordered_wls
+; CHECK-MID: bb.1.while.body.preheader:
+; CHECK-MID: tB %bb.2
+; CHECK-MID: bb.2.while.body:
+; CHECK-MID: t2LoopDec killed renamable $lr, 1
+; CHECK-MID: t2LoopEnd killed renamable $lr, %bb.2
+; CHECK-MID: tB %bb.4
+; CHECK-MID: bb.3.while:
+; CHECK-MID: t2WhileLoopStart {{.*}}, %bb.4
+; CHECK-MID: bb.4.while.end
+define void @check_negated_reordered_wls(i16* nocapture %a, i16* nocapture readonly %b, i32 %N) {
+entry:
+ br label %while
+
+while.body.preheader:
+ br label %while.body
+
+while.body:
+ %a.addr.06 = phi i16* [ %incdec.ptr1, %while.body ], [ %a, %while.body.preheader ]
+ %b.addr.05 = phi i16* [ %incdec.ptr, %while.body ], [ %b, %while.body.preheader ]
+ %count = phi i32 [ %N, %while.body.preheader ], [ %count.next, %while.body ]
+ %incdec.ptr = getelementptr inbounds i16, i16* %b.addr.05, i32 1
+ %ld.b = load i16, i16* %b.addr.05, align 2
+ %incdec.ptr1 = getelementptr inbounds i16, i16* %a.addr.06, i32 1
+ store i16 %ld.b, i16* %a.addr.06, align 2
+ %count.next = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %count, i32 1)
+ %cmp = icmp ne i32 %count.next, 0
+ br i1 %cmp, label %while.body, label %while.end
+
+while:
+ %wls = call i1 @llvm.test.set.loop.iterations.i32(i32 %N)
+ %xor = xor i1 %wls, 1
+ br i1 %xor, label %while.end, label %while.body.preheader
+
+while.end:
+ ret void
+}
+
+declare void @llvm.set.loop.iterations.i32(i32)
+declare i1 @llvm.test.set.loop.iterations.i32(i32)
+declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)