// Check that the icmp is checking for equality of Count and zero and that
// a non-zero value results in entering the loop.
auto ICmp = cast<ICmpInst>(BI->getCondition());
+ LLVM_DEBUG(dbgs() << " - Found condition: " << *ICmp << "\n");
if (!ICmp->isEquality())
return false;
// Other cases are autogenerated.
break;
}
+ case ARMISD::WLS: {
+   // Select the WLS node to the t2WhileLoopStart pseudo. The machine node takes
+   // the node's operands 1 and 2 first and operand 0 last.
+   SDValue LoopCount = N->getOperand(1);
+   SDValue ExitTarget = N->getOperand(2);
+   SDValue InChain = N->getOperand(0);
+   SDValue StartOps[] = { LoopCount, ExitTarget, InChain };
+
+   SDNode *WhileStart = CurDAG->getMachineNode(ARM::t2WhileLoopStart, dl,
+                                               MVT::Other, StartOps);
+   // Redirect all users of N to the new machine node, then drop N.
+   ReplaceUses(N, WhileStart);
+   CurDAG->RemoveDeadNode(N);
+   return;
+ }
case ARMISD::BRCOND: {
// Pattern: (ARMbrcond:void (bb:Other):$dst, (imm:i32):$cc)
// Emits: (Bcc:void (bb:Other):$dst, (imm:i32):$cc)
if (Subtarget->hasMVEIntegerOps())
addMVEVectorTypes(Subtarget->hasMVEFloatOps());
+ // Combine low-overhead loop intrinsics so that we can lower i1 types.
+ if (Subtarget->hasLOB())
+ setTargetDAGCombine(ISD::BRCOND);
+
if (Subtarget->hasNEON()) {
addDRTypeForNEON(MVT::v2f32);
addDRTypeForNEON(MVT::v8i8);
case ARMISD::VST2LN_UPD: return "ARMISD::VST2LN_UPD";
case ARMISD::VST3LN_UPD: return "ARMISD::VST3LN_UPD";
case ARMISD::VST4LN_UPD: return "ARMISD::VST4LN_UPD";
+ case ARMISD::WLS: return "ARMISD::WLS";
}
return nullptr;
}
return V;
}
+/// PerformHWLoopCombine - Target-specific DAG combine for ISD::BRCOND nodes
+/// whose condition is built from the test.set.loop.iterations intrinsic.
+///
+/// Matches (brcond Chain, (xor|setcc (test.set.loop.iterations Elements), 1),
+/// ExitBlock) and returns an ARMISD::WLS node with the operands
+/// (Chain, Elements, ExitBlock). Returns an empty SDValue when \p N does not
+/// have that shape. \p ST is currently unused.
+static SDValue PerformHWLoopCombine(SDNode *N,
+                                    TargetLowering::DAGCombinerInfo &DCI,
+                                    const ARMSubtarget *ST) {
+  // Look for (brcond (xor test.set.loop.iterations, 1)).
+  SDValue CC = N->getOperand(1);
+
+  if (CC->getOpcode() != ISD::XOR && CC->getOpcode() != ISD::SETCC)
+    return SDValue();
+
+  if (CC->getOperand(0)->getOpcode() != ISD::INTRINSIC_W_CHAIN)
+    return SDValue();
+
+  // Operand 1 of an INTRINSIC_W_CHAIN node is read as the intrinsic ID.
+  SDValue Int = CC->getOperand(0);
+  unsigned IntOp = cast<ConstantSDNode>(Int.getOperand(1))->getZExtValue();
+  if (IntOp != Intrinsic::test_set_loop_iterations)
+    return SDValue();
+
+  // The intrinsic result is expected to be combined with the constant 1.
+  // Check for a null result before dereferencing, so that a non-constant
+  // operand triggers the assertion message rather than a null dereference.
+  auto *Const = dyn_cast<ConstantSDNode>(CC->getOperand(1));
+  assert(Const && Const->isOne() && "Expected to compare against 1");
+  (void)Const; // Only used by the assert above.
+
+  SDLoc dl(Int);
+  SDValue Chain = N->getOperand(0);
+  SDValue Elements = Int.getOperand(2);
+  SDValue ExitBlock = N->getOperand(2);
+
+  // TODO: Once we start supporting tail predication, we can add another
+  // operand to WLS for the number of elements processed in a vector loop.
+
+  SDValue Ops[] = { Chain, Elements, ExitBlock };
+  SDValue Res = DCI.DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops);
+  // Users of the intrinsic's chain result (value 1) are rewired to the
+  // intrinsic's incoming chain (operand 0).
+  DCI.DAG.ReplaceAllUsesOfValueWith(Int.getValue(1), Int.getOperand(0));
+  return Res;
+}
+
+
/// PerformBRCONDCombine - Target-specific DAG combining for ARMISD::BRCOND.
SDValue
ARMTargetLowering::PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const {
case ISD::OR: return PerformORCombine(N, DCI, Subtarget);
case ISD::XOR: return PerformXORCombine(N, DCI, Subtarget);
case ISD::AND: return PerformANDCombine(N, DCI, Subtarget);
+ case ISD::BRCOND: return PerformHWLoopCombine(N, DCI, Subtarget);
case ARMISD::ADDC:
case ARMISD::SUBC: return PerformAddcSubcCombine(N, DCI, Subtarget);
case ARMISD::SUBE: return PerformAddeSubeCombine(N, DCI, Subtarget);
WIN__CHKSTK, // Windows' __chkstk call to do stack probing.
WIN__DBZCHK, // Windows' divide by zero check
+ WLS, // Low-overhead loops, While Loop Start
+
VCEQ, // Vector compare equal.
VCEQZ, // Vector compare equal to zero.
VCGE, // Vector compare greater than or equal.
SDTCisInt<0>,
SDTCisInt<4>]>;
+// Type profile for the while-loop-start node: zero results and two operands,
+// constrained to i32 (operand 0) and OtherVT (operand 1).
+// TODO Add another operand for 'Size' so that we can re-use this node when we
+// start supporting *TP versions.
+def SDT_ARMWhileLoop : SDTypeProfile<0, 2, [SDTCisVT<0, i32>,
+ SDTCisVT<1, OtherVT>]>;
+
def ARMSmlald : SDNode<"ARMISD::SMLALD", SDT_LongMac>;
def ARMSmlaldx : SDNode<"ARMISD::SMLALDX", SDT_LongMac>;
def ARMSmlsld : SDNode<"ARMISD::SMLSLD", SDT_LongMac>;
def ARMvgetlaneu : SDNode<"ARMISD::VGETLANEu", SDTARMVGETLN>;
def ARMvgetlanes : SDNode<"ARMISD::VGETLANEs", SDTARMVGETLN>;
+def ARMWLS : SDNode<"ARMISD::WLS", SDT_ARMWhileLoop,
+ [SDNPHasChain]>;
+
//===----------------------------------------------------------------------===//
// ARM Flag Definitions.
t2PseudoInst<(outs GPRlr:$Rm), (ins GPRlr:$Rn, imm0_7:$size),
4, IIC_Br, []>, Sched<[WriteBr]>;
-let isBranch = 1, isTerminator = 1, hasSideEffects = 1 in
+let isBranch = 1, isTerminator = 1, hasSideEffects = 1 in {
+def t2WhileLoopStart :
+ t2PseudoInst<(outs),
+ (ins rGPR:$elts, brtarget:$target),
+ 4, IIC_Br, []>,
+ Sched<[WriteBr]>;
+
def t2LoopEnd :
t2PseudoInst<(outs), (ins GPRlr:$elts, brtarget:$target),
8, IIC_Br, []>, Sched<[WriteBr]>;
+} // end isBranch, isTerminator, hasSideEffects
+
} // end isNotDuplicable
class CS<string iname, bits<4> opcode, list<dag> pattern=[]>
LLVM_DEBUG(dbgs() << "ARM Loops: Processing " << *ML);
auto IsLoopStart = [](MachineInstr &MI) {
- return MI.getOpcode() == ARM::t2DoLoopStart;
+ return MI.getOpcode() == ARM::t2DoLoopStart ||
+ MI.getOpcode() == ARM::t2WhileLoopStart;
};
- auto SearchForStart =
- [&IsLoopStart](MachineBasicBlock *MBB) -> MachineInstr* {
+ // Search the given block for a loop start instruction. If one isn't found,
+ // and there's only one predecessor block, search that one too.
+ std::function<MachineInstr*(MachineBasicBlock*)> SearchForStart =
+ [&IsLoopStart, &SearchForStart](MachineBasicBlock *MBB) -> MachineInstr* {
for (auto &MI : *MBB) {
if (IsLoopStart(MI))
return &MI;
}
+ if (MBB->pred_size() == 1)
+ return SearchForStart(*MBB->pred_begin());
return nullptr;
};
MachineInstr *End = nullptr;
bool Revert = false;
- if (auto *Preheader = ML->getLoopPreheader())
+ // Search the preheader for the start intrinsic, or look through the
+ // predecessors of the header to find exactly one set.iterations intrinsic.
+ // FIXME: I don't see why we shouldn't be supporting multiple predecessors
+ // with potentially multiple set.loop.iterations, so we need to enable this.
+ if (auto *Preheader = ML->getLoopPreheader()) {
Start = SearchForStart(Preheader);
+ } else {
+ LLVM_DEBUG(dbgs() << "ARM Loops: Failed to find loop preheader!\n"
+ << " - Performing manual predecessor search.\n");
+ MachineBasicBlock *Pred = nullptr;
+ for (auto *MBB : ML->getHeader()->predecessors()) {
+ if (!ML->contains(MBB)) {
+ if (Pred) {
+ LLVM_DEBUG(dbgs() << " - Found multiple out-of-loop preds.\n");
+ Start = nullptr;
+ break;
+ }
+ Pred = MBB;
+ Start = SearchForStart(MBB);
+ }
+ }
+ }
// Find the low-overhead loop components and decide whether or not to fall
// back to a normal loop.
break;
}
- if (Start || Dec || End) {
- if (!Start || !Dec || !End)
- report_fatal_error("Failed to find all loop components");
- } else {
+ if (!Start && !Dec && !End) {
LLVM_DEBUG(dbgs() << "ARM Loops: Not a low-overhead loop.\n");
return Changed;
+ } if (!(Start && Dec && End)) {
+ report_fatal_error("Failed to find all loop components");
}
if (!End->getOperand(1).isMBB() ||
break;
}
+ unsigned Opc = Start->getOpcode() == ARM::t2DoLoopStart ?
+ ARM::t2DLS : ARM::t2WLS;
MachineInstrBuilder MIB =
- BuildMI(*MBB, InsertPt, InsertPt->getDebugLoc(), TII->get(ARM::t2DLS));
- if (InsertPt != Start)
- InsertPt->eraseFromParent();
+ BuildMI(*MBB, InsertPt, InsertPt->getDebugLoc(), TII->get(Opc));
MIB.addDef(ARM::LR);
MIB.add(Start->getOperand(0));
- LLVM_DEBUG(dbgs() << "ARM Loops: Inserted DLS: " << *MIB);
+ if (Opc == ARM::t2WLS)
+ MIB.add(Start->getOperand(1));
+
+ if (InsertPt != Start)
+ InsertPt->eraseFromParent();
Start->eraseFromParent();
+ LLVM_DEBUG(dbgs() << "ARM Loops: Inserted start: " << *MIB);
+ return &*MIB;
};
// Combine the LoopDec and LoopEnd instructions into LE(TP).
MIB.add(End->getOperand(1));
LLVM_DEBUG(dbgs() << "ARM Loops: Inserted LE: " << *MIB);
- // If there is a branch after loop end, which branches to the fallthrough
- // block, remove the branch.
- MachineBasicBlock *Latch = End->getParent();
- MachineInstr *Terminator = &Latch->instr_back();
- if (End != Terminator) {
- MachineBasicBlock *Exit = ML->getExitBlock();
- if (Latch->isLayoutSuccessor(Exit)) {
- LLVM_DEBUG(dbgs() << "ARM Loops: Removing loop exit branch: "
- << *Terminator);
- Terminator->eraseFromParent();
- }
- }
End->eraseFromParent();
Dec->eraseFromParent();
+ return &*MIB;
};
// Generate a subs, or sub and cmp, and a branch instead of an LE.
// TODO: Check flags so that we can possibly generate a subs.
+ // FIXME: Need to check that we're not trashing the CPSR when generating
+ // the cmp.
auto ExpandBranch = [this](MachineInstr *Dec, MachineInstr *End) {
LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to sub, cmp, br.\n");
// Create sub
Dec->eraseFromParent();
};
+ // WhileLoopStart holds the exit block, so produce a cmp lr, 0 and then a
+ // beq that branches to the exit block.
+ // FIXME: Need to check that we're not trashing the CPSR when generating the
+ // cmp. We could also try to generate a cbz if the value in LR is also in
+ // another low register.
+ auto ExpandStart = [this](MachineInstr *MI) {
+   // Expand a while-loop-start pseudo into an explicit compare and conditional
+   // branch, both inserted immediately before MI. MI itself is not erased here.
+   MachineBasicBlock &Block = *MI->getParent();
+
+   // Emit t2CMPri on LR against 0.
+   // NOTE(review): this compares LR rather than MI's operand 0 - confirm that
+   // LR already holds the element count at this point.
+   BuildMI(Block, MI, MI->getDebugLoc(), TII->get(ARM::t2CMPri))
+       .addReg(ARM::LR)
+       .addImm(0)
+       .addImm(ARMCC::AL)
+       .addReg(ARM::CPSR);
+
+   // Emit t2Bcc with the EQ condition to the target carried by MI.
+   BuildMI(Block, MI, MI->getDebugLoc(), TII->get(ARM::t2Bcc))
+       .add(MI->getOperand(1)) // branch target
+       .addImm(ARMCC::EQ)      // condition code
+       .addReg(ARM::CPSR);
+ };
+
+ // TODO: We should be able to automatically remove these branches before we
+ // get here - probably by teaching analyzeBranch about the pseudo
+ // instructions.
+ // If there is an unconditional branch, after I, that just branches to the
+ // next block, remove it.
+ auto RemoveDeadBranch = [](MachineInstr *I) {
+   // Erase the last instruction of I's block when it is an unconditional
+   // branch, distinct from I, whose target is the block's layout successor.
+   MachineBasicBlock *Parent = I->getParent();
+   MachineInstr *Last = &Parent->instr_back();
+   if (!Last->isUnconditionalBranch() || I == Last)
+     return;
+
+   MachineBasicBlock *Target = Last->getOperand(0).getMBB();
+   if (!Parent->isLayoutSuccessor(Target))
+     return;
+
+   LLVM_DEBUG(dbgs() << "ARM Loops: Removing branch: " << *Last);
+   Last->eraseFromParent();
+ };
+
if (Revert) {
- Start->eraseFromParent();
+ if (Start->getOpcode() == ARM::t2WhileLoopStart)
+ ExpandStart(Start);
ExpandBranch(Dec, End);
+ Start->eraseFromParent();
} else {
- ExpandLoopStart(ML, Start);
- ExpandLoopEnd(ML, Dec, End);
+ Start = ExpandLoopStart(ML, Start);
+ RemoveDeadBranch(Start);
+ End = ExpandLoopEnd(ML, Dec, End);
+ RemoveDeadBranch(End);
}
}
default:
break;
case Intrinsic::set_loop_iterations:
+ case Intrinsic::test_set_loop_iterations:
case Intrinsic::loop_decrement:
case Intrinsic::loop_decrement_reg:
return true;
LLVMContext &C = L->getHeader()->getContext();
HWLoopInfo.CounterInReg = true;
HWLoopInfo.IsNestingLegal = false;
+ HWLoopInfo.PerformEntryTest = true;
HWLoopInfo.CountType = Type::getInt32Ty(C);
HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1);
return true;
--- /dev/null
+; RUN: llc -mtriple=thumbv8.1m.main -disable-arm-loloops=false -mattr=+lob -stop-after=arm-low-overhead-loops --verify-machineinstrs %s -o - | FileCheck %s
+; RUN: llc -mtriple=thumbv8.1m.main -disable-arm-loloops=false -mattr=+lob -stop-after=arm-low-overhead-loops --verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK-GLOBAL
+
+; Not implemented as a MIR test so that changes to the generic HardwareLoops
+; pass can also be tested. These functions have been taken from
+; Transforms/HardwareLoops/loop-guards.ll, where several test.set intrinsics
+; are generated, but only one (ne_trip_count) gets generated here.
+; Simplifications result in the icmps changing, and maybe also the CFG. So,
+; TODO: Teach the HardwareLoops pass some better pattern recognition.
+
+; CHECK-GLOBAL-NOT: DoLoopStart
+; CHECK-GLOBAL-NOT: WhileLoopStart
+; CHECK-GLOBAL-NOT: LoopEnd
+
+; CHECK: ne_and_guard
+; CHECK: body:
+; CHECK: bb.0.entry:
+; CHECK: t2CMPri renamable $lr, 0
+; CHECK: tBcc %bb.3
+; CHECK: bb.1.while.body.preheader:
+; CHECK: $lr = t2DLS renamable $lr
+; CHECK: bb.2.while.body:
+; CHECK: $lr = t2LEUpdate renamable $lr, %bb.2
+define void @ne_and_guard(i1 zeroext %t1, i1 zeroext %t2, i32* nocapture %a, i32* nocapture readonly %b, i32 %N) {
+entry:
+ %brmerge.demorgan = and i1 %t1, %t2
+ %cmp6 = icmp ne i32 %N, 0
+ %or.cond = and i1 %brmerge.demorgan, %cmp6
+ br i1 %or.cond, label %while.body, label %if.end
+
+while.body: ; preds = %while.body, %entry
+ %i.09 = phi i32 [ %inc, %while.body ], [ 0, %entry ]
+ %a.addr.08 = phi i32* [ %incdec.ptr3, %while.body ], [ %a, %entry ]
+ %b.addr.07 = phi i32* [ %incdec.ptr, %while.body ], [ %b, %entry ]
+ %incdec.ptr = getelementptr inbounds i32, i32* %b.addr.07, i32 1
+ %tmp = load i32, i32* %b.addr.07, align 4
+ %incdec.ptr3 = getelementptr inbounds i32, i32* %a.addr.08, i32 1
+ store i32 %tmp, i32* %a.addr.08, align 4
+ %inc = add nuw i32 %i.09, 1
+ %exitcond = icmp eq i32 %inc, %N
+ br i1 %exitcond, label %if.end, label %while.body
+
+if.end: ; preds = %while.body, %entry
+ ret void
+}
+
+; TODO: This could generate WLS
+; CHECK: ne_preheader
+; CHECK: body:
+; CHECK: bb.0.entry:
+; CHECK: t2CMPri renamable $lr, 0
+; CHECK: tBcc %bb.3
+; CHECK: bb.1.while.body.preheader:
+; CHECK: $lr = t2DLS renamable $lr
+; CHECK: bb.2.while.body:
+; CHECK: $lr = t2LEUpdate renamable $lr, %bb.2
+define void @ne_preheader(i1 zeroext %t1, i1 zeroext %t2, i32* nocapture %a, i32* nocapture readonly %b, i32 %N) {
+entry:
+ %brmerge.demorgan = and i1 %t1, %t2
+ br i1 %brmerge.demorgan, label %while.preheader, label %if.end
+
+while.preheader: ; preds = %entry
+ %cmp = icmp ne i32 %N, 0
+ br i1 %cmp, label %while.body, label %if.end
+
+while.body: ; preds = %while.body, %while.preheader
+ %i.09 = phi i32 [ %inc, %while.body ], [ 0, %while.preheader ]
+ %a.addr.08 = phi i32* [ %incdec.ptr3, %while.body ], [ %a, %while.preheader ]
+ %b.addr.07 = phi i32* [ %incdec.ptr, %while.body ], [ %b, %while.preheader ]
+ %incdec.ptr = getelementptr inbounds i32, i32* %b.addr.07, i32 1
+ %tmp = load i32, i32* %b.addr.07, align 4
+ %incdec.ptr3 = getelementptr inbounds i32, i32* %a.addr.08, i32 1
+ store i32 %tmp, i32* %a.addr.08, align 4
+ %inc = add nuw i32 %i.09, 1
+ %exitcond = icmp eq i32 %inc, %N
+ br i1 %exitcond, label %if.end, label %while.body
+
+if.end: ; preds = %while.body, %while.preheader, %entry
+ ret void
+}
+
+; TODO: This could generate WLS
+; CHECK: eq_preheader
+; CHECK: body:
+; CHECK: bb.0.entry:
+; CHECK: t2CMPri renamable $lr, 0
+; CHECK: tBcc %bb.3
+; CHECK: bb.1.while.body.preheader:
+; CHECK: $lr = t2DLS renamable $lr
+; CHECK: bb.2.while.body:
+; CHECK: $lr = t2LEUpdate renamable $lr, %bb.2
+define void @eq_preheader(i1 zeroext %t1, i1 zeroext %t2, i32* nocapture %a, i32* nocapture readonly %b, i32 %N) {
+entry:
+ %brmerge.demorgan = and i1 %t1, %t2
+ br i1 %brmerge.demorgan, label %while.preheader, label %if.end
+
+while.preheader: ; preds = %entry
+ %cmp = icmp eq i32 %N, 0
+ br i1 %cmp, label %if.end, label %while.body
+
+while.body: ; preds = %while.body, %while.preheader
+ %i.09 = phi i32 [ %inc, %while.body ], [ 0, %while.preheader ]
+ %a.addr.08 = phi i32* [ %incdec.ptr3, %while.body ], [ %a, %while.preheader ]
+ %b.addr.07 = phi i32* [ %incdec.ptr, %while.body ], [ %b, %while.preheader ]
+ %incdec.ptr = getelementptr inbounds i32, i32* %b.addr.07, i32 1
+ %tmp = load i32, i32* %b.addr.07, align 4
+ %incdec.ptr3 = getelementptr inbounds i32, i32* %a.addr.08, i32 1
+ store i32 %tmp, i32* %a.addr.08, align 4
+ %inc = add nuw i32 %i.09, 1
+ %exitcond = icmp eq i32 %inc, %N
+ br i1 %exitcond, label %if.end, label %while.body
+
+if.end: ; preds = %while.body, %while.preheader, %entry
+ ret void
+}
+
+; TODO: This could generate WLS
+; CHECK: ne_prepreheader
+; CHECK: body:
+; CHECK: bb.0.entry:
+; CHECK: t2CMPri renamable $lr, 0
+; CHECK: tBcc %bb.3
+; CHECK: bb.1.while.body.preheader:
+; CHECK: $lr = t2DLS renamable $lr
+; CHECK: bb.2.while.body:
+; CHECK: $lr = t2LEUpdate renamable $lr, %bb.2
+define void @ne_prepreheader(i1 zeroext %t1, i1 zeroext %t2, i32* nocapture %a, i32* nocapture readonly %b, i32 %N) {
+entry:
+ %cmp = icmp ne i32 %N, 0
+ br i1 %cmp, label %while.preheader, label %if.end
+
+while.preheader: ; preds = %entry
+ %brmerge.demorgan = and i1 %t1, %t2
+ br i1 %brmerge.demorgan, label %while.body, label %if.end
+
+while.body: ; preds = %while.body, %while.preheader
+ %i.09 = phi i32 [ %inc, %while.body ], [ 0, %while.preheader ]
+ %a.addr.08 = phi i32* [ %incdec.ptr3, %while.body ], [ %a, %while.preheader ]
+ %b.addr.07 = phi i32* [ %incdec.ptr, %while.body ], [ %b, %while.preheader ]
+ %incdec.ptr = getelementptr inbounds i32, i32* %b.addr.07, i32 1
+ %tmp = load i32, i32* %b.addr.07, align 4
+ %incdec.ptr3 = getelementptr inbounds i32, i32* %a.addr.08, i32 1
+ store i32 %tmp, i32* %a.addr.08, align 4
+ %inc = add nuw i32 %i.09, 1
+ %exitcond = icmp eq i32 %inc, %N
+ br i1 %exitcond, label %if.end, label %while.body
+
+if.end: ; preds = %while.body, %while.preheader, %entry
+ ret void
+}
+
+; CHECK: be_ne
+; CHECK: body:
+; CHECK: bb.0.entry:
+; CHECK: $lr = t2DLS renamable $lr
+; CHECK: bb.1.do.body:
+; CHECK: $lr = t2LEUpdate renamable $lr, %bb.1
+define void @be_ne(i32* nocapture %a, i32* nocapture readonly %b, i32 %N) {
+entry:
+ %cmp = icmp ne i32 %N, 0
+ %sub = sub i32 %N, 1
+ %be = select i1 %cmp, i32 0, i32 %sub
+ %cmp.1 = icmp ne i32 %be, 0
+ br i1 %cmp.1, label %do.body, label %if.end
+
+do.body: ; preds = %do.body, %entry
+ %b.addr.0 = phi i32* [ %incdec.ptr, %do.body ], [ %b, %entry ]
+ %a.addr.0 = phi i32* [ %incdec.ptr3, %do.body ], [ %a, %entry ]
+ %i.0 = phi i32 [ %inc, %do.body ], [ 0, %entry ]
+ %incdec.ptr = getelementptr inbounds i32, i32* %b.addr.0, i32 1
+ %tmp = load i32, i32* %b.addr.0, align 4
+ %incdec.ptr3 = getelementptr inbounds i32, i32* %a.addr.0, i32 1
+ store i32 %tmp, i32* %a.addr.0, align 4
+ %inc = add nuw i32 %i.0, 1
+ %cmp.2 = icmp ult i32 %inc, %N
+ br i1 %cmp.2, label %do.body, label %if.end
+
+if.end: ; preds = %do.body, %entry
+ ret void
+}
+
+; TODO: Remove the tMOVr in the preheader!
+; CHECK: ne_trip_count
+; CHECK: body:
+; CHECK: bb.0.entry:
+; CHECK: $lr = t2WLS $r3, %bb.3
+; CHECK: bb.1.do.body.preheader:
+; CHECK: $lr = tMOVr
+; CHECK: bb.2.do.body:
+; CHECK: $lr = t2LEUpdate renamable $lr, %bb.2
+define void @ne_trip_count(i1 zeroext %t1, i32* nocapture %a, i32* nocapture readonly %b, i32 %N) {
+entry:
+ br label %do.body.preheader
+
+do.body.preheader:
+ %cmp = icmp ne i32 %N, 0
+ br i1 %cmp, label %do.body, label %if.end
+
+do.body:
+ %b.addr.0 = phi i32* [ %incdec.ptr, %do.body ], [ %b, %do.body.preheader ]
+ %a.addr.0 = phi i32* [ %incdec.ptr3, %do.body ], [ %a, %do.body.preheader ]
+ %i.0 = phi i32 [ %inc, %do.body ], [ 0, %do.body.preheader ]
+ %incdec.ptr = getelementptr inbounds i32, i32* %b.addr.0, i32 1
+ %tmp = load i32, i32* %b.addr.0, align 4
+ %incdec.ptr3 = getelementptr inbounds i32, i32* %a.addr.0, i32 1
+ store i32 %tmp, i32* %a.addr.0, align 4
+ %inc = add nuw i32 %i.0, 1
+ %cmp.1 = icmp ult i32 %inc, %N
+ br i1 %cmp.1, label %do.body, label %if.end
+
+if.end: ; preds = %do.body, %entry
+ ret void
+}
--- /dev/null
+# RUN: llc -mtriple=thumbv8.1m.main -mattr=+lob -run-pass=arm-low-overhead-loops --verify-machineinstrs %s -o - | FileCheck %s
+# CHECK: body:
+# CHECK: bb.0.entry:
+# CHECK: t2CMPri $lr, 0, 14
+# CHECK-NEXT: t2Bcc %bb.3, 0, $cpsr
+# CHECK-NEXT: tB %bb.1
+# CHECK: bb.1.do.body.preheader:
+# CHECK: $lr = tMOVr killed $r3
+# CHECK: bb.2.do.body:
+# CHECK: $lr = t2SUBri killed renamable $lr, 1, 14
+# CHECK-NEXT: t2CMPri $lr, 0, 14, $cpsr
+# CHECK-NEXT: t2Bcc %bb.2, 1, $cpsr
+# CHECK-NEXT: tB %bb.3, 14
+--- |
+ target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
+ target triple = "thumbv8.1m.main"
+
+ define void @ne_trip_count(i1 zeroext %t1, i32* nocapture %a, i32* nocapture readonly %b, i32 %N) #0 {
+ entry:
+ %cmp = icmp ne i32 %N, 0
+ %0 = call i1 @llvm.test.set.loop.iterations.i32(i32 %N)
+ br i1 %0, label %do.body.preheader, label %if.end
+
+ do.body.preheader: ; preds = %entry
+ br label %do.body
+
+ do.body: ; preds = %do.body.preheader, %do.body
+ %i.0 = phi i32 [ %inc, %do.body ], [ 0, %do.body.preheader ]
+ %1 = phi i32 [ %N, %do.body.preheader ], [ %2, %do.body ]
+ %scevgep = getelementptr i32, i32* %b, i32 %i.0
+ %scevgep1 = getelementptr i32, i32* %a, i32 %i.0
+ %size = call i32 @llvm.arm.space(i32 4096, i32 undef)
+ %tmp = load i32, i32* %scevgep, align 4
+ store i32 %tmp, i32* %scevgep1, align 4
+ %inc = add nuw i32 %i.0, 1
+ %2 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %1, i32 1)
+ %3 = icmp ne i32 %2, 0
+ br i1 %3, label %do.body, label %if.end
+
+ if.end: ; preds = %do.body, %entry
+ ret void
+ }
+
+ declare i32 @llvm.arm.space(i32, i32) #1
+ declare i1 @llvm.test.set.loop.iterations.i32(i32) #2
+ declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #2
+
+ attributes #0 = { "target-features"="+lob" }
+ attributes #1 = { nounwind "target-features"="+lob" }
+ attributes #2 = { noduplicate nounwind }
+ attributes #3 = { nounwind }
+
+...
+---
+name: ne_trip_count
+alignment: 1
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+failedISel: false
+tracksRegLiveness: false
+hasWinCFI: false
+registers: []
+liveins:
+ - { reg: '$r1', virtual-reg: '' }
+ - { reg: '$r2', virtual-reg: '' }
+ - { reg: '$r3', virtual-reg: '' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 8
+ offsetAdjustment: 0
+ maxAlignment: 4
+ adjustsStack: false
+ hasCalls: false
+ stackProtector: ''
+ maxCallFrameSize: 0
+ cvBytesOfCalleeSavedRegisters: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+ localFrameSize: 0
+ savePoint: ''
+ restorePoint: ''
+fixedStack: []
+stack:
+ - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4,
+ stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false,
+ debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+ - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4,
+ stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true,
+ debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+callSites: []
+constants: []
+machineFunctionInfo: {}
+body: |
+ bb.0.entry:
+ successors: %bb.1(0x40000000), %bb.3(0x40000000)
+
+ frame-setup tPUSH 14, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp
+ frame-setup CFI_INSTRUCTION def_cfa_offset 8
+ frame-setup CFI_INSTRUCTION offset $lr, -4
+ frame-setup CFI_INSTRUCTION offset $r7, -8
+ t2WhileLoopStart $r3, %bb.3
+ tB %bb.1, 14, $noreg
+
+ bb.1.do.body.preheader:
+ successors: %bb.2(0x80000000)
+
+ $lr = tMOVr killed $r3, 14, $noreg
+ renamable $r0, dead $cpsr = tMOVi8 0, 14, $noreg
+
+ bb.2.do.body:
+ successors: %bb.2(0x7c000000), %bb.3(0x04000000)
+
+ dead renamable $r3 = SPACE 4096, undef renamable $r0
+ renamable $r3 = t2LDRs renamable $r2, renamable $r0, 2, 14, $noreg :: (load 4 from %ir.scevgep)
+ t2STRs killed renamable $r3, renamable $r1, renamable $r0, 2, 14, $noreg :: (store 4 into %ir.scevgep1)
+ renamable $r0, dead $cpsr = nuw tADDi8 killed renamable $r0, 1, 14, $noreg
+ renamable $lr = t2LoopDec killed renamable $lr, 1
+ t2LoopEnd renamable $lr, %bb.2
+ tB %bb.3, 14, $noreg
+
+ bb.3.if.end:
+ tPOP_RET 14, $noreg, def $r7, def $pc
+
+...
--- /dev/null
+# RUN: llc -mtriple=thumbv8.1m.main -mattr=+lob %s -run-pass=arm-low-overhead-loops --verify-machineinstrs -o - | FileCheck %s
+
+# TODO: Remove the lr = tMOVr
+# CHECK: body:
+# CHECK: $lr = t2WLS $r2, [[EXIT:%bb[.0-9]+]]
+# CHECK: [[PREHEADER:bb[.0-9a-z]+]]:
+# CHECK: $lr = tMOVr killed $r2
+# CHECK: [[BODY:bb[.0-9a-z]+]]:
+# CHECK: $lr = t2LEUpdate renamable $lr
+
+--- |
+ target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
+ target triple = "thumbv8.1m.main-arm-unknown"
+
+ ; Function Attrs: norecurse nounwind optsize
+ define dso_local arm_aapcscc void @copy(i16* nocapture %a, i16* nocapture readonly %b, i32 %N) {
+ entry:
+ %cmp4 = icmp eq i32 %N, 0
+ %0 = call i1 @llvm.test.set.loop.iterations.i32(i32 %N)
+ br i1 %0, label %while.body.preheader, label %while.end
+
+ while.body.preheader: ; preds = %entry
+ br label %while.body
+
+ while.body: ; preds = %while.body, %while.body.preheader
+ %a.addr.06 = phi i16* [ %incdec.ptr1, %while.body ], [ %a, %while.body.preheader ]
+ %b.addr.05 = phi i16* [ %incdec.ptr, %while.body ], [ %b, %while.body.preheader ]
+ %1 = phi i32 [ %N, %while.body.preheader ], [ %3, %while.body ]
+ %incdec.ptr = getelementptr inbounds i16, i16* %b.addr.05, i32 1
+ %2 = load i16, i16* %b.addr.05, align 2, !tbaa !3
+ %incdec.ptr1 = getelementptr inbounds i16, i16* %a.addr.06, i32 1
+ store i16 %2, i16* %a.addr.06, align 2, !tbaa !3
+ %3 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %1, i32 1)
+ %4 = icmp ne i32 %3, 0
+ br i1 %4, label %while.body, label %while.end
+
+ while.end: ; preds = %while.body, %entry
+ ret void
+ }
+
+ declare i1 @llvm.test.set.loop.iterations.i32(i32) #1
+ declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #1
+
+ attributes #1 = { noduplicate nounwind }
+ attributes #2 = { nounwind }
+
+ !llvm.module.flags = !{!0, !1}
+
+ !0 = !{i32 1, !"wchar_size", i32 4}
+ !1 = !{i32 1, !"min_enum_size", i32 4}
+ !3 = !{!4, !4, i64 0}
+ !4 = !{!"short", !5, i64 0}
+ !5 = !{!"omnipotent char", !6, i64 0}
+ !6 = !{!"Simple C/C++ TBAA"}
+
+...
+---
+name: copy
+alignment: 1
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+failedISel: false
+tracksRegLiveness: false
+hasWinCFI: false
+registers: []
+liveins:
+ - { reg: '$r0', virtual-reg: '' }
+ - { reg: '$r1', virtual-reg: '' }
+ - { reg: '$r2', virtual-reg: '' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 8
+ offsetAdjustment: 0
+ maxAlignment: 4
+ adjustsStack: false
+ hasCalls: false
+ stackProtector: ''
+ maxCallFrameSize: 0
+ cvBytesOfCalleeSavedRegisters: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+ localFrameSize: 0
+ savePoint: ''
+ restorePoint: ''
+fixedStack: []
+stack:
+ - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4,
+ stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false,
+ debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+ - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4,
+ stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true,
+ debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+constants: []
+machineFunctionInfo: {}
+body: |
+ bb.0.entry:
+ successors: %bb.1(0x40000000), %bb.3(0x40000000)
+
+ frame-setup tPUSH 14, $noreg, $r7, killed $lr, implicit-def $sp, implicit $sp
+ frame-setup CFI_INSTRUCTION def_cfa_offset 8
+ frame-setup CFI_INSTRUCTION offset $lr, -4
+ frame-setup CFI_INSTRUCTION offset $r7, -8
+ $r7 = frame-setup tMOVr $sp, 14, $noreg
+ frame-setup CFI_INSTRUCTION def_cfa_register $r7
+ t2WhileLoopStart $r2, %bb.3
+ tB %bb.1, 14, $noreg
+
+ bb.1.while.body.preheader:
+ successors: %bb.2(0x80000000)
+
+ $lr = tMOVr killed $r2, 14, $noreg
+
+ bb.2.while.body:
+ successors: %bb.2(0x7c000000), %bb.3(0x04000000)
+
+ renamable $r2, renamable $r1 = t2LDRH_POST killed renamable $r1, 2, 14, $noreg :: (load 2 from %ir.b.addr.05, !tbaa !3)
+ early-clobber renamable $r0 = t2STRH_POST killed renamable $r2, killed renamable $r0, 2, 14, $noreg :: (store 2 into %ir.a.addr.06, !tbaa !3)
+ renamable $lr = t2LoopDec killed renamable $lr, 1
+ t2LoopEnd renamable $lr, %bb.2
+ tB %bb.3, 14, $noreg
+
+ bb.3.while.end:
+ tPOP_RET 14, $noreg, def $r7, def $pc
+
+...
@g = common local_unnamed_addr global i32* null, align 4
; CHECK-LABEL: do_with_i32_urem
+; CHECK: entry:
+; CHECK: [[TEST:%[^ ]+]] = call i1 @llvm.test.set.loop.iterations.i32(i32 %n)
+; CHECK: br i1 [[TEST]], label %while.body.preheader, label %while.end
+
; CHECK: while.body.preheader:
-; CHECK: call void @llvm.set.loop.iterations.i32(i32 %n)
; CHECK-NEXT: br label %while.body
+; CHECK: while.body:
; CHECK: [[REM:%[^ ]+]] = phi i32 [ %n, %while.body.preheader ], [ [[LOOP_DEC:%[^ ]+]], %while.body ]
; CHECK: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1)
; CHECK: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
}
; CHECK-LABEL: do_with_i32_srem
+; CHECK: entry:
+; CHECK: [[TEST:%[^ ]+]] = call i1 @llvm.test.set.loop.iterations.i32(i32 %n)
+; CHECK: br i1 [[TEST]], label %while.body.preheader, label %while.end
+
; CHECK: while.body.preheader:
-; CHECK: call void @llvm.set.loop.iterations.i32(i32 %n)
; CHECK-NEXT: br label %while.body
+; CHECK: while.body:
; CHECK: [[REM:%[^ ]+]] = phi i32 [ %n, %while.body.preheader ], [ [[LOOP_DEC:%[^ ]+]], %while.body ]
; CHECK: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1)
; CHECK: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
}
; CHECK-LABEL: do_with_i32_udiv
+; CHECK: entry:
+; CHECK: [[TEST:%[^ ]+]] = call i1 @llvm.test.set.loop.iterations.i32(i32 %n)
+; CHECK: br i1 [[TEST]], label %while.body.preheader, label %while.end
+
; CHECK: while.body.preheader:
-; CHECK: call void @llvm.set.loop.iterations.i32(i32 %n)
; CHECK-NEXT: br label %while.body
+; CHECK: while.body:
; CHECK: [[REM:%[^ ]+]] = phi i32 [ %n, %while.body.preheader ], [ [[LOOP_DEC:%[^ ]+]], %while.body ]
; CHECK: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1)
; CHECK: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
}
; CHECK-LABEL: do_with_i32_sdiv
+; CHECK: entry:
+; CHECK: [[TEST:%[^ ]+]] = call i1 @llvm.test.set.loop.iterations.i32(i32 %n)
+; CHECK: br i1 [[TEST]], label %while.body.preheader, label %while.end
+
; CHECK: while.body.preheader:
-; CHECK: call void @llvm.set.loop.iterations.i32(i32 %n)
; CHECK-NEXT: br label %while.body
+; CHECK: while.body:
; CHECK: [[REM:%[^ ]+]] = phi i32 [ %n, %while.body.preheader ], [ [[LOOP_DEC:%[^ ]+]], %while.body ]
; CHECK: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1)
; CHECK: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
}
; CHECK-LABEL: do_with_i64_urem
-; CHECK-NOT: llvm.set.loop.iterations
+; CHECK-NOT: llvm.{{.*}}.loop.iterations
; CHECK-NOT: llvm.loop.decrement
define i64 @do_with_i64_urem(i32 %n) {
entry:
}
; CHECK-LABEL: do_with_i64_srem
-; CHECK-NOT: llvm.set.loop.iterations
+; CHECK-NOT: llvm.{{.*}}.loop.iterations
; CHECK-NOT: llvm.loop.decrement
define i64 @do_with_i64_srem(i32 %n) {
entry:
}
; CHECK-LABEL: do_with_i64_udiv
-; CHECK-NOT: llvm.set.loop.iterations
+; CHECK-NOT: llvm.{{.*}}.loop.iterations
; CHECK-NOT: llvm.loop.decrement
define i64 @do_with_i64_udiv(i32 %n) {
entry:
}
; CHECK-LABEL: do_with_i64_sdiv
-; CHECK-NOT: call void @llvm.set.loop.iterations
+; CHECK-NOT: call void @llvm.{{.*}}.loop.iterations
; CHECK-NOT: call i32 @llvm.loop.decrement
define i64 @do_with_i64_sdiv(i32 %n) {
entry:
; RUN: opt -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+soft-float -hardware-loops -disable-arm-loloops=false %s -S -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-SOFT
; CHECK-LABEL: test_fptosi
-; CHECK: while.body.lr.ph:
+; CHECK-SOFT-NOT: call void @llvm.set.loop.iterations
+
+; CHECK: entry:
; CHECK-FP: [[CMP:%[^ ]+]] = icmp ugt i32 %n, 1
; CHECK-FP: [[COUNT:%[^ ]+]] = select i1 [[CMP]], i32 %n, i32 1
+
+; CHECK: while.body.lr.ph:
; CHECK-FP: call void @llvm.set.loop.iterations.i32(i32 [[COUNT]])
; CHECK-FP-NEXT: br label %while.body
; CHECK-FP: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
; CHECK-FP: br i1 [[CMP]], label %while.body, label %cleanup.loopexit
-; CHECK-SOFT-NOT: call void @llvm.set.loop.iterations
-
define void @test_fptosi(i32 %n, i32** %g, double** %d) {
entry:
%n.off = add i32 %n, -1
}
; CHECK-LABEL: test_fptoui
-; CHECK-FP: while.body.lr.ph:
+; CHECK: entry:
; CHECK-FP: [[CMP:%[^ ]+]] = icmp ugt i32 %n, 1
; CHECK-FP: [[COUNT:%[^ ]+]] = select i1 [[CMP]], i32 %n, i32 1
+; CHECK-FP: while.body.lr.ph:
; CHECK-FP: call void @llvm.set.loop.iterations.i32(i32 [[COUNT]])
; CHECK-FP-NEXT: br label %while.body
}
; CHECK-LABEL: load_store_float
+; CHECK: entry:
+; CHECK: [[CMP:%[^ ]+]] = icmp ugt i32 %n, 1
+; CHECK: [[COUNT:%[^ ]+]] = select i1 [[CMP]], i32 %n, i32 1
; CHECK: while.body.lr.ph:
-; CHECK: [[CMP:%[^ ]+]] = icmp ugt i32 %n, 1
-; CHECK: [[COUNT:%[^ ]+]] = select i1 [[CMP]], i32 %n, i32 1
-; CHECK: call void @llvm.set.loop.iterations.i32(i32 [[COUNT]])
+; CHECK: call void @llvm.set.loop.iterations.i32(i32 [[COUNT]])
; CHECK-NEXT: br label %while.body
; CHECK: [[REM:%[^ ]+]] = phi i32 [ [[COUNT]], %while.body.lr.ph ], [ [[LOOP_DEC:%[^ ]+]], %if.end4 ]
}
; CHECK-LABEL: fp_add
-; CHECK: while.body.lr.ph:
-
; CHECK-SOFT-NOT: call void @llvm.set.loop.iterations
-
+; CHECK: entry:
; CHECK-FP: [[CMP:%[^ ]+]] = icmp ugt i32 %n, 1
; CHECK-FP: [[COUNT:%[^ ]+]] = select i1 [[CMP]], i32 %n, i32 1
+; CHECK: while.body.lr.ph:
; CHECK-FP: call void @llvm.set.loop.iterations.i32(i32 [[COUNT]])
; CHECK: br label %while.body
; RUN: opt -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=-lob -hardware-loops %s -S -o - | FileCheck %s --check-prefix=DISABLED
; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -disable-arm-loloops=false %s -o - | FileCheck %s --check-prefix=CHECK-LLC
-; DISABLED-NOT: llvm.set.loop.iterations
+; DISABLED-NOT: llvm.{{.*}}.loop.iterations
; DISABLED-NOT: llvm.loop.decrement
@g = common local_unnamed_addr global i32* null, align 4
}
; CHECK-LABEL: do_inc1
+; CHECK: entry:
+; CHECK: [[TEST:%[^ ]+]] = call i1 @llvm.test.set.loop.iterations.i32(i32 %n)
+; CHECK: br i1 [[TEST]], label %while.body.lr.ph, label %while.end
+
; CHECK: while.body.lr.ph:
-; CHECK: call void @llvm.set.loop.iterations.i32(i32 %n)
-; CHECK-NEXT: br label %while.body
+; CHECK: br label %while.body
; CHECK: [[REM:%[^ ]+]] = phi i32 [ %n, %while.body.lr.ph ], [ [[LOOP_DEC:%[^ ]+]], %while.body ]
; CHECK: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1)
; CHECK: br i1 [[CMP]], label %while.body, label %while.end.loopexit
; CHECK-LLC-LABEL:do_inc1:
-; CHECK-LLC: dls lr,
+; CHECK-LLC: wls lr, {{.*}}, [[LOOP_EXIT:\.LBB[0-9_]+]]
; CHECK-LLC-NOT: mov lr,
; CHECK-LLC: [[LOOP_HEADER:\.LBB[0-9_]+]]:
; CHECK-LLC: le lr, [[LOOP_HEADER]]
; CHECK-LLC-NOT: b [[LOOP_EXIT:\.LBB[0-9_]+]]
-; CHECK-LLC: [[LOOP_EXIT:\.LBB[0-9_]+]]:
+; CHECK-LLC: [[LOOP_EXIT]]:
define i32 @do_inc1(i32 %n) {
entry:
}
; CHECK-LABEL: do_inc2
-; CHECK: while.body.lr.ph:
+; CHECK: entry:
; CHECK: [[ROUND:%[^ ]+]] = add i32 %n, -1
; CHECK: [[HALVE:%[^ ]+]] = lshr i32 [[ROUND]], 1
; CHECK: [[COUNT:%[^ ]+]] = add nuw i32 [[HALVE]], 1
-; CHECK: call void @llvm.set.loop.iterations.i32(i32 [[COUNT]])
-; CHECK-NEXT: br label %while.body
-; CHECK: [[REM:%[^ ]+]] = phi i32 [ [[COUNT]], %while.body.lr.ph ], [ [[LOOP_DEC:%[^ ]+]], %while.body ]
-; CHECK: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1)
-; CHECK: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
-; CHECK: br i1 [[CMP]], label %while.body, label %while.end.loopexit
+; CHECK: while.body.lr.ph:
+; CHECK: call void @llvm.set.loop.iterations.i32(i32 [[COUNT]])
+; CHECK: br label %while.body
+; CHECK: while.body:
+; CHECK: [[REM:%[^ ]+]] = phi i32 [ [[COUNT]], %while.body.lr.ph ], [ [[LOOP_DEC:%[^ ]+]], %while.body ]
+; CHECK: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1)
+; CHECK: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
+; CHECK: br i1 [[CMP]], label %while.body, label %while.end.loopexit
; CHECK-LLC: do_inc2:
; CHECK-LLC-NOT: mov lr,
-; CHECK-LLC: dls lr,
+; CHECK-LLC: dls lr, {{.*}}
; CHECK-LLC-NOT: mov lr,
; CHECK-LLC: [[LOOP_HEADER:\.LBB[0-9._]+]]:
; CHECK-LLC: le lr, [[LOOP_HEADER]]
-; CHECK-LLC-NOT: b [[LOOP_EXIT:\.LBB[0-9._]+]]
-; CHECK-LLC: [[LOOP_EXIT:\.LBB[0-9_]+]]:
define i32 @do_inc2(i32 %n) {
entry:
; CHECK-LABEL: do_dec2
-; CHECK: while.body.lr.ph:
+; CHECK: entry:
; CHECK: [[ROUND:%[^ ]+]] = add i32 %n, 1
; CHECK: [[CMP:%[^ ]+]] = icmp slt i32 %n, 2
; CHECK: [[SMIN:%[^ ]+]] = select i1 [[CMP]], i32 %n, i32 2
; CHECK: [[SUB:%[^ ]+]] = sub i32 [[ROUND]], [[SMIN]]
; CHECK: [[HALVE:%[^ ]+]] = lshr i32 [[SUB]], 1
; CHECK: [[COUNT:%[^ ]+]] = add nuw i32 [[HALVE]], 1
+
+; CHECK: while.body.lr.ph:
; CHECK: call void @llvm.set.loop.iterations.i32(i32 [[COUNT]])
-; CHECK-NEXT: br label %while.body
+; CHECK: br label %while.body
; CHECK: [[REM:%[^ ]+]] = phi i32 [ [[COUNT]], %while.body.lr.ph ], [ [[LOOP_DEC:%[^ ]+]], %while.body ]
; CHECK: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1)
; CHECK-LLC: do_dec2
; CHECK-LLC-NOT: mov lr,
-; CHECK-LLC: dls lr,
+; CHECK-LLC: dls lr, {{.*}}
; CHECK-LLC-NOT: mov lr,
; CHECK-LLC: [[LOOP_HEADER:\.LBB[0-9_]+]]:
; CHECK-LLC: le lr, [[LOOP_HEADER]]
; CHECK-LLC-NOT: b .
-; CHECK-LLC: @ %while.end
define i32 @do_dec2(i32 %n) {
entry:
%cmp6 = icmp sgt i32 %n, 0
ret i32 0
}
+; CHECK-LABEL: pre_existing_test_set
+; CHECK: call i1 @llvm.test.set.loop.iterations
+; CHECK-NOT: llvm.set{{.*}}.loop.iterations
+; CHECK: call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %0, i32 1)
+; CHECK-NOT: call i32 @llvm.loop.decrement.reg
+; Input that already contains the hardware-loop intrinsics: entry calls
+; llvm.test.set.loop.iterations on %n and branches on the result, and the
+; loop latch branches on the value returned by llvm.loop.decrement.reg.
+define i32 @pre_existing_test_set(i32 %n, i32* nocapture %p, i32* nocapture readonly %q) {
+entry:
+ %guard = call i1 @llvm.test.set.loop.iterations.i32(i32 %n)
+ br i1 %guard, label %while.preheader, label %while.end
+
+while.preheader: ; preds = %entry
+ br label %while.body
+
+while.body: ; preds = %while.body, %while.preheader
+ %q.addr.05 = phi i32* [ %incdec.ptr, %while.body ], [ %q, %while.preheader ]
+ %p.addr.04 = phi i32* [ %incdec.ptr1, %while.body ], [ %p, %while.preheader ]
+ ; %0 is the remaining count: %n on loop entry, then the decremented value %2.
+ %0 = phi i32 [ %n, %while.preheader ], [ %2, %while.body ]
+ ; Copy one i32 from the current %q pointer to the current %p pointer and
+ ; advance both pointers by one element.
+ %incdec.ptr = getelementptr inbounds i32, i32* %q.addr.05, i32 1
+ %1 = load i32, i32* %q.addr.05, align 4
+ %incdec.ptr1 = getelementptr inbounds i32, i32* %p.addr.04, i32 1
+ store i32 %1, i32* %p.addr.04, align 4
+ ; Decrement the counter by 1 and keep looping while the result is non-zero.
+ %2 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %0, i32 1)
+ %3 = icmp ne i32 %2, 0
+ br i1 %3, label %while.body, label %while.end
+
+while.end: ; preds = %while.body, %entry
+ ret i32 0
+}
+
; CHECK-LABEL: pre_existing_inner
; CHECK-NOT: llvm.set.loop.iterations
; CHECK: while.cond1.preheader.us:
}
; CHECK-LABEL: search
+; CHECK: entry:
+; CHECK: [[TEST:%[^ ]+]] = call i1 @llvm.test.set.loop.iterations.i32(i32 %N)
+; CHECK: br i1 [[TEST]], label %for.body.preheader, label %for.cond.cleanup
; CHECK: for.body.preheader:
-; CHECK: call void @llvm.set.loop.iterations.i32(i32 %N)
-; CHECK: br label %for.body
+; CHECK: br label %for.body
; CHECK: for.body:
; CHECK: for.inc:
-; CHECK: [[LOOP_DEC:%[^ ]+]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32
-; CHECK: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
-; CHECK: br i1 [[CMP]], label %for.body, label %for.cond.cleanup
+; CHECK: [[LOOP_DEC:%[^ ]+]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32
+; CHECK: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
+; CHECK: br i1 [[CMP]], label %for.body, label %for.cond.cleanup
define i32 @search(i8* nocapture readonly %c, i32 %N) {
entry:
%cmp11 = icmp eq i32 %N, 0
; CHECK: call i32 @llvm.loop.decrement.reg.i32.i32.i32(
; TODO: We should be able to support the unrolled loop body.
-; CHECK-UNROLL-LABEL: unroll_inc_int:
+; CHECK-UNROLL-LABEL: unroll_inc_int
; CHECK-UNROLL: [[PREHEADER:.LBB[0-9_]+]]: @ %for.body.preheader
; CHECK-UNROLL-NOT: dls
; CHECK-UNROLL: [[LOOP:.LBB[0-9_]+]]: @ %for.body
; CHECK-UNROLL-NOT: le lr, [[LOOP]]
; CHECK-UNROLL: bne [[LOOP]]
-; CHECK-UNROLL: %for.body.epil.preheader
-; CHECK-UNROLL: dls
-; CHECK-UNROLL: %for.body.epil
-; CHECK-UNROLL: le
+; CHECK-UNROLL: wls lr, lr, [[EXIT:.LBB[0-9_]+]]
+; CHECK-UNROLL: [[EPIL:.LBB[0-9_]+]]:
+; CHECK-UNROLL: le lr, [[EPIL]]
+; CHECK-UNROLL-NEXT: [[EXIT]]
define void @unroll_inc_int(i32* nocapture %a, i32* nocapture readonly %b, i32* nocapture readonly %c, i32 %N) {
entry:
}
; CHECK-LABEL: unroll_inc_unsigned
-; CHECK: call void @llvm.set.loop.iterations.i32(i32 %N)
+; CHECK: call i1 @llvm.test.set.loop.iterations.i32(i32 %N)
; CHECK: call i32 @llvm.loop.decrement.reg.i32.i32.i32(
; CHECK-LLC-LABEL: unroll_inc_unsigned:
-; CHECK-LLC: dls lr, [[COUNT:r[0-9]+]]
-; CHECK-LLC: le lr
+; CHECK-LLC: wls lr, r3, [[EXIT:.LBB[0-9_]+]]
+; CHECK-LLC: [[HEADER:.LBB[0-9_]+]]:
+; CHECK-LLC: le lr, [[HEADER]]
+; CHECK-LLC-NEXT: [[EXIT]]:
; TODO: We should be able to support the unrolled loop body.
-; CHECK-UNROLL-LABEL: unroll_inc_unsigned:
+; CHECK-UNROLL-LABEL: unroll_inc_unsigned
; CHECK-UNROLL: [[PREHEADER:.LBB[0-9_]+]]: @ %for.body.preheader
; CHECK-UNROLL-NOT: dls
; CHECK-UNROLL: [[LOOP:.LBB[0-9_]+]]: @ %for.body
; CHECK-UNROLL-NOT: le lr, [[LOOP]]
; CHECK-UNROLL: bne [[LOOP]]
-; CHECK-UNROLL: %for.body.epil.preheader
-; CHECK-UNROLL: dls
-; CHECK-UNROLL: %for.body.epil
-; CHECK-UNROLL: le
+; CHECK-UNROLL: wls lr, lr, [[EPIL_EXIT:.LBB[0-9_]+]]
+; CHECK-UNROLL: [[EPIL:.LBB[0-9_]+]]:
+; CHECK-UNROLL: le lr, [[EPIL]]
+; CHECK-UNROLL: [[EPIL_EXIT]]:
+; CHECK-UNROLL: pop
define void @unroll_inc_unsigned(i32* nocapture %a, i32* nocapture readonly %b, i32* nocapture readonly %c, i32 %N) {
entry:
%cmp8 = icmp eq i32 %N, 0
; TODO: An unnecessary register is being held to hold COUNT, lr should just
; be used instead.
; CHECK-LLC-LABEL: unroll_dec_int:
-; CHECK-LLC: dls lr, [[COUNT:r[0-9]+]]
-; CHECK-LLC: subs [[COUNT]], #1
-; CHECK-LLC: le lr
-
-; CHECK-UNROLL-LABEL: unroll_dec_int
-; CHECK-UNROLL: dls lr
-; CHECK-UNROLL: le lr
-; CHECK-UNROLL: dls lr
-; CHECK-UNROLL: le lr
+; CHECK-LLC: dls lr, r3
+; CHECK-LLC-NOT: mov lr, r3
+; CHECK-LLC: [[HEADER:.LBB[0-9_]+]]:
+; CHECK-LLC: le lr, [[HEADER]]
+
+; CHECK-UNROLL-LABEL: unroll_dec_int:
+; CHECK-UNROLL: wls lr, {{.*}}, [[PROLOGUE_EXIT:.LBB[0-9_]+]]
+; CHECK-UNROLL-NEXT: [[PROLOGUE:.LBB[0-9_]+]]:
+; CHECK-UNROLL: le lr, [[PROLOGUE]]
+; CHECK-UNROLL-NEXT: [[PROLOGUE_EXIT]]:
+; CHECK-UNROLL: dls lr, lr
+; CHECK-UNROLL: [[BODY:.LBB[0-9_]+]]:
+; CHECK-UNROLL: le lr, [[BODY]]
+; CHECK-UNROLL-NOT: b
+; CHECK-UNROLL: pop
define void @unroll_dec_int(i32* nocapture %a, i32* nocapture readonly %b, i32* nocapture readonly %c, i32 %N) {
entry:
%cmp8 = icmp sgt i32 %N, 0
}
declare void @llvm.set.loop.iterations.i32(i32) #0
+declare i1 @llvm.test.set.loop.iterations.i32(i32) #0
declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #0