From: Sam Parker <sam.parker@arm.com>
Date: Mon, 1 Jul 2019 08:21:28 +0000 (+0000)
Subject: [ARM] WLS/LE Code Generation
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=b74b51461d4f4f0780f77021d909e508bd407591;p=llvm

[ARM] WLS/LE Code Generation

Backend changes to enable WLS/LE low-overhead loops for armv8.1-m:
1) Use TTI to communicate to the HardwareLoop pass that we should try
   to generate intrinsics that guard the loop entry, as well as setting
   the loop trip count.
2) Lower the BRCOND that uses said intrinsic to an Arm specific node:
   ARMWLS.
3) ISelDAGToDAG the node to a new pseudo instruction:
   t2WhileLoopStart.
4) Add support in ArmLowOverheadLoops to handle the new pseudo
   instruction.

Differential Revision: https://reviews.llvm.org/D63816


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@364733 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/CodeGen/HardwareLoops.cpp b/lib/CodeGen/HardwareLoops.cpp
index 0d754b902fa..5f57cabbe86 100644
--- a/lib/CodeGen/HardwareLoops.cpp
+++ b/lib/CodeGen/HardwareLoops.cpp
@@ -294,6 +294,7 @@ static bool CanGenerateTest(Loop *L, Value *Count) {
   // Check that the icmp is checking for equality of Count and zero and that
   // a non-zero value results in entering the loop.
   auto ICmp = cast<ICmpInst>(BI->getCondition());
+  LLVM_DEBUG(dbgs() << " - Found condition: " << *ICmp << "\n");
   if (!ICmp->isEquality())
     return false;
 
diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp
index c9a88823f5e..b349627b67b 100644
--- a/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -2998,6 +2998,16 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
     // Other cases are autogenerated.
     break;
   }
+  case ARMISD::WLS: {
+    SDValue Ops[] = { N->getOperand(1),   // Loop count
+                      N->getOperand(2),   // Exit target
+                      N->getOperand(0) }; // Chain, placed last
+    SDNode *LoopStart =
+      CurDAG->getMachineNode(ARM::t2WhileLoopStart, dl, MVT::Other, Ops);
+    ReplaceUses(N, LoopStart);
+    CurDAG->RemoveDeadNode(N);
+    return;
+  }
   case ARMISD::BRCOND: {
     // Pattern: (ARMbrcond:void (bb:Other):$dst, (imm:i32):$cc)
     // Emits: (Bcc:void (bb:Other):$dst, (imm:i32):$cc)
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index d2ef680524a..f2b6af1f1fd 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -633,6 +633,10 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
   if (Subtarget->hasMVEIntegerOps())
     addMVEVectorTypes(Subtarget->hasMVEFloatOps());
 
+  // Combine low-overhead loop intrinsics so that we can lower i1 types.
+  if (Subtarget->hasLOB())
+    setTargetDAGCombine(ISD::BRCOND);
+
   if (Subtarget->hasNEON()) {
     addDRTypeForNEON(MVT::v2f32);
     addDRTypeForNEON(MVT::v8i8);
@@ -1542,6 +1546,7 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
   case ARMISD::VST2LN_UPD:    return "ARMISD::VST2LN_UPD";
   case ARMISD::VST3LN_UPD:    return "ARMISD::VST3LN_UPD";
   case ARMISD::VST4LN_UPD:    return "ARMISD::VST4LN_UPD";
+  case ARMISD::WLS:           return "ARMISD::WLS";
   }
   return nullptr;
 }
@@ -12883,6 +12888,42 @@ SDValue ARMTargetLowering::PerformCMOVToBFICombine(SDNode *CMOV, SelectionDAG &D
   return V;
 }
 
+/// Combine (brcond (xor|setcc (test.set.loop.iterations Count), 1), Exit)
+/// into ARMISD::WLS(Chain, Count, Exit); returns SDValue() on no match.
+static SDValue PerformHWLoopCombine(SDNode *N,
+                                    TargetLowering::DAGCombinerInfo &DCI,
+                                    const ARMSubtarget *ST) {
+  SDValue CC = N->getOperand(1);
+  if (CC->getOpcode() != ISD::XOR && CC->getOpcode() != ISD::SETCC)
+    return SDValue();
+
+  SDValue Int = CC->getOperand(0);
+  if (Int->getOpcode() != ISD::INTRINSIC_W_CHAIN)
+    return SDValue();
+
+  unsigned IntOp = cast<ConstantSDNode>(Int.getOperand(1))->getZExtValue();
+  if (IntOp != Intrinsic::test_set_loop_iterations)
+    return SDValue();
+
+  // Only a comparison against the constant 1 is handled. dyn_cast yields null
+  // for a non-constant operand, so test the pointer before dereferencing it.
+  auto *Const = dyn_cast<ConstantSDNode>(CC->getOperand(1));
+  if (!Const || !Const->isOne())
+    return SDValue();
+
+  SDLoc dl(Int);
+  SDValue Chain = N->getOperand(0);
+  SDValue Elements = Int.getOperand(2);
+  SDValue ExitBlock = N->getOperand(2);
+
+  // TODO: Once we start supporting tail predication, we can add another
+  // operand to WLS for the number of elements processed in a vector loop.
+  SDValue Ops[] = { Chain, Elements, ExitBlock };
+  SDValue Res = DCI.DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops);
+  DCI.DAG.ReplaceAllUsesOfValueWith(Int.getValue(1), Int.getOperand(0));
+  return Res;
+}
+
 /// PerformBRCONDCombine - Target-specific DAG combining for ARMISD::BRCOND.
 SDValue
 ARMTargetLowering::PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const {
@@ -13114,6 +13155,7 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::OR:         return PerformORCombine(N, DCI, Subtarget);
   case ISD::XOR:        return PerformXORCombine(N, DCI, Subtarget);
   case ISD::AND:        return PerformANDCombine(N, DCI, Subtarget);
+  case ISD::BRCOND:     return PerformHWLoopCombine(N, DCI, Subtarget);
   case ARMISD::ADDC:
   case ARMISD::SUBC:    return PerformAddcSubcCombine(N, DCI, Subtarget);
   case ARMISD::SUBE:    return PerformAddeSubeCombine(N, DCI, Subtarget);
diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h
index 1fb89c7eff4..e79144d91b7 100644
--- a/lib/Target/ARM/ARMISelLowering.h
+++ b/lib/Target/ARM/ARMISelLowering.h
@@ -125,6 +125,8 @@ class VectorType;
       WIN__CHKSTK,  // Windows' __chkstk call to do stack probing.
       WIN__DBZCHK,  // Windows' divide by zero check
 
+      WLS,          // Low-overhead loops, While Loop Start
+
       VCEQ,         // Vector compare equal.
       VCEQZ,        // Vector compare equal to zero.
       VCGE,         // Vector compare greater than or equal.
diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td
index 3799acd855e..5ea7544d2f5 100644
--- a/lib/Target/ARM/ARMInstrInfo.td
+++ b/lib/Target/ARM/ARMInstrInfo.td
@@ -106,6 +106,11 @@ def SDT_ARMIntShiftParts : SDTypeProfile<2, 3, [SDTCisSameAs<0, 1>,
                                               SDTCisInt<0>,
                                               SDTCisInt<4>]>;
 
+// TODO Add another operand for 'Size' so that we can re-use this node when we
+// start supporting *TP versions.
+def SDT_ARMWhileLoop : SDTypeProfile<0, 2, [SDTCisVT<0, i32>,
+                                            SDTCisVT<1, OtherVT>]>;
+
 def ARMSmlald        : SDNode<"ARMISD::SMLALD", SDT_LongMac>;
 def ARMSmlaldx       : SDNode<"ARMISD::SMLALDX", SDT_LongMac>;
 def ARMSmlsld        : SDNode<"ARMISD::SMLSLD", SDT_LongMac>;
@@ -244,6 +249,9 @@ def SDTARMVGETLN  : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisInt<1>,
 def ARMvgetlaneu : SDNode<"ARMISD::VGETLANEu", SDTARMVGETLN>;
 def ARMvgetlanes : SDNode<"ARMISD::VGETLANEs", SDTARMVGETLN>;
 
+def ARMWLS : SDNode<"ARMISD::WLS", SDT_ARMWhileLoop,
+                    [SDNPHasChain]>;
+
 //===----------------------------------------------------------------------===//
 // ARM Flag Definitions.
 
diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td
index 1525cff080a..2d22c3e44ff 100644
--- a/lib/Target/ARM/ARMInstrThumb2.td
+++ b/lib/Target/ARM/ARMInstrThumb2.td
@@ -5216,11 +5216,19 @@ def t2LoopDec :
   t2PseudoInst<(outs GPRlr:$Rm), (ins GPRlr:$Rn, imm0_7:$size),
                4, IIC_Br, []>, Sched<[WriteBr]>;
 
-let isBranch = 1, isTerminator = 1, hasSideEffects = 1 in
+let isBranch = 1, isTerminator = 1, hasSideEffects = 1 in {
+def t2WhileLoopStart :
+    t2PseudoInst<(outs),
+                 (ins rGPR:$elts, brtarget:$target),
+                 4, IIC_Br, []>,
+                 Sched<[WriteBr]>;
+
 def t2LoopEnd :
   t2PseudoInst<(outs), (ins GPRlr:$elts, brtarget:$target),
   8, IIC_Br, []>, Sched<[WriteBr]>;
 
+} // end isBranch, isTerminator, hasSideEffects
+
 } // end isNotDuplicable
 
 class CS<string iname, bits<4> opcode, list<dag> pattern=[]>
diff --git a/lib/Target/ARM/ARMLowOverheadLoops.cpp b/lib/Target/ARM/ARMLowOverheadLoops.cpp
index 6a3709dc03f..ecac67a2f31 100644
--- a/lib/Target/ARM/ARMLowOverheadLoops.cpp
+++ b/lib/Target/ARM/ARMLowOverheadLoops.cpp
@@ -105,15 +105,20 @@ bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) {
   LLVM_DEBUG(dbgs() << "ARM Loops: Processing " << *ML);
 
   auto IsLoopStart = [](MachineInstr &MI) {
-    return MI.getOpcode() == ARM::t2DoLoopStart;
+    return MI.getOpcode() == ARM::t2DoLoopStart ||
+           MI.getOpcode() == ARM::t2WhileLoopStart;
   };
 
-  auto SearchForStart =
-    [&IsLoopStart](MachineBasicBlock *MBB) -> MachineInstr* {
+  // Search the given block for a loop start instruction. If one isn't found,
+  // and there's only one predecessor block, search that one too.
+  std::function<MachineInstr*(MachineBasicBlock*)> SearchForStart =
+    [&IsLoopStart, &SearchForStart](MachineBasicBlock *MBB) -> MachineInstr* {
     for (auto &MI : *MBB) {
       if (IsLoopStart(MI))
         return &MI;
     }
+    if (MBB->pred_size() == 1)
+      return SearchForStart(*MBB->pred_begin());
     return nullptr;
   };
 
@@ -122,8 +127,28 @@ bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) {
   MachineInstr *End = nullptr;
   bool Revert = false;
 
-  if (auto *Preheader = ML->getLoopPreheader())
+  // Search the preheader for the start intrinsic, or look through the
+  // predecessors of the header to find exactly one set.iterations intrinsic.
+  // FIXME: I don't see why we shouldn't be supporting multiple predecessors
+  // with potentially multiple set.loop.iterations, so we need to enable this.
+  if (auto *Preheader = ML->getLoopPreheader()) {
     Start = SearchForStart(Preheader);
+  } else {
+    LLVM_DEBUG(dbgs() << "ARM Loops: Failed to find loop preheader!\n"
+               << " - Performing manual predecessor search.\n");
+    MachineBasicBlock *Pred = nullptr;
+    for (auto *MBB : ML->getHeader()->predecessors()) {
+      if (!ML->contains(MBB)) {
+        if (Pred) {
+          LLVM_DEBUG(dbgs() << " - Found multiple out-of-loop preds.\n");
+          Start = nullptr;
+          break;
+        }
+        Pred = MBB;
+        Start = SearchForStart(MBB);
+      }
+    }
+  }
 
   // Find the low-overhead loop components and decide whether or not to fall
   // back to a normal loop.
@@ -158,12 +183,11 @@ bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) {
       break;
   }
 
-  if (Start || Dec || End) {
-    if (!Start || !Dec || !End)
-      report_fatal_error("Failed to find all loop components");
-  } else {
+  if (!Start && !Dec && !End) {
     LLVM_DEBUG(dbgs() << "ARM Loops: Not a low-overhead loop.\n");
     return Changed;
+  } if (!(Start && Dec && End)) {
+    report_fatal_error("Failed to find all loop components");
   }
 
   if (!End->getOperand(1).isMBB() ||
@@ -212,15 +236,21 @@ void ARMLowOverheadLoops::Expand(MachineLoop *ML, MachineInstr *Start,
       break;
     }
 
+    unsigned Opc = Start->getOpcode() == ARM::t2DoLoopStart ?
+      ARM::t2DLS : ARM::t2WLS;
     MachineInstrBuilder MIB =
-      BuildMI(*MBB, InsertPt, InsertPt->getDebugLoc(), TII->get(ARM::t2DLS));
-    if (InsertPt != Start)
-      InsertPt->eraseFromParent();
+      BuildMI(*MBB, InsertPt, InsertPt->getDebugLoc(), TII->get(Opc));
 
     MIB.addDef(ARM::LR);
     MIB.add(Start->getOperand(0));
-    LLVM_DEBUG(dbgs() << "ARM Loops: Inserted DLS: " << *MIB);
+    if (Opc == ARM::t2WLS)
+      MIB.add(Start->getOperand(1));
+
+    if (InsertPt != Start)
+      InsertPt->eraseFromParent();
     Start->eraseFromParent();
+    LLVM_DEBUG(dbgs() << "ARM Loops: Inserted start: " << *MIB);
+    return &*MIB;
   };
 
   // Combine the LoopDec and LoopEnd instructions into LE(TP).
@@ -234,24 +264,15 @@ void ARMLowOverheadLoops::Expand(MachineLoop *ML, MachineInstr *Start,
     MIB.add(End->getOperand(1));
     LLVM_DEBUG(dbgs() << "ARM Loops: Inserted LE: " << *MIB);
 
-    // If there is a branch after loop end, which branches to the fallthrough
-    // block, remove the branch.
-    MachineBasicBlock *Latch = End->getParent();
-    MachineInstr *Terminator = &Latch->instr_back();
-    if (End != Terminator) {
-      MachineBasicBlock *Exit = ML->getExitBlock();
-      if (Latch->isLayoutSuccessor(Exit)) {
-        LLVM_DEBUG(dbgs() << "ARM Loops: Removing loop exit branch: "
-                   << *Terminator);
-        Terminator->eraseFromParent();
-      }
-    }
     End->eraseFromParent();
     Dec->eraseFromParent();
+    return &*MIB;
   };
 
   // Generate a subs, or sub and cmp, and a branch instead of an LE.
   // TODO: Check flags so that we can possibly generate a subs.
+  // FIXME: Need to check that we're not trashing the CPSR when generating
+  // the cmp.
   auto ExpandBranch = [this](MachineInstr *Dec, MachineInstr *End) {
     LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to sub, cmp, br.\n");
     // Create sub
@@ -282,12 +303,53 @@ void ARMLowOverheadLoops::Expand(MachineLoop *ML, MachineInstr *Start,
     Dec->eraseFromParent();
   };
 
+  // Revert a t2WhileLoopStart: emit cmp lr, 0 then a beq to the exit block
+  // held in operand 1. FIXME: Need to check that we're not trashing the CPSR
+  // when generating the cmp. We could also try to generate a cbz if the value
+  // in LR is also in another low register. NOTE(review): the loop count is
+  // operand 0 of MI; confirm LR already holds it when this cmp executes.
+  auto ExpandStart = [this](MachineInstr *MI) {
+    MachineBasicBlock *MBB = MI->getParent();
+    MachineInstrBuilder MIB = BuildMI(*MBB, MI, MI->getDebugLoc(),
+                                      TII->get(ARM::t2CMPri));
+    MIB.addReg(ARM::LR);
+    MIB.addImm(0);
+    MIB.addImm(ARMCC::AL);
+    MIB.addReg(ARM::CPSR);
+
+    MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::t2Bcc));
+    MIB.add(MI->getOperand(1));   // branch target
+    MIB.addImm(ARMCC::EQ);        // condition code
+    MIB.addReg(ARM::CPSR);
+  };
+
+  // Erase the last instruction of I's block when it is an unconditional
+  // branch, other than I itself, to the block's layout successor.
+  // TODO: We should be able to automatically remove these branches before we
+  // get here - probably by teaching analyzeBranch about the pseudo
+  // instructions.
+  auto RemoveDeadBranch = [](MachineInstr *I) {
+    MachineBasicBlock *BB = I->getParent();
+    MachineInstr *Terminator = &BB->instr_back();
+    if (Terminator->isUnconditionalBranch() && I != Terminator) {
+      MachineBasicBlock *Succ = Terminator->getOperand(0).getMBB();
+      if (BB->isLayoutSuccessor(Succ)) {
+        LLVM_DEBUG(dbgs() << "ARM Loops: Removing branch: " << *Terminator);
+        Terminator->eraseFromParent();
+      }
+    }
+  };
+
   if (Revert) {
-    Start->eraseFromParent();
+    if (Start->getOpcode() == ARM::t2WhileLoopStart)
+      ExpandStart(Start);
     ExpandBranch(Dec, End);
+    Start->eraseFromParent();
   } else {
-    ExpandLoopStart(ML, Start);
-    ExpandLoopEnd(ML, Dec, End);
+    Start = ExpandLoopStart(ML, Start);
+    RemoveDeadBranch(Start);
+    End = ExpandLoopEnd(ML, Dec, End);
+    RemoveDeadBranch(End);
   }
 }
 
diff --git a/lib/Target/ARM/ARMTargetTransformInfo.cpp b/lib/Target/ARM/ARMTargetTransformInfo.cpp
index c3a7c18bb5d..2a8ec734a05 100644
--- a/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -806,6 +806,7 @@ bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
       default:
         break;
       case Intrinsic::set_loop_iterations:
+      case Intrinsic::test_set_loop_iterations:
       case Intrinsic::loop_decrement:
       case Intrinsic::loop_decrement_reg:
         return true;
@@ -841,6 +842,7 @@ bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
   LLVMContext &C = L->getHeader()->getContext();
   HWLoopInfo.CounterInReg = true;
   HWLoopInfo.IsNestingLegal = false;
+  HWLoopInfo.PerformEntryTest = true;
   HWLoopInfo.CountType = Type::getInt32Ty(C);
   HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1);
   return true;
diff --git a/test/Transforms/HardwareLoops/ARM/cond-mov.mir b/test/CodeGen/Thumb2/LowOverheadLoops/cond-mov.mir
similarity index 100%
rename from test/Transforms/HardwareLoops/ARM/cond-mov.mir
rename to test/CodeGen/Thumb2/LowOverheadLoops/cond-mov.mir
diff --git a/test/CodeGen/Thumb2/LowOverheadLoops/loop-guards.ll b/test/CodeGen/Thumb2/LowOverheadLoops/loop-guards.ll
new file mode 100644
index 00000000000..fd8cc9b92f2
--- /dev/null
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/loop-guards.ll
@@ -0,0 +1,213 @@
+; RUN: llc -mtriple=thumbv8.1m.main -disable-arm-loloops=false -mattr=+lob -stop-after=arm-low-overhead-loops --verify-machineinstrs %s -o - | FileCheck %s
+; RUN: llc -mtriple=thumbv8.1m.main -disable-arm-loloops=false -mattr=+lob -stop-after=arm-low-overhead-loops --verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK-GLOBAL
+
+; Not implemented as a mir test so that changes to the generic HardwareLoops
+; pass can also be tested. These functions have been taken from
+; Transforms/HardwareLoops/loop-guards.ll, where a few test.set intrinsics are
+; generated, but only one (ne_trip_count) gets generated here. Simplifications
+; result in icmps changing and maybe also the CFG. So,
+; TODO: Teach the HardwareLoops pass some better pattern recognition.
+
+; CHECK-GLOBAL-NOT: DoLoopStart
+; CHECK-GLOBAL-NOT: WhileLoopStart
+; CHECK-GLOBAL-NOT: LoopEnd
+
+; CHECK: ne_and_guard
+; CHECK: body:
+; CHECK: bb.0.entry:
+; CHECK:   t2CMPri renamable $lr, 0
+; CHECK:   tBcc %bb.3
+; CHECK: bb.1.while.body.preheader:
+; CHECK:   $lr = t2DLS renamable $lr
+; CHECK: bb.2.while.body:
+; CHECK:   $lr = t2LEUpdate renamable $lr, %bb.2
+define void @ne_and_guard(i1 zeroext %t1, i1 zeroext %t2, i32* nocapture %a, i32* nocapture readonly %b, i32 %N) {
+entry:
+  %brmerge.demorgan = and i1 %t1, %t2
+  %cmp6 = icmp ne i32 %N, 0
+  %or.cond = and i1 %brmerge.demorgan, %cmp6
+  br i1 %or.cond, label %while.body, label %if.end
+
+while.body:                                       ; preds = %while.body, %entry
+  %i.09 = phi i32 [ %inc, %while.body ], [ 0, %entry ]
+  %a.addr.08 = phi i32* [ %incdec.ptr3, %while.body ], [ %a, %entry ]
+  %b.addr.07 = phi i32* [ %incdec.ptr, %while.body ], [ %b, %entry ]
+  %incdec.ptr = getelementptr inbounds i32, i32* %b.addr.07, i32 1
+  %tmp = load i32, i32* %b.addr.07, align 4
+  %incdec.ptr3 = getelementptr inbounds i32, i32* %a.addr.08, i32 1
+  store i32 %tmp, i32* %a.addr.08, align 4
+  %inc = add nuw i32 %i.09, 1
+  %exitcond = icmp eq i32 %inc, %N
+  br i1 %exitcond, label %if.end, label %while.body
+
+if.end:                                           ; preds = %while.body, %entry
+  ret void
+}
+
+; TODO: This could generate WLS
+; CHECK: ne_preheader
+; CHECK: body:
+; CHECK: bb.0.entry:
+; CHECK:   t2CMPri renamable $lr, 0
+; CHECK:   tBcc %bb.3
+; CHECK: bb.1.while.body.preheader:
+; CHECK:   $lr = t2DLS renamable $lr
+; CHECK: bb.2.while.body:
+; CHECK:   $lr = t2LEUpdate renamable $lr, %bb.2
+define void @ne_preheader(i1 zeroext %t1, i1 zeroext %t2, i32* nocapture %a, i32* nocapture readonly %b, i32 %N) {
+entry:
+  %brmerge.demorgan = and i1 %t1, %t2
+  br i1 %brmerge.demorgan, label %while.preheader, label %if.end
+
+while.preheader:                                  ; preds = %entry
+  %cmp = icmp ne i32 %N, 0
+  br i1 %cmp, label %while.body, label %if.end
+
+while.body:                                       ; preds = %while.body, %while.preheader
+  %i.09 = phi i32 [ %inc, %while.body ], [ 0, %while.preheader ]
+  %a.addr.08 = phi i32* [ %incdec.ptr3, %while.body ], [ %a, %while.preheader ]
+  %b.addr.07 = phi i32* [ %incdec.ptr, %while.body ], [ %b, %while.preheader ]
+  %incdec.ptr = getelementptr inbounds i32, i32* %b.addr.07, i32 1
+  %tmp = load i32, i32* %b.addr.07, align 4
+  %incdec.ptr3 = getelementptr inbounds i32, i32* %a.addr.08, i32 1
+  store i32 %tmp, i32* %a.addr.08, align 4
+  %inc = add nuw i32 %i.09, 1
+  %exitcond = icmp eq i32 %inc, %N
+  br i1 %exitcond, label %if.end, label %while.body
+
+if.end:                                           ; preds = %while.body, %while.preheader, %entry
+  ret void
+}
+
+; TODO: This could generate WLS
+; CHECK: eq_preheader
+; CHECK: body:
+; CHECK: bb.0.entry:
+; CHECK:   t2CMPri renamable $lr, 0
+; CHECK:   tBcc %bb.3
+; CHECK: bb.1.while.body.preheader:
+; CHECK:   $lr = t2DLS renamable $lr
+; CHECK: bb.2.while.body:
+; CHECK:   $lr = t2LEUpdate renamable $lr, %bb.2
+define void @eq_preheader(i1 zeroext %t1, i1 zeroext %t2, i32* nocapture %a, i32* nocapture readonly %b, i32 %N) {
+entry:
+  %brmerge.demorgan = and i1 %t1, %t2
+  br i1 %brmerge.demorgan, label %while.preheader, label %if.end
+
+while.preheader:                                  ; preds = %entry
+  %cmp = icmp eq i32 %N, 0
+  br i1 %cmp, label %if.end, label %while.body
+
+while.body:                                       ; preds = %while.body, %while.preheader
+  %i.09 = phi i32 [ %inc, %while.body ], [ 0, %while.preheader ]
+  %a.addr.08 = phi i32* [ %incdec.ptr3, %while.body ], [ %a, %while.preheader ]
+  %b.addr.07 = phi i32* [ %incdec.ptr, %while.body ], [ %b, %while.preheader ]
+  %incdec.ptr = getelementptr inbounds i32, i32* %b.addr.07, i32 1
+  %tmp = load i32, i32* %b.addr.07, align 4
+  %incdec.ptr3 = getelementptr inbounds i32, i32* %a.addr.08, i32 1
+  store i32 %tmp, i32* %a.addr.08, align 4
+  %inc = add nuw i32 %i.09, 1
+  %exitcond = icmp eq i32 %inc, %N
+  br i1 %exitcond, label %if.end, label %while.body
+
+if.end:                                           ; preds = %while.body, %while.preheader, %entry
+  ret void
+}
+
+; TODO: This could generate WLS
+; CHECK: ne_prepreheader
+; CHECK: body:
+; CHECK: bb.0.entry:
+; CHECK:   t2CMPri renamable $lr, 0
+; CHECK:   tBcc %bb.3
+; CHECK: bb.1.while.body.preheader:
+; CHECK:   $lr = t2DLS renamable $lr
+; CHECK: bb.2.while.body:
+; CHECK:   $lr = t2LEUpdate renamable $lr, %bb.2
+define void @ne_prepreheader(i1 zeroext %t1, i1 zeroext %t2, i32* nocapture %a, i32* nocapture readonly %b, i32 %N) {
+entry:
+  %cmp = icmp ne i32 %N, 0
+  br i1 %cmp, label %while.preheader, label %if.end
+
+while.preheader:                                  ; preds = %entry
+  %brmerge.demorgan = and i1 %t1, %t2
+  br i1 %brmerge.demorgan, label %while.body, label %if.end
+
+while.body:                                       ; preds = %while.body, %while.preheader
+  %i.09 = phi i32 [ %inc, %while.body ], [ 0, %while.preheader ]
+  %a.addr.08 = phi i32* [ %incdec.ptr3, %while.body ], [ %a, %while.preheader ]
+  %b.addr.07 = phi i32* [ %incdec.ptr, %while.body ], [ %b, %while.preheader ]
+  %incdec.ptr = getelementptr inbounds i32, i32* %b.addr.07, i32 1
+  %tmp = load i32, i32* %b.addr.07, align 4
+  %incdec.ptr3 = getelementptr inbounds i32, i32* %a.addr.08, i32 1
+  store i32 %tmp, i32* %a.addr.08, align 4
+  %inc = add nuw i32 %i.09, 1
+  %exitcond = icmp eq i32 %inc, %N
+  br i1 %exitcond, label %if.end, label %while.body
+
+if.end:                                           ; preds = %while.body, %while.preheader, %entry
+  ret void
+}
+
+; CHECK: be_ne
+; CHECK: body:
+; CHECK: bb.0.entry:
+; CHECK:   $lr = t2DLS renamable $lr
+; CHECK: bb.1.do.body:
+; CHECK:   $lr = t2LEUpdate renamable $lr, %bb.1
+define void @be_ne(i32* nocapture %a, i32* nocapture readonly %b, i32 %N) {
+entry:
+  %cmp = icmp ne i32 %N, 0
+  %sub = sub i32 %N, 1
+  %be = select i1 %cmp, i32 0, i32 %sub
+  %cmp.1 = icmp ne i32 %be, 0
+  br i1 %cmp.1, label %do.body, label %if.end
+
+do.body:                                          ; preds = %do.body, %entry
+  %b.addr.0 = phi i32* [ %incdec.ptr, %do.body ], [ %b, %entry ]
+  %a.addr.0 = phi i32* [ %incdec.ptr3, %do.body ], [ %a, %entry ]
+  %i.0 = phi i32 [ %inc, %do.body ], [ 0, %entry ]
+  %incdec.ptr = getelementptr inbounds i32, i32* %b.addr.0, i32 1
+  %tmp = load i32, i32* %b.addr.0, align 4
+  %incdec.ptr3 = getelementptr inbounds i32, i32* %a.addr.0, i32 1
+  store i32 %tmp, i32* %a.addr.0, align 4
+  %inc = add nuw i32 %i.0, 1
+  %cmp.2 = icmp ult i32 %inc, %N
+  br i1 %cmp.2, label %do.body, label %if.end
+
+if.end:                                           ; preds = %do.body, %entry
+  ret void
+}
+
+; TODO: Remove the tMOVr in the preheader!
+; CHECK: ne_trip_count
+; CHECK: body:
+; CHECK: bb.0.entry:
+; CHECK:   $lr = t2WLS $r3, %bb.3
+; CHECK: bb.1.do.body.preheader:
+; CHECK:   $lr = tMOVr
+; CHECK: bb.2.do.body:
+; CHECK:   $lr = t2LEUpdate renamable $lr, %bb.2
+define void @ne_trip_count(i1 zeroext %t1, i32* nocapture %a, i32* nocapture readonly %b, i32 %N) {
+entry:
+  br label %do.body.preheader
+
+do.body.preheader:
+  %cmp = icmp ne i32 %N, 0
+  br i1 %cmp, label %do.body, label %if.end
+
+do.body:
+  %b.addr.0 = phi i32* [ %incdec.ptr, %do.body ], [ %b, %do.body.preheader ]
+  %a.addr.0 = phi i32* [ %incdec.ptr3, %do.body ], [ %a, %do.body.preheader ]
+  %i.0 = phi i32 [ %inc, %do.body ], [ 0, %do.body.preheader ]
+  %incdec.ptr = getelementptr inbounds i32, i32* %b.addr.0, i32 1
+  %tmp = load i32, i32* %b.addr.0, align 4
+  %incdec.ptr3 = getelementptr inbounds i32, i32* %a.addr.0, i32 1
+  store i32 %tmp, i32* %a.addr.0, align 4
+  %inc = add nuw i32 %i.0, 1
+  %cmp.1 = icmp ult i32 %inc, %N
+  br i1 %cmp.1, label %do.body, label %if.end
+
+if.end:                                           ; preds = %do.body, %do.body.preheader
+  ret void
+}
diff --git a/test/Transforms/HardwareLoops/ARM/massive.mir b/test/CodeGen/Thumb2/LowOverheadLoops/massive.mir
similarity index 100%
rename from test/Transforms/HardwareLoops/ARM/massive.mir
rename to test/CodeGen/Thumb2/LowOverheadLoops/massive.mir
diff --git a/test/Transforms/HardwareLoops/ARM/multiblock-massive.mir b/test/CodeGen/Thumb2/LowOverheadLoops/multiblock-massive.mir
similarity index 100%
rename from test/Transforms/HardwareLoops/ARM/multiblock-massive.mir
rename to test/CodeGen/Thumb2/LowOverheadLoops/multiblock-massive.mir
diff --git a/test/Transforms/HardwareLoops/ARM/revert-after-call.mir b/test/CodeGen/Thumb2/LowOverheadLoops/revert-after-call.mir
similarity index 100%
rename from test/Transforms/HardwareLoops/ARM/revert-after-call.mir
rename to test/CodeGen/Thumb2/LowOverheadLoops/revert-after-call.mir
diff --git a/test/Transforms/HardwareLoops/ARM/revert-after-spill.mir b/test/CodeGen/Thumb2/LowOverheadLoops/revert-after-spill.mir
similarity index 100%
rename from test/Transforms/HardwareLoops/ARM/revert-after-spill.mir
rename to test/CodeGen/Thumb2/LowOverheadLoops/revert-after-spill.mir
diff --git a/test/CodeGen/Thumb2/LowOverheadLoops/revert-while.mir b/test/CodeGen/Thumb2/LowOverheadLoops/revert-while.mir
new file mode 100644
index 00000000000..9a37d414ad5
--- /dev/null
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/revert-while.mir
@@ -0,0 +1,130 @@
+# RUN: llc -mtriple=thumbv8.1m.main -mattr=+lob -run-pass=arm-low-overhead-loops --verify-machineinstrs %s -o - | FileCheck %s
+# CHECK:      body:
+# CHECK:      bb.0.entry:
+# CHECK:        t2CMPri $lr, 0, 14
+# CHECK-NEXT:   t2Bcc %bb.3, 0, $cpsr
+# CHECK-NEXT:   tB %bb.1
+# CHECK:      bb.1.do.body.preheader:
+# CHECK:        $lr = tMOVr killed $r3
+# CHECK:      bb.2.do.body:
+# CHECK:        $lr = t2SUBri killed renamable $lr, 1, 14
+# CHECK-NEXT:   t2CMPri $lr, 0, 14, $cpsr
+# CHECK-NEXT:   t2Bcc %bb.2, 1, $cpsr
+# CHECK-NEXT:   tB %bb.3, 14
+--- |
+  target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
+  target triple = "thumbv8.1m.main"
+  
+  define void @ne_trip_count(i1 zeroext %t1, i32* nocapture %a, i32* nocapture readonly %b, i32 %N) #0 {
+  entry:
+    %cmp = icmp ne i32 %N, 0
+    %0 = call i1 @llvm.test.set.loop.iterations.i32(i32 %N)
+    br i1 %0, label %do.body.preheader, label %if.end
+  
+  do.body.preheader:                                ; preds = %entry
+    br label %do.body
+  
+  do.body:                                          ; preds = %do.body.preheader, %do.body
+    %i.0 = phi i32 [ %inc, %do.body ], [ 0, %do.body.preheader ]
+    %1 = phi i32 [ %N, %do.body.preheader ], [ %2, %do.body ]
+    %scevgep = getelementptr i32, i32* %b, i32 %i.0
+    %scevgep1 = getelementptr i32, i32* %a, i32 %i.0
+    %size = call i32 @llvm.arm.space(i32 4096, i32 undef)
+    %tmp = load i32, i32* %scevgep, align 4
+    store i32 %tmp, i32* %scevgep1, align 4
+    %inc = add nuw i32 %i.0, 1
+    %2 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %1, i32 1)
+    %3 = icmp ne i32 %2, 0
+    br i1 %3, label %do.body, label %if.end
+  
+  if.end:                                           ; preds = %do.body, %entry
+    ret void
+  }
+  
+  declare i32 @llvm.arm.space(i32, i32) #1
+  declare i1 @llvm.test.set.loop.iterations.i32(i32) #2
+  declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #2
+  
+  attributes #0 = { "target-features"="+lob" }
+  attributes #1 = { nounwind "target-features"="+lob" }
+  attributes #2 = { noduplicate nounwind }
+  attributes #3 = { nounwind }
+
+...
+---
+name:            ne_trip_count
+alignment:       1
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: false
+hasWinCFI:       false
+registers:       []
+liveins:         
+  - { reg: '$r1', virtual-reg: '' }
+  - { reg: '$r2', virtual-reg: '' }
+  - { reg: '$r3', virtual-reg: '' }
+frameInfo:       
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       8
+  offsetAdjustment: 0
+  maxAlignment:    4
+  adjustsStack:    false
+  hasCalls:        false
+  stackProtector:  ''
+  maxCallFrameSize: 0
+  cvBytesOfCalleeSavedRegisters: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  localFrameSize:  0
+  savePoint:       ''
+  restorePoint:    ''
+fixedStack:      []
+stack:           
+  - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, 
+      stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, 
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, 
+      stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true, 
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+callSites:       []
+constants:       []
+machineFunctionInfo: {}
+body:             |
+  bb.0.entry:
+    successors: %bb.1(0x40000000), %bb.3(0x40000000)
+  
+    frame-setup tPUSH 14, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp
+    frame-setup CFI_INSTRUCTION def_cfa_offset 8
+    frame-setup CFI_INSTRUCTION offset $lr, -4
+    frame-setup CFI_INSTRUCTION offset $r7, -8
+    t2WhileLoopStart $r3, %bb.3
+    tB %bb.1, 14, $noreg
+  
+  bb.1.do.body.preheader:
+    successors: %bb.2(0x80000000)
+  
+    $lr = tMOVr killed $r3, 14, $noreg
+    renamable $r0, dead $cpsr = tMOVi8 0, 14, $noreg
+  
+  bb.2.do.body:
+    successors: %bb.2(0x7c000000), %bb.3(0x04000000)
+  
+    dead renamable $r3 = SPACE 4096, undef renamable $r0
+    renamable $r3 = t2LDRs renamable $r2, renamable $r0, 2, 14, $noreg :: (load 4 from %ir.scevgep)
+    t2STRs killed renamable $r3, renamable $r1, renamable $r0, 2, 14, $noreg :: (store 4 into %ir.scevgep1)
+    renamable $r0, dead $cpsr = nuw tADDi8 killed renamable $r0, 1, 14, $noreg
+    renamable $lr = t2LoopDec killed renamable $lr, 1
+    t2LoopEnd renamable $lr, %bb.2
+    tB %bb.3, 14, $noreg
+  
+  bb.3.if.end:
+    tPOP_RET 14, $noreg, def $r7, def $pc
+
+...
diff --git a/test/Transforms/HardwareLoops/ARM/size-limit.mir b/test/CodeGen/Thumb2/LowOverheadLoops/size-limit.mir
similarity index 100%
rename from test/Transforms/HardwareLoops/ARM/size-limit.mir
rename to test/CodeGen/Thumb2/LowOverheadLoops/size-limit.mir
diff --git a/test/Transforms/HardwareLoops/ARM/switch.mir b/test/CodeGen/Thumb2/LowOverheadLoops/switch.mir
similarity index 100%
rename from test/Transforms/HardwareLoops/ARM/switch.mir
rename to test/CodeGen/Thumb2/LowOverheadLoops/switch.mir
diff --git a/test/CodeGen/Thumb2/LowOverheadLoops/while.mir b/test/CodeGen/Thumb2/LowOverheadLoops/while.mir
new file mode 100644
index 00000000000..19c8fa2b1d0
--- /dev/null
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/while.mir
@@ -0,0 +1,131 @@
+# RUN: llc -mtriple=thumbv8.1m.main -mattr=+lob %s -run-pass=arm-low-overhead-loops --verify-machineinstrs -o - | FileCheck %s
+
+# TODO: Remove the lr = tMOVr
+# CHECK: body:
+# CHECK:   $lr = t2WLS $r2, [[EXIT:%bb[.0-9]+]]
+# CHECK: [[PREHEADER:bb[.0-9a-z]+]]:
+# CHECK:   $lr = tMOVr killed $r2
+# CHECK: [[BODY:bb[.0-9a-z]+]]:
+# CHECK:   $lr = t2LEUpdate renamable $lr
+
+--- |
+  target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
+  target triple = "thumbv8.1m.main-arm-unknown"
+  
+  ; Function Attrs: norecurse nounwind optsize
+  define dso_local arm_aapcscc void @copy(i16* nocapture %a, i16* nocapture readonly %b, i32 %N) {
+  entry:
+    %cmp4 = icmp eq i32 %N, 0
+    %0 = call i1 @llvm.test.set.loop.iterations.i32(i32 %N)
+    br i1 %0, label %while.body.preheader, label %while.end
+  
+  while.body.preheader:                             ; preds = %entry
+    br label %while.body
+  
+  while.body:                                       ; preds = %while.body, %while.body.preheader
+    %a.addr.06 = phi i16* [ %incdec.ptr1, %while.body ], [ %a, %while.body.preheader ]
+    %b.addr.05 = phi i16* [ %incdec.ptr, %while.body ], [ %b, %while.body.preheader ]
+    %1 = phi i32 [ %N, %while.body.preheader ], [ %3, %while.body ]
+    %incdec.ptr = getelementptr inbounds i16, i16* %b.addr.05, i32 1
+    %2 = load i16, i16* %b.addr.05, align 2, !tbaa !3
+    %incdec.ptr1 = getelementptr inbounds i16, i16* %a.addr.06, i32 1
+    store i16 %2, i16* %a.addr.06, align 2, !tbaa !3
+    %3 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %1, i32 1)
+    %4 = icmp ne i32 %3, 0
+    br i1 %4, label %while.body, label %while.end
+  
+  while.end:                                        ; preds = %while.body, %entry
+    ret void
+  }
+  
+  declare i1 @llvm.test.set.loop.iterations.i32(i32) #1
+  declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #1
+  
+  attributes #1 = { noduplicate nounwind }
+  attributes #2 = { nounwind }
+  
+  !llvm.module.flags = !{!0, !1}
+  
+  !0 = !{i32 1, !"wchar_size", i32 4}
+  !1 = !{i32 1, !"min_enum_size", i32 4}
+  !3 = !{!4, !4, i64 0}
+  !4 = !{!"short", !5, i64 0}
+  !5 = !{!"omnipotent char", !6, i64 0}
+  !6 = !{!"Simple C/C++ TBAA"}
+
+...
+---
+name:            copy
+alignment:       1
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: false
+hasWinCFI:       false
+registers:       []
+liveins:         
+  - { reg: '$r0', virtual-reg: '' }
+  - { reg: '$r1', virtual-reg: '' }
+  - { reg: '$r2', virtual-reg: '' }
+frameInfo:       
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       8
+  offsetAdjustment: 0
+  maxAlignment:    4
+  adjustsStack:    false
+  hasCalls:        false
+  stackProtector:  ''
+  maxCallFrameSize: 0
+  cvBytesOfCalleeSavedRegisters: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  localFrameSize:  0
+  savePoint:       ''
+  restorePoint:    ''
+fixedStack:      []
+stack:           
+  - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, 
+      stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, 
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, 
+      stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true, 
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+constants:       []
+machineFunctionInfo: {}
+body:             |
+  bb.0.entry:
+    successors: %bb.1(0x40000000), %bb.3(0x40000000)
+  
+    frame-setup tPUSH 14, $noreg, $r7, killed $lr, implicit-def $sp, implicit $sp
+    frame-setup CFI_INSTRUCTION def_cfa_offset 8
+    frame-setup CFI_INSTRUCTION offset $lr, -4
+    frame-setup CFI_INSTRUCTION offset $r7, -8
+    $r7 = frame-setup tMOVr $sp, 14, $noreg
+    frame-setup CFI_INSTRUCTION def_cfa_register $r7
+    t2WhileLoopStart $r2, %bb.3
+    tB %bb.1, 14, $noreg
+  
+  bb.1.while.body.preheader:
+    successors: %bb.2(0x80000000)
+  
+    $lr = tMOVr killed $r2, 14, $noreg
+  
+  bb.2.while.body:
+    successors: %bb.2(0x7c000000), %bb.3(0x04000000)
+  
+    renamable $r2, renamable $r1 = t2LDRH_POST killed renamable $r1, 2, 14, $noreg :: (load 2 from %ir.b.addr.05, !tbaa !3)
+    early-clobber renamable $r0 = t2STRH_POST killed renamable $r2, killed renamable $r0, 2, 14, $noreg :: (store 2 into %ir.a.addr.06, !tbaa !3)
+    renamable $lr = t2LoopDec killed renamable $lr, 1
+    t2LoopEnd renamable $lr, %bb.2
+    tB %bb.3, 14, $noreg
+  
+  bb.3.while.end:
+    tPOP_RET 14, $noreg, def $r7, def $pc
+
+...
diff --git a/test/Transforms/HardwareLoops/ARM/do-rem.ll b/test/Transforms/HardwareLoops/ARM/do-rem.ll
index 074a1bb6c4c..144600fe5bf 100644
--- a/test/Transforms/HardwareLoops/ARM/do-rem.ll
+++ b/test/Transforms/HardwareLoops/ARM/do-rem.ll
@@ -3,10 +3,14 @@
 @g = common local_unnamed_addr global i32* null, align 4
 
 ; CHECK-LABEL: do_with_i32_urem
+; CHECK: entry:
+; CHECK: [[TEST:%[^ ]+]] = call i1 @llvm.test.set.loop.iterations.i32(i32 %n)
+; CHECK: br i1 [[TEST]], label %while.body.preheader, label %while.end
+
 ; CHECK: while.body.preheader:
-; CHECK: call void @llvm.set.loop.iterations.i32(i32 %n)
 ; CHECK-NEXT: br label %while.body
 
+; CHECK: while.body:
 ; CHECK: [[REM:%[^ ]+]] = phi i32 [ %n, %while.body.preheader ], [ [[LOOP_DEC:%[^ ]+]], %while.body ]
 ; CHECK: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1)
 ; CHECK: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
@@ -38,10 +42,14 @@ while.end:
 }
 
 ; CHECK-LABEL: do_with_i32_srem
+; CHECK: entry:
+; CHECK: [[TEST:%[^ ]+]] = call i1 @llvm.test.set.loop.iterations.i32(i32 %n)
+; CHECK: br i1 [[TEST]], label %while.body.preheader, label %while.end
+
 ; CHECK: while.body.preheader:
-; CHECK: call void @llvm.set.loop.iterations.i32(i32 %n)
 ; CHECK-NEXT: br label %while.body
 
+; CHECK: while.body:
 ; CHECK: [[REM:%[^ ]+]] = phi i32 [ %n, %while.body.preheader ], [ [[LOOP_DEC:%[^ ]+]], %while.body ]
 ; CHECK: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1)
 ; CHECK: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
@@ -73,10 +81,14 @@ while.end:
 }
 
 ; CHECK-LABEL: do_with_i32_udiv
+; CHECK: entry:
+; CHECK: [[TEST:%[^ ]+]] = call i1 @llvm.test.set.loop.iterations.i32(i32 %n)
+; CHECK: br i1 [[TEST]], label %while.body.preheader, label %while.end
+
 ; CHECK: while.body.preheader:
-; CHECK: call void @llvm.set.loop.iterations.i32(i32 %n)
 ; CHECK-NEXT: br label %while.body
 
+; CHECK: while.body:
 ; CHECK: [[REM:%[^ ]+]] = phi i32 [ %n, %while.body.preheader ], [ [[LOOP_DEC:%[^ ]+]], %while.body ]
 ; CHECK: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1)
 ; CHECK: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
@@ -108,10 +120,14 @@ while.end:
 }
 
 ; CHECK-LABEL: do_with_i32_sdiv
+; CHECK: entry:
+; CHECK: [[TEST:%[^ ]+]] = call i1 @llvm.test.set.loop.iterations.i32(i32 %n)
+; CHECK: br i1 [[TEST]], label %while.body.preheader, label %while.end
+
 ; CHECK: while.body.preheader:
-; CHECK: call void @llvm.set.loop.iterations.i32(i32 %n)
 ; CHECK-NEXT: br label %while.body
 
+; CHECK: while.body:
 ; CHECK: [[REM:%[^ ]+]] = phi i32 [ %n, %while.body.preheader ], [ [[LOOP_DEC:%[^ ]+]], %while.body ]
 ; CHECK: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1)
 ; CHECK: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
@@ -143,7 +159,7 @@ while.end:
 }
 
 ; CHECK-LABEL: do_with_i64_urem
-; CHECK-NOT: llvm.set.loop.iterations
+; CHECK-NOT: llvm.{{.*}}.loop.iterations
 ; CHECK-NOT: llvm.loop.decrement
 define i64 @do_with_i64_urem(i32 %n) {
 entry:
@@ -172,7 +188,7 @@ while.end:
 }
 
 ; CHECK-LABEL: do_with_i64_srem
-; CHECK-NOT: llvm.set.loop.iterations
+; CHECK-NOT: llvm.{{.*}}.loop.iterations
 ; CHECK-NOT: llvm.loop.decrement
 define i64 @do_with_i64_srem(i32 %n) {
 entry:
@@ -201,7 +217,7 @@ while.end:
 }
 
 ; CHECK-LABEL: do_with_i64_udiv
-; CHECK-NOT: llvm.set.loop.iterations
+; CHECK-NOT: llvm.{{.*}}.loop.iterations
 ; CHECK-NOT: llvm.loop.decrement
 define i64 @do_with_i64_udiv(i32 %n) {
 entry:
@@ -230,7 +246,7 @@ while.end:
 }
 
 ; CHECK-LABEL: do_with_i64_sdiv
-; CHECK-NOT: call void @llvm.set.loop.iterations
+; CHECK-NOT: call {{.*}} @llvm.{{.*}}.loop.iterations
 ; CHECK-NOT: call i32 @llvm.loop.decrement
 define i64 @do_with_i64_sdiv(i32 %n) {
 entry:
diff --git a/test/Transforms/HardwareLoops/ARM/fp-emulation.ll b/test/Transforms/HardwareLoops/ARM/fp-emulation.ll
index 8336b989ac7..ddb4c6a4cea 100644
--- a/test/Transforms/HardwareLoops/ARM/fp-emulation.ll
+++ b/test/Transforms/HardwareLoops/ARM/fp-emulation.ll
@@ -2,9 +2,13 @@
 ; RUN: opt -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+soft-float -hardware-loops -disable-arm-loloops=false %s -S -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-SOFT
 
 ; CHECK-LABEL: test_fptosi
-; CHECK: while.body.lr.ph:
+; CHECK-SOFT-NOT: call void @llvm.set.loop.iterations
+
+; CHECK: entry:
 ; CHECK-FP: [[CMP:%[^ ]+]] = icmp ugt i32 %n, 1
 ; CHECK-FP: [[COUNT:%[^ ]+]] = select i1 [[CMP]], i32 %n, i32 1
+
+; CHECK: while.body.lr.ph:
 ; CHECK-FP: call void @llvm.set.loop.iterations.i32(i32 [[COUNT]])
 ; CHECK-FP-NEXT: br label %while.body
 
@@ -13,8 +17,6 @@
 ; CHECK-FP: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
 ; CHECK-FP: br i1 [[CMP]], label %while.body, label %cleanup.loopexit
 
-; CHECK-SOFT-NOT: call void @llvm.set.loop.iterations
-
 define void @test_fptosi(i32 %n, i32** %g, double** %d) {
 entry:
   %n.off = add i32 %n, -1
@@ -53,9 +55,10 @@ cleanup:
 }
 
 ; CHECK-LABEL: test_fptoui
-; CHECK-FP: while.body.lr.ph:
+; CHECK: entry:
 ; CHECK-FP: [[CMP:%[^ ]+]] = icmp ugt i32 %n, 1
 ; CHECK-FP: [[COUNT:%[^ ]+]] = select i1 [[CMP]], i32 %n, i32 1
+; CHECK-FP: while.body.lr.ph:
 ; CHECK-FP: call void @llvm.set.loop.iterations.i32(i32 [[COUNT]])
 ; CHECK-FP-NEXT: br label %while.body
 
@@ -104,10 +107,11 @@ cleanup:
 }
 
 ; CHECK-LABEL: load_store_float
+; CHECK: entry:
+; CHECK:   [[CMP:%[^ ]+]] = icmp ugt i32 %n, 1
+; CHECK:   [[COUNT:%[^ ]+]] = select i1 [[CMP]], i32 %n, i32 1
 ; CHECK: while.body.lr.ph:
-; CHECK: [[CMP:%[^ ]+]] = icmp ugt i32 %n, 1
-; CHECK: [[COUNT:%[^ ]+]] = select i1 [[CMP]], i32 %n, i32 1
-; CHECK: call void @llvm.set.loop.iterations.i32(i32 [[COUNT]])
+; CHECK:   call void @llvm.set.loop.iterations.i32(i32 [[COUNT]])
 ; CHECK-NEXT: br label %while.body
 
 ; CHECK: [[REM:%[^ ]+]] = phi i32 [ [[COUNT]], %while.body.lr.ph ], [ [[LOOP_DEC:%[^ ]+]], %if.end4 ]
@@ -152,12 +156,11 @@ cleanup:
 }
 
 ; CHECK-LABEL: fp_add
-; CHECK: while.body.lr.ph:
-
 ; CHECK-SOFT-NOT: call void @llvm.set.loop.iterations
-
+; CHECK: entry:
 ; CHECK-FP: [[CMP:%[^ ]+]] = icmp ugt i32 %n, 1
 ; CHECK-FP: [[COUNT:%[^ ]+]] = select i1 [[CMP]], i32 %n, i32 1
+; CHECK: while.body.lr.ph:
 ; CHECK-FP: call void @llvm.set.loop.iterations.i32(i32 [[COUNT]])
 ; CHECK: br label %while.body
 
diff --git a/test/Transforms/HardwareLoops/ARM/simple-do.ll b/test/Transforms/HardwareLoops/ARM/simple-do.ll
index ca18d892197..da169eebfd1 100644
--- a/test/Transforms/HardwareLoops/ARM/simple-do.ll
+++ b/test/Transforms/HardwareLoops/ARM/simple-do.ll
@@ -3,7 +3,7 @@
 ; RUN: opt -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=-lob -hardware-loops %s -S -o - | FileCheck %s --check-prefix=DISABLED
 ; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -disable-arm-loloops=false %s -o - | FileCheck %s --check-prefix=CHECK-LLC
 
-; DISABLED-NOT: llvm.set.loop.iterations
+; DISABLED-NOT: llvm.{{.*}}.loop.iterations
 ; DISABLED-NOT: llvm.loop.decrement
 
 @g = common local_unnamed_addr global i32* null, align 4
@@ -46,9 +46,12 @@ while.end:
 }
 
 ; CHECK-LABEL: do_inc1
+; CHECK: entry:
+; CHECK: [[TEST:%[^ ]+]] = call i1 @llvm.test.set.loop.iterations.i32(i32 %n)
+; CHECK: br i1 [[TEST]], label %while.body.lr.ph, label %while.end
+
 ; CHECK: while.body.lr.ph:
-; CHECK: call void @llvm.set.loop.iterations.i32(i32 %n)
-; CHECK-NEXT: br label %while.body
+; CHECK: br label %while.body
 
 ; CHECK: [[REM:%[^ ]+]] = phi i32 [ %n, %while.body.lr.ph ], [ [[LOOP_DEC:%[^ ]+]], %while.body ]
 ; CHECK: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1)
@@ -56,12 +59,12 @@ while.end:
 ; CHECK: br i1 [[CMP]], label %while.body, label %while.end.loopexit
 
 ; CHECK-LLC-LABEL:do_inc1:
-; CHECK-LLC:        dls lr,
+; CHECK-LLC:        wls lr, {{.*}}, [[LOOP_EXIT:\.LBB[0-9_]+]]
 ; CHECK-LLC-NOT:    mov lr,
 ; CHECK-LLC:      [[LOOP_HEADER:\.LBB[0-9_]+]]:
 ; CHECK-LLC:        le lr, [[LOOP_HEADER]]
 ; CHECK-LLC-NOT:    b [[LOOP_EXIT:\.LBB[0-9_]+]]
-; CHECK-LLC:      [[LOOP_EXIT:\.LBB[0-9_]+]]:
+; CHECK-LLC:      [[LOOP_EXIT]]:
 
 define i32 @do_inc1(i32 %n) {
 entry:
@@ -91,26 +94,26 @@ while.end:
 }
 
 ; CHECK-LABEL: do_inc2
-; CHECK: while.body.lr.ph:
+; CHECK: entry:
 ; CHECK: [[ROUND:%[^ ]+]] = add i32 %n, -1
 ; CHECK: [[HALVE:%[^ ]+]] = lshr i32 [[ROUND]], 1
 ; CHECK: [[COUNT:%[^ ]+]] = add nuw i32 [[HALVE]], 1
-; CHECK: call void @llvm.set.loop.iterations.i32(i32 [[COUNT]])
-; CHECK-NEXT: br label %while.body
 
-; CHECK: [[REM:%[^ ]+]] = phi i32 [ [[COUNT]], %while.body.lr.ph ], [ [[LOOP_DEC:%[^ ]+]], %while.body ]
-; CHECK: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1)
-; CHECK: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
-; CHECK: br i1 [[CMP]], label %while.body, label %while.end.loopexit
+; CHECK: while.body.lr.ph:
+; CHECK:   call void @llvm.set.loop.iterations.i32(i32 [[COUNT]])
+; CHECK:   br label %while.body
+; CHECK: while.body:
+; CHECK:   [[REM:%[^ ]+]] = phi i32 [ [[COUNT]], %while.body.lr.ph ], [ [[LOOP_DEC:%[^ ]+]], %while.body ]
+; CHECK:   [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1)
+; CHECK:   [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
+; CHECK:   br i1 [[CMP]], label %while.body, label %while.end.loopexit
 
 ; CHECK-LLC:      do_inc2:
 ; CHECK-LLC-NOT:    mov lr,
-; CHECK-LLC:        dls lr,
+; CHECK-LLC:        dls lr, {{.*}}
 ; CHECK-LLC-NOT:    mov lr,
 ; CHECK-LLC:      [[LOOP_HEADER:\.LBB[0-9._]+]]:
 ; CHECK-LLC:        le lr, [[LOOP_HEADER]]
-; CHECK-LLC-NOT:    b [[LOOP_EXIT:\.LBB[0-9._]+]]
-; CHECK-LLC:      [[LOOP_EXIT:\.LBB[0-9_]+]]:
 
 define i32 @do_inc2(i32 %n) {
 entry:
@@ -141,15 +144,17 @@ while.end:
 
 ; CHECK-LABEL: do_dec2
 
-; CHECK: while.body.lr.ph:
+; CHECK: entry:
 ; CHECK: [[ROUND:%[^ ]+]] = add i32 %n, 1
 ; CHECK: [[CMP:%[^ ]+]] = icmp slt i32 %n, 2
 ; CHECK: [[SMIN:%[^ ]+]] = select i1 [[CMP]], i32 %n, i32 2
 ; CHECK: [[SUB:%[^ ]+]] = sub i32 [[ROUND]], [[SMIN]]
 ; CHECK: [[HALVE:%[^ ]+]] = lshr i32 [[SUB]], 1
 ; CHECK: [[COUNT:%[^ ]+]] = add nuw i32 [[HALVE]], 1
+
+; CHECK: while.body.lr.ph:
 ; CHECK: call void @llvm.set.loop.iterations.i32(i32 [[COUNT]])
-; CHECK-NEXT: br label %while.body
+; CHECK: br label %while.body
 
 ; CHECK: [[REM:%[^ ]+]] = phi i32 [ [[COUNT]], %while.body.lr.ph ], [ [[LOOP_DEC:%[^ ]+]], %while.body ]
 ; CHECK: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1)
@@ -158,12 +163,11 @@ while.end:
 
 ; CHECK-LLC:      do_dec2
 ; CHECK-LLC-NOT:    mov lr,
-; CHECK-LLC:        dls lr,
+; CHECK-LLC:        dls lr, {{.*}}
 ; CHECK-LLC-NOT:    mov lr,
 ; CHECK-LLC:      [[LOOP_HEADER:\.LBB[0-9_]+]]:
 ; CHECK-LLC:        le lr, [[LOOP_HEADER]]
 ; CHECK-LLC-NOT:    b .
-; CHECK-LLC:      @ %while.end
 define i32 @do_dec2(i32 %n) {
 entry:
   %cmp6 = icmp sgt i32 %n, 0
diff --git a/test/Transforms/HardwareLoops/ARM/structure.ll b/test/Transforms/HardwareLoops/ARM/structure.ll
index 03c9e4071cf..198cbef4681 100644
--- a/test/Transforms/HardwareLoops/ARM/structure.ll
+++ b/test/Transforms/HardwareLoops/ARM/structure.ll
@@ -109,6 +109,35 @@ while.end:                                        ; preds = %while.body
   ret i32 0
 }
 
+; CHECK-LABEL: pre_existing_test_set
+; CHECK: call i1 @llvm.test.set.loop.iterations
+; CHECK-NOT: llvm.set{{.*}}.loop.iterations
+; CHECK: call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %0, i32 1)
+; CHECK-NOT: call i32 @llvm.loop.decrement.reg
+define i32 @pre_existing_test_set(i32 %n, i32* nocapture %p, i32* nocapture readonly %q) {
+entry:
+  %guard = call i1 @llvm.test.set.loop.iterations.i32(i32 %n)
+  br i1 %guard, label %while.preheader, label %while.end
+
+while.preheader:
+  br label %while.body
+
+while.body:                                       ; preds = %while.body, %while.preheader
+  %q.addr.05 = phi i32* [ %incdec.ptr, %while.body ], [ %q, %while.preheader ]
+  %p.addr.04 = phi i32* [ %incdec.ptr1, %while.body ], [ %p, %while.preheader ]
+  %0 = phi i32 [ %n, %while.preheader ], [ %2, %while.body ]
+  %incdec.ptr = getelementptr inbounds i32, i32* %q.addr.05, i32 1
+  %1 = load i32, i32* %q.addr.05, align 4
+  %incdec.ptr1 = getelementptr inbounds i32, i32* %p.addr.04, i32 1
+  store i32 %1, i32* %p.addr.04, align 4
+  %2 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %0, i32 1)
+  %3 = icmp ne i32 %2, 0
+  br i1 %3, label %while.body, label %while.end
+
+while.end:                                        ; preds = %while.body, %entry
+  ret i32 0
+}
+
 ; CHECK-LABEL: pre_existing_inner
 ; CHECK-NOT: llvm.set.loop.iterations
 ; CHECK: while.cond1.preheader.us:
@@ -223,14 +252,16 @@ exit:
 }
 
 ; CHECK-LABEL: search
+; CHECK: entry:
+; CHECK:   [[TEST:%[^ ]+]] = call i1 @llvm.test.set.loop.iterations.i32(i32 %N)
+; CHECK:   br i1 [[TEST]], label %for.body.preheader, label %for.cond.cleanup
 ; CHECK: for.body.preheader:
-; CHECK: call void @llvm.set.loop.iterations.i32(i32 %N)
-; CHECK: br label %for.body
+; CHECK:   br label %for.body
 ; CHECK: for.body:
 ; CHECK: for.inc:
-; CHECK: [[LOOP_DEC:%[^ ]+]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32
-; CHECK: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
-; CHECK: br i1 [[CMP]], label %for.body, label %for.cond.cleanup
+; CHECK:   [[LOOP_DEC:%[^ ]+]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32
+; CHECK:   [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
+; CHECK:   br i1 [[CMP]], label %for.body, label %for.cond.cleanup
 define i32 @search(i8* nocapture readonly %c, i32 %N) {
 entry:
   %cmp11 = icmp eq i32 %N, 0
@@ -276,16 +307,16 @@ for.inc:                                          ; preds = %sw.bb, %sw.bb1, %fo
 ; CHECK: call i32 @llvm.loop.decrement.reg.i32.i32.i32(
 
 ; TODO: We should be able to support the unrolled loop body.
-; CHECK-UNROLL-LABEL: unroll_inc_int:
+; CHECK-UNROLL-LABEL: unroll_inc_int
 ; CHECK-UNROLL:     [[PREHEADER:.LBB[0-9_]+]]: @ %for.body.preheader
 ; CHECK-UNROLL-NOT: dls
 ; CHECK-UNROLL:     [[LOOP:.LBB[0-9_]+]]: @ %for.body
 ; CHECK-UNROLL-NOT: le lr, [[LOOP]]
 ; CHECK-UNROLL:     bne [[LOOP]]
-; CHECK-UNROLL:     %for.body.epil.preheader
-; CHECK-UNROLL:     dls
-; CHECK-UNROLL:     %for.body.epil
-; CHECK-UNROLL:     le
+; CHECK-UNROLL:     wls lr, lr, [[EXIT:.LBB[0-9_]+]]
+; CHECK-UNROLL:     [[EPIL:.LBB[0-9_]+]]:
+; CHECK-UNROLL:     le lr, [[EPIL]]
+; CHECK-UNROLL-NEXT: [[EXIT]]
 
 define void @unroll_inc_int(i32* nocapture %a, i32* nocapture readonly %b, i32* nocapture readonly %c, i32 %N) {
 entry:
@@ -310,24 +341,27 @@ for.body:
 }
 
 ; CHECK-LABEL: unroll_inc_unsigned
-; CHECK: call void @llvm.set.loop.iterations.i32(i32 %N)
+; CHECK: call i1 @llvm.test.set.loop.iterations.i32(i32 %N)
 ; CHECK: call i32 @llvm.loop.decrement.reg.i32.i32.i32(
 
 ; CHECK-LLC-LABEL: unroll_inc_unsigned:
-; CHECK-LLC: dls lr, [[COUNT:r[0-9]+]]
-; CHECK-LLC: le  lr
+; CHECK-LLC: wls lr, r3, [[EXIT:.LBB[0-9_]+]]
+; CHECK-LLC: [[HEADER:.LBB[0-9_]+]]:
+; CHECK-LLC: le lr, [[HEADER]]
+; CHECK-LLC-NEXT: [[EXIT]]:
 
 ; TODO: We should be able to support the unrolled loop body.
-; CHECK-UNROLL-LABEL: unroll_inc_unsigned:
+; CHECK-UNROLL-LABEL: unroll_inc_unsigned
 ; CHECK-UNROLL:     [[PREHEADER:.LBB[0-9_]+]]: @ %for.body.preheader
 ; CHECK-UNROLL-NOT: dls
 ; CHECK-UNROLL:     [[LOOP:.LBB[0-9_]+]]: @ %for.body
 ; CHECK-UNROLL-NOT: le lr, [[LOOP]]
 ; CHECK-UNROLL:     bne [[LOOP]]
-; CHECK-UNROLL:     %for.body.epil.preheader
-; CHECK-UNROLL:     dls
-; CHECK-UNROLL:     %for.body.epil
-; CHECK-UNROLL:     le
+; CHECK-UNROLL:     wls lr, lr, [[EPIL_EXIT:.LBB[0-9_]+]]
+; CHECK-UNROLL: [[EPIL:.LBB[0-9_]+]]:
+; CHECK-UNROLL:     le lr, [[EPIL]]
+; CHECK-UNROLL: [[EPIL_EXIT]]:
+; CHECK-UNROLL:     pop
 define void @unroll_inc_unsigned(i32* nocapture %a, i32* nocapture readonly %b, i32* nocapture readonly %c, i32 %N) {
 entry:
   %cmp8 = icmp eq i32 %N, 0
@@ -357,15 +391,21 @@ for.body:
 ; TODO: An unnecessary register is being held to hold COUNT, lr should just
 ; be used instead.
 ; CHECK-LLC-LABEL: unroll_dec_int:
-; CHECK-LLC: dls lr, [[COUNT:r[0-9]+]]
-; CHECK-LLC: subs  [[COUNT]], #1
-; CHECK-LLC: le  lr
-
-; CHECK-UNROLL-LABEL: unroll_dec_int
-; CHECK-UNROLL: dls lr
-; CHECK-UNROLL: le lr
-; CHECK-UNROLL: dls lr
-; CHECK-UNROLL: le lr
+; CHECK-LLC: dls lr, r3
+; CHECK-LLC-NOT: mov lr, r3
+; CHECK-LLC: [[HEADER:.LBB[0-9_]+]]:
+; CHECK-LLC: le lr, [[HEADER]]
+
+; CHECK-UNROLL-LABEL: unroll_dec_int:
+; CHECK-UNROLL:         wls lr, {{.*}}, [[PROLOGUE_EXIT:.LBB[0-9_]+]]
+; CHECK-UNROLL-NEXT: [[PROLOGUE:.LBB[0-9_]+]]:
+; CHECK-UNROLL:         le lr, [[PROLOGUE]]
+; CHECK-UNROLL-NEXT: [[PROLOGUE_EXIT]]:
+; CHECK-UNROLL:         dls lr, lr
+; CHECK-UNROLL:      [[BODY:.LBB[0-9_]+]]:
+; CHECK-UNROLL:         le lr, [[BODY]]
+; CHECK-UNROLL-NOT:     b
+; CHECK-UNROLL:         pop
 define void @unroll_dec_int(i32* nocapture %a, i32* nocapture readonly %b, i32* nocapture readonly %c, i32 %N) {
 entry:
   %cmp8 = icmp sgt i32 %N, 0
@@ -389,5 +429,6 @@ for.body:
 }
 
 declare void @llvm.set.loop.iterations.i32(i32) #0
+declare i1 @llvm.test.set.loop.iterations.i32(i32) #0
 declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #0