From d0c143de76255379185e6741c8106355d61c4142 Mon Sep 17 00:00:00 2001
From: Sam Parker
Date: Thu, 7 Feb 2019 13:32:54 +0000
Subject: [PATCH] [LSR] Generate cross iteration indexes

Modify GenerateConstantOffsetsImpl to create offsets that can be used
by indexed addressing modes. If formulae can be generated which result
in the constant offset being the same size as the recurrence, we can
generate a pre-indexed access. This allows the pointer to be updated via
the single pre-indexed access so that (hopefully) no add/subs are
required to update it for the next iteration. For small cores, this can
significantly improve the performance of DSP-like loops.

Differential Revision: https://reviews.llvm.org/D55373

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@353403 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Analysis/TargetTransformInfo.h  |    8 +
 .../llvm/Analysis/TargetTransformInfoImpl.h  |    2 +
 lib/Analysis/TargetTransformInfo.cpp         |    4 +
 lib/Target/ARM/ARMTargetTransformInfo.h      |    6 +
 lib/Transforms/Scalar/LoopStrengthReduce.cpp |   90 +-
 test/CodeGen/ARM/dsp-loop-indexing.ll        |  310 +++++
 test/CodeGen/ARM/loop-align-cortex-m.ll      |    4 +-
 test/CodeGen/ARM/loop-indexing.ll            | 1190 +++++++++++++++++
 .../LoopStrengthReduce/ARM/complexity.ll     |   24 +-
 9 files changed, 1598 insertions(+), 40 deletions(-)
 create mode 100644 test/CodeGen/ARM/dsp-loop-indexing.ll
 create mode 100644 test/CodeGen/ARM/loop-indexing.ll

diff --git a/include/llvm/Analysis/TargetTransformInfo.h b/include/llvm/Analysis/TargetTransformInfo.h
index e20ccc9002b..60dbf6775a6 100644
--- a/include/llvm/Analysis/TargetTransformInfo.h
+++ b/include/llvm/Analysis/TargetTransformInfo.h
@@ -486,6 +486,10 @@ public:
   /// addressing mode expressions.
   bool shouldFavorPostInc() const;
 
+  /// Return true if LSR should make efforts to generate indexed addressing
+  /// modes that operate across loop iterations.
+ bool shouldFavorBackedgeIndex(const Loop *L) const; + /// Return true if the target supports masked load/store /// AVX2 and AVX-512 targets allow masks for consecutive load and store bool isLegalMaskedStore(Type *DataType) const; @@ -1065,6 +1069,7 @@ public: TargetTransformInfo::LSRCost &C2) = 0; virtual bool canMacroFuseCmp() = 0; virtual bool shouldFavorPostInc() const = 0; + virtual bool shouldFavorBackedgeIndex(const Loop *L) const = 0; virtual bool isLegalMaskedStore(Type *DataType) = 0; virtual bool isLegalMaskedLoad(Type *DataType) = 0; virtual bool isLegalMaskedScatter(Type *DataType) = 0; @@ -1301,6 +1306,9 @@ public: bool shouldFavorPostInc() const override { return Impl.shouldFavorPostInc(); } + bool shouldFavorBackedgeIndex(const Loop *L) const override { + return Impl.shouldFavorBackedgeIndex(L); + } bool isLegalMaskedStore(Type *DataType) override { return Impl.isLegalMaskedStore(DataType); } diff --git a/include/llvm/Analysis/TargetTransformInfoImpl.h b/include/llvm/Analysis/TargetTransformInfoImpl.h index bd66e24aeb5..4705933750d 100644 --- a/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -253,6 +253,8 @@ public: bool shouldFavorPostInc() const { return false; } + bool shouldFavorBackedgeIndex(const Loop *L) const { return false; } + bool isLegalMaskedStore(Type *DataType) { return false; } bool isLegalMaskedLoad(Type *DataType) { return false; } diff --git a/lib/Analysis/TargetTransformInfo.cpp b/lib/Analysis/TargetTransformInfo.cpp index 7155972724a..7e453bfa1df 100644 --- a/lib/Analysis/TargetTransformInfo.cpp +++ b/lib/Analysis/TargetTransformInfo.cpp @@ -162,6 +162,10 @@ bool TargetTransformInfo::shouldFavorPostInc() const { return TTIImpl->shouldFavorPostInc(); } +bool TargetTransformInfo::shouldFavorBackedgeIndex(const Loop *L) const { + return TTIImpl->shouldFavorBackedgeIndex(L); +} + bool TargetTransformInfo::isLegalMaskedStore(Type *DataType) const { return TTIImpl->isLegalMaskedStore(DataType); } diff --git a/lib/Target/ARM/ARMTargetTransformInfo.h b/lib/Target/ARM/ARMTargetTransformInfo.h index d8e91344c0b..90842643c36 100644 --- a/lib/Target/ARM/ARMTargetTransformInfo.h +++ b/lib/Target/ARM/ARMTargetTransformInfo.h @@ -93,6 +93,12 @@ public: bool enableInterleavedAccessVectorization() { return true; } + bool shouldFavorBackedgeIndex(const Loop *L) const { + if (L->getHeader()->getParent()->optForSize()) + return false; + return ST->isMClass() && ST->isThumb2() && L->getNumBlocks() == 1; + } + /// Floating-point computation using ARMv8 AArch32 Advanced /// SIMD instructions remains unchanged from ARMv7. Only AArch64 SIMD /// is IEEE-754 compliant, but it's not covered in this target. 
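As a rough illustration of the codegen this hook is meant to unlock (example
registers and offsets, not taken from the patch): in a single-block M-class
Thumb-2 loop walking an i32 array with an unroll factor of two, the pointer
update needs its own add today:

    ldr   r3, [r0]
    ldr   r4, [r0, #4]
    adds  r0, r0, #8

With backedge indexing, LSR can bias the base back by one step (-8) so that
the first access of each iteration is pre-indexed and performs the update
itself:

    ldr   r3, [r0, #8]!
    ldr   r4, [r0, #4]

The LoopStrengthReduce changes below generate and cost such formulae, and the
new dsp-loop-indexing.ll test checks for exactly this pattern.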
diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index eb6b1f24a7f..04a25052635 100644
--- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -154,6 +154,10 @@ static cl::opt<bool> FilterSameScaledReg(
     cl::desc("Narrow LSR search space by filtering non-optimal formulae"
              " with the same ScaledReg and Scale"));
 
+static cl::opt<bool> EnableBackedgeIndexing(
+  "lsr-backedge-indexing", cl::Hidden, cl::init(true),
+  cl::desc("Enable the generation of cross iteration indexed memops"));
+
 static cl::opt<unsigned> ComplexityLimit(
   "lsr-complexity-limit", cl::Hidden,
   cl::init(std::numeric_limits<uint16_t>::max()),
@@ -1052,12 +1056,12 @@ public:
   void dump() const;
 
 private:
-  void RateRegister(const SCEV *Reg,
+  void RateRegister(const Formula &F, const SCEV *Reg,
                     SmallPtrSetImpl<const SCEV *> &Regs,
                     const Loop *L,
                     ScalarEvolution &SE, DominatorTree &DT,
                     const TargetTransformInfo &TTI);
-  void RatePrimaryRegister(const SCEV *Reg,
+  void RatePrimaryRegister(const Formula &F, const SCEV *Reg,
                            SmallPtrSetImpl<const SCEV *> &Regs,
                            const Loop *L,
                            ScalarEvolution &SE, DominatorTree &DT,
@@ -1208,7 +1212,7 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
                                  Instruction *Fixup = nullptr);
 
 /// Tally up interesting quantities from the given register.
-void Cost::RateRegister(const SCEV *Reg,
+void Cost::RateRegister(const Formula &F, const SCEV *Reg,
                         SmallPtrSetImpl<const SCEV *> &Regs,
                         const Loop *L,
                         ScalarEvolution &SE, DominatorTree &DT,
@@ -1235,16 +1239,24 @@ void Cost::RateRegister(const SCEV *Reg,
     }
 
     unsigned LoopCost = 1;
-    if (TTI.shouldFavorPostInc()) {
-      const SCEV *LoopStep = AR->getStepRecurrence(SE);
-      if (isa<SCEVConstant>(LoopStep)) {
-        // Check if a post-indexed load/store can be used.
-        if (TTI.isIndexedLoadLegal(TTI.MIM_PostInc, AR->getType()) ||
-            TTI.isIndexedStoreLegal(TTI.MIM_PostInc, AR->getType())) {
+    if (TTI.isIndexedLoadLegal(TTI.MIM_PostInc, AR->getType()) ||
+        TTI.isIndexedStoreLegal(TTI.MIM_PostInc, AR->getType())) {
+
+      // If the step size matches the base offset, we could use pre-indexed
+      // addressing.
+      if (TTI.shouldFavorBackedgeIndex(L)) {
+        if (auto *Step = dyn_cast<SCEVConstant>(AR->getStepRecurrence(SE)))
+          if (Step->getAPInt() == F.BaseOffset)
+            LoopCost = 0;
+      }
+
+      if (TTI.shouldFavorPostInc()) {
+        const SCEV *LoopStep = AR->getStepRecurrence(SE);
+        if (isa<SCEVConstant>(LoopStep)) {
           const SCEV *LoopStart = AR->getStart();
           if (!isa<SCEVConstant>(LoopStart) &&
-              SE.isLoopInvariant(LoopStart, L))
-            LoopCost = 0;
+              SE.isLoopInvariant(LoopStart, L))
+            LoopCost = 0;
         }
       }
     }
@@ -1254,7 +1266,7 @@ void Cost::RateRegister(const SCEV *Reg,
     // TODO: The non-affine case isn't precisely modeled here.
     if (!AR->isAffine() || !isa<SCEVConstant>(AR->getOperand(1))) {
       if (!Regs.count(AR->getOperand(1))) {
-        RateRegister(AR->getOperand(1), Regs, L, SE, DT, TTI);
+        RateRegister(F, AR->getOperand(1), Regs, L, SE, DT, TTI);
         if (isLoser())
           return;
       }
@@ -1278,7 +1290,7 @@ void Cost::RateRegister(const SCEV *Reg,
 /// Record this register in the set. If we haven't seen it before, rate
 /// it. Optional LoserRegs provides a way to declare any formula that refers to
 /// one of those regs an instant loser.
-void Cost::RatePrimaryRegister(const SCEV *Reg,
+void Cost::RatePrimaryRegister(const Formula &F, const SCEV *Reg,
                                SmallPtrSetImpl<const SCEV *> &Regs,
                                const Loop *L,
                                ScalarEvolution &SE, DominatorTree &DT,
@@ -1289,7 +1301,7 @@ void Cost::RatePrimaryRegister(const SCEV *Reg,
     return;
   }
   if (Regs.insert(Reg).second) {
-    RateRegister(Reg, Regs, L, SE, DT, TTI);
+    RateRegister(F, Reg, Regs, L, SE, DT, TTI);
     if (LoserRegs && isLoser())
       LoserRegs->insert(Reg);
   }
@@ -1313,7 +1325,7 @@ void Cost::RateFormula(const TargetTransformInfo &TTI,
       Lose();
       return;
     }
-    RatePrimaryRegister(ScaledReg, Regs, L, SE, DT, LoserRegs, TTI);
+    RatePrimaryRegister(F, ScaledReg, Regs, L, SE, DT, LoserRegs, TTI);
     if (isLoser())
       return;
   }
@@ -1322,7 +1334,7 @@ void Cost::RateFormula(const TargetTransformInfo &TTI,
       Lose();
      return;
    }
-    RatePrimaryRegister(BaseReg, Regs, L, SE, DT, LoserRegs, TTI);
+    RatePrimaryRegister(F, BaseReg, Regs, L, SE, DT, LoserRegs, TTI);
     if (isLoser())
       return;
   }
@@ -1889,6 +1901,7 @@ class LSRInstance {
   LoopInfo &LI;
   const TargetTransformInfo &TTI;
   Loop *const L;
+  bool FavorBackedgeIndex = false;
   bool Changed = false;
 
   /// This is the insert position that the current loop's induction variable
@@ -2803,7 +2816,7 @@ bool IVChain::isProfitableIncrement(const SCEV *OperExpr,
 /// TODO: Consider IVInc free if it's already used in another chains.
 static bool isProfitableChain(IVChain &Chain, SmallPtrSetImpl<Instruction*> &Users,
-                              ScalarEvolution &SE, const TargetTransformInfo &TTI) {
+                              ScalarEvolution &SE) {
   if (StressIVChain)
     return true;
@@ -3063,7 +3076,7 @@ void LSRInstance::CollectChains() {
   for (unsigned UsersIdx = 0, NChains = IVChainVec.size();
        UsersIdx < NChains; ++UsersIdx) {
     if (!isProfitableChain(IVChainVec[UsersIdx],
-                           ChainUsersVec[UsersIdx].FarUsers, SE, TTI))
+                           ChainUsersVec[UsersIdx].FarUsers, SE))
       continue;
     // Preserve the chain at UsesIdx.
     if (ChainIdx != UsersIdx)
@@ -3077,7 +3090,7 @@ void LSRInstance::FinalizeChain(IVChain &Chain) {
   assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
   LLVM_DEBUG(dbgs() << "Final Chain: " << *Chain.Incs[0].UserInst << "\n");
- 
+
   for (const IVInc &Inc : Chain) {
     LLVM_DEBUG(dbgs() << "        Inc: " << *Inc.UserInst << "\n");
     auto UseI = find(Inc.UserInst->operands(), Inc.IVOperand);
@@ -3737,10 +3750,11 @@ void LSRInstance::GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx,
 void LSRInstance::GenerateConstantOffsetsImpl(
     LSRUse &LU, unsigned LUIdx, const Formula &Base,
     const SmallVectorImpl<int64_t> &Worklist, size_t Idx, bool IsScaledReg) {
-  const SCEV *G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
-  for (int64_t Offset : Worklist) {
+
+  auto GenerateOffset = [&](const SCEV *G, int64_t Offset) {
     Formula F = Base;
     F.BaseOffset = (uint64_t)Base.BaseOffset - Offset;
+
     if (isLegalUse(TTI, LU.MinOffset - Offset, LU.MaxOffset - Offset, LU.Kind,
                    LU.AccessTy, F)) {
       // Add the offset to the base register.
@@ -3760,7 +3774,35 @@ void LSRInstance::GenerateConstantOffsetsImpl(
 
       (void)InsertFormula(LU, LUIdx, F);
     }
+  };
+
+  const SCEV *G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
+
+  // With constant offsets and constant steps, we can generate pre-inc
+  // accesses by having the offset equal the step. So, for access #0 with a
+  // step of 8, we generate a G - 8 base which would require the first access
+  // to be ((G - 8) + 8),+,8. The pre-indexed access then updates the pointer
+  // for itself and hopefully becomes the base for other accesses. This means
+  // that a single pre-indexed access can be generated to become the new
+  // base pointer for each iteration of the loop, resulting in no extra add/sub
+  // instructions for pointer updating.
+  if (FavorBackedgeIndex && LU.Kind == LSRUse::Address) {
+    if (auto *GAR = dyn_cast<SCEVAddRecExpr>(G)) {
+      if (auto *StepRec =
+          dyn_cast<SCEVConstant>(GAR->getStepRecurrence(SE))) {
+        const APInt &StepInt = StepRec->getAPInt();
+        int64_t Step = StepInt.isNegative() ?
+          StepInt.getSExtValue() : StepInt.getZExtValue();
+
+        for (int64_t Offset : Worklist) {
+          Offset -= Step;
+          GenerateOffset(G, Offset);
+        }
+      }
+    }
   }
+
+  for (int64_t Offset : Worklist)
+    GenerateOffset(G, Offset);
 
   int64_t Imm = ExtractImmediate(G, SE);
   if (G->isZero() || Imm == 0)
@@ -4417,7 +4459,7 @@ void LSRInstance::NarrowSearchSpaceByDetectingSupersets() {
 /// When there are many registers for expressions like A, A+1, A+2, etc.,
 /// allocate a single register for them.
 void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() {
-  if (EstimateSearchSpaceComplexity() < ComplexityLimit) 
+  if (EstimateSearchSpaceComplexity() < ComplexityLimit)
     return;
 
   LLVM_DEBUG(
@@ -5378,7 +5420,9 @@ void LSRInstance::ImplementSolution(
 LSRInstance::LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE,
                          DominatorTree &DT, LoopInfo &LI,
                          const TargetTransformInfo &TTI)
-    : IU(IU), SE(SE), DT(DT), LI(LI), TTI(TTI), L(L) {
+    : IU(IU), SE(SE), DT(DT), LI(LI), TTI(TTI), L(L),
+      FavorBackedgeIndex(EnableBackedgeIndexing &&
+                         TTI.shouldFavorBackedgeIndex(L)) {
   // If LoopSimplify form is not available, stay out of trouble.
   if (!L->isLoopSimplifyForm())
     return;
diff --git a/test/CodeGen/ARM/dsp-loop-indexing.ll b/test/CodeGen/ARM/dsp-loop-indexing.ll
new file mode 100644
index 00000000000..7b80b400af4
--- /dev/null
+++ b/test/CodeGen/ARM/dsp-loop-indexing.ll
@@ -0,0 +1,310 @@
+; RUN: llc -mtriple=thumbv7em -mattr=+fp-armv8 %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-DEFAULT
+; RUN: llc -mtriple=thumbv8m.main -mattr=+fp-armv8,+dsp %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-DEFAULT
+; RUN: llc -mtriple=thumbv8m.main -mattr=+fp-armv8,+dsp -lsr-backedge-indexing=false %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLED
+; RUN: llc -mtriple=thumbv8 %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLED
+; RUN: llc -mtriple=thumbv8m.main -mattr=+fp-armv8,+dsp -lsr-complexity-limit=2147483647 %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-COMPLEX
+
+; CHECK-LABEL: test_qadd_2
+; CHECK: @ %loop
+
+; CHECK-DEFAULT: ldr{{.*}}, #4]
+; CHECK-DEFAULT: ldr{{.*}}, #4]
+; CHECK-DEFAULT: str{{.*}}, #4]
+; CHECK-DEFAULT: ldr{{.*}}, #8]!
+; CHECK-DEFAULT: ldr{{.*}}, #8]!
+; CHECK-DEFAULT: str{{.*}}, #8]!
+
+; CHECK-COMPLEX: ldr{{.*}}, #8]!
+; CHECK-COMPLEX: ldr{{.*}}, #8]!
+; CHECK-COMPLEX: str{{.*}}, #8]!
+; CHECK-COMPLEX: ldr{{.*}}, #4]
+; CHECK-COMPLEX: ldr{{.*}}, #4]
+; CHECK-COMPLEX: str{{.*}}, #4]
+
+; DISABLED-NOT: ldr{{.*}}]!
+; DISABLED-NOT: str{{.*}}]!
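+;
+; The pre-indexed accesses (#8]!) come from biasing each base pointer back
+; by one step (-8): they touch one element of the iteration and advance
+; their base at the same time, while the other accesses use a plain
+; immediate offset (#4]) from the same base.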
+ +define void @test_qadd_2(i32* %a.array, i32* %b.array, i32* %out.array, i32 %N) { +entry: + br label %loop + +loop: + %i = phi i32 [ 0, %entry ], [ %i.next, %loop ] + %idx.1 = phi i32 [ 0, %entry ], [ %idx.next, %loop ] + %gep.a.1 = getelementptr inbounds i32, i32* %a.array, i32 %idx.1 + %a.1 = load i32, i32* %gep.a.1 + %gep.b.1 = getelementptr inbounds i32, i32* %b.array, i32 %idx.1 + %b.1 = load i32, i32* %gep.b.1 + %qadd.1 = call i32 @llvm.arm.qadd(i32 %a.1, i32 %b.1) + %addr.1 = getelementptr inbounds i32, i32* %out.array, i32 %idx.1 + store i32 %qadd.1, i32* %addr.1 + %idx.2 = or i32 %idx.1, 1 + %gep.a.2 = getelementptr inbounds i32, i32* %a.array, i32 %idx.2 + %a.2 = load i32, i32* %gep.a.2 + %gep.b.2 = getelementptr inbounds i32, i32* %b.array, i32 %idx.2 + %b.2 = load i32, i32* %gep.b.2 + %qadd.2 = call i32 @llvm.arm.qadd(i32 %a.2, i32 %b.2) + %addr.2 = getelementptr inbounds i32, i32* %out.array, i32 %idx.2 + store i32 %qadd.2, i32* %addr.2 + %i.next = add nsw nuw i32 %i, -2 + %idx.next = add nsw nuw i32 %idx.1, 2 + %cmp = icmp ult i32 %i.next, %N + br i1 %cmp, label %loop, label %exit + +exit: + ret void +} + +; CHECK-LABEL: test_qadd_2_backwards +; TODO: Indexes should be generated. + +; CHECK: @ %loop + +; CHECK-DEFAULT: ldr{{.*}}, +; CHECK-DEFAULT: ldr{{.*}}, +; CHECK-DEFAULT: str{{.*}}, +; CHECK-DEFAULT: ldr{{.*}}, #-4] +; CHECK-DEFAULT: ldr{{.*}}, #-4] +; CHECK-DEFAULT: sub{{.*}}, #8 +; CHECK-DEFAULT: str{{.*}}, #-4] +; CHECK-DEFAULT: sub{{.*}}, #8 + +; CHECK-COMPLEX: ldr{{.*}} lsl #2] +; CHECK-COMPLEX: ldr{{.*}} lsl #2] +; CHECK-COMPLEX: str{{.*}} lsl #2] +; CHECK-COMPLEX: ldr{{.*}} lsl #2] +; CHECK-COMPLEX: ldr{{.*}} lsl #2] +; CHECK-COMPLEX: str{{.*}} lsl #2] + +; DISABLED-NOT: ldr{{.*}}]! +; DISABLED-NOT: str{{.*}}]! + +define void @test_qadd_2_backwards(i32* %a.array, i32* %b.array, i32* %out.array, i32 %N) { +entry: + br label %loop + +loop: + %i = phi i32 [ 0, %entry ], [ %i.next, %loop ] + %idx.1 = phi i32 [ %N, %entry ], [ %idx.next, %loop ] + %gep.a.1 = getelementptr inbounds i32, i32* %a.array, i32 %idx.1 + %a.1 = load i32, i32* %gep.a.1 + %gep.b.1 = getelementptr inbounds i32, i32* %b.array, i32 %idx.1 + %b.1 = load i32, i32* %gep.b.1 + %qadd.1 = call i32 @llvm.arm.qadd(i32 %a.1, i32 %b.1) + %addr.1 = getelementptr inbounds i32, i32* %out.array, i32 %idx.1 + store i32 %qadd.1, i32* %addr.1 + %idx.2 = sub nsw nuw i32 %idx.1, 1 + %gep.a.2 = getelementptr inbounds i32, i32* %a.array, i32 %idx.2 + %a.2 = load i32, i32* %gep.a.2 + %gep.b.2 = getelementptr inbounds i32, i32* %b.array, i32 %idx.2 + %b.2 = load i32, i32* %gep.b.2 + %qadd.2 = call i32 @llvm.arm.qadd(i32 %a.2, i32 %b.2) + %addr.2 = getelementptr inbounds i32, i32* %out.array, i32 %idx.2 + store i32 %qadd.2, i32* %addr.2 + %i.next = add nsw nuw i32 %i, -2 + %idx.next = sub nsw nuw i32 %idx.1, 2 + %cmp = icmp ult i32 %i.next, %N + br i1 %cmp, label %loop, label %exit + +exit: + ret void +} + +; CHECK-LABEL: test_qadd_3 +; CHECK: @ %loop + +; CHECK-DEFAULT: ldr{{.*}}, #8] +; CHECK-DEFAULT: ldr{{.*}}, #8] +; CHECK-DEFAULT: str{{.*}}, #8] +; CHECK-DEFAULT: ldr{{.*}}, #12]! +; CHECK-DEFAULT: ldr{{.*}}, #12]! +; CHECK-DEFAULT: str{{.*}}, #12]! + +; CHECK-COMPLEX: ldr{{.*}}, #12]! +; CHECK-COMPLEX: ldr{{.*}}, #12]! +; CHECK-COMPLEX: str{{.*}}, #12]! +; CHECK-COMPLEX: ldr{{.*}}, #4] +; CHECK-COMPLEX: ldr{{.*}}, #4] +; CHECK-COMPLEX: str{{.*}}, #4] +; CHECK-COMPLEX: ldr{{.*}}, #8] +; CHECK-COMPLEX: ldr{{.*}}, #8] +; CHECK-COMPLEX: str{{.*}}, #8] + +; DISABLED-NOT: ldr{{.*}}]! +; DISABLED-NOT: str{{.*}}]! 
+ +define void @test_qadd_3(i32* %a.array, i32* %b.array, i32* %out.array, i32 %N) { +entry: + br label %loop + +loop: + %i = phi i32 [ 0, %entry ], [ %i.next, %loop ] + %idx.1 = phi i32 [ 0, %entry ], [ %idx.next, %loop ] + %gep.a.1 = getelementptr inbounds i32, i32* %a.array, i32 %idx.1 + %a.1 = load i32, i32* %gep.a.1 + %gep.b.1 = getelementptr inbounds i32, i32* %b.array, i32 %idx.1 + %b.1 = load i32, i32* %gep.b.1 + %qadd.1 = call i32 @llvm.arm.qadd(i32 %a.1, i32 %b.1) + %addr.1 = getelementptr inbounds i32, i32* %out.array, i32 %idx.1 + store i32 %qadd.1, i32* %addr.1 + %idx.2 = add nuw nsw i32 %idx.1, 1 + %gep.a.2 = getelementptr inbounds i32, i32* %a.array, i32 %idx.2 + %a.2 = load i32, i32* %gep.a.2 + %gep.b.2 = getelementptr inbounds i32, i32* %b.array, i32 %idx.2 + %b.2 = load i32, i32* %gep.b.2 + %qadd.2 = call i32 @llvm.arm.qadd(i32 %a.2, i32 %b.2) + %addr.2 = getelementptr inbounds i32, i32* %out.array, i32 %idx.2 + store i32 %qadd.2, i32* %addr.2 + %idx.3 = add nuw nsw i32 %idx.1, 2 + %gep.a.3 = getelementptr inbounds i32, i32* %a.array, i32 %idx.3 + %a.3 = load i32, i32* %gep.a.3 + %gep.b.3 = getelementptr inbounds i32, i32* %b.array, i32 %idx.3 + %b.3 = load i32, i32* %gep.b.3 + %qadd.3 = call i32 @llvm.arm.qadd(i32 %a.3, i32 %b.3) + %addr.3 = getelementptr inbounds i32, i32* %out.array, i32 %idx.3 + store i32 %qadd.3, i32* %addr.3 + %i.next = add nsw nuw i32 %i, -3 + %idx.next = add nsw nuw i32 %idx.1, 3 + %cmp = icmp ult i32 %i.next, %N + br i1 %cmp, label %loop, label %exit + +exit: + ret void +} + +; CHECK-LABEL: test_qadd_4 +; CHECK: @ %loop + +; TODO: pre-inc store + +; CHECK-DEFAULT: ldr{{.*}}, #4] +; CHECK-DEFAULT: ldr{{.*}}, #4] +; CHECK-DEFAULT: str{{.*}}, #4] +; CHECK-DEFAULT: ldr{{.*}}, #8] +; CHECK-DEFAULT: ldr{{.*}}, #8] +; CHECK-DEFAULT: str{{.*}}, #8] +; CHECK-DEFAULT: ldr{{.*}}, #12] +; CHECK-DEFAULT: ldr{{.*}}, #12] +; CHECK-DEFAULT: str{{.*}}, #12] + +; CHECK-COMPLEX: ldr{{.*}}, #16]! +; CHECK-COMPLEX: ldr{{.*}}, #16]! +; CHECK-COMPLEX: str{{.*}}, #16]! +; CHECK-COMPLEX: ldr{{.*}}, #4] +; CHECK-COMPLEX: ldr{{.*}}, #4] +; CHECK-COMPLEX: str{{.*}}, #4] +; CHECK-COMPLEX: ldr{{.*}}, #8] +; CHECK-COMPLEX: ldr{{.*}}, #8] +; CHECK-COMPLEX: str{{.*}}, #8] +; CHECK-COMPLEX: ldr{{.*}}, #12] +; CHECK-COMPLEX: ldr{{.*}}, #12] +; CHECK-COMPLEX: str{{.*}}, #12] + +; DISABLED-NOT: ldr{{.*}}]! +; DISABLED-NOT: str{{.*}}]! 
+ +define void @test_qadd_4(i32* %a.array, i32* %b.array, i32* %out.array, i32 %N) { +entry: + br label %loop + +loop: + %i = phi i32 [ 0, %entry ], [ %i.next, %loop ] + %idx.1 = phi i32 [ 0, %entry ], [ %idx.next, %loop ] + %gep.a.1 = getelementptr inbounds i32, i32* %a.array, i32 %idx.1 + %a.1 = load i32, i32* %gep.a.1 + %gep.b.1 = getelementptr inbounds i32, i32* %b.array, i32 %idx.1 + %b.1 = load i32, i32* %gep.b.1 + %qadd.1 = call i32 @llvm.arm.qadd(i32 %a.1, i32 %b.1) + %addr.1 = getelementptr inbounds i32, i32* %out.array, i32 %idx.1 + store i32 %qadd.1, i32* %addr.1 + %idx.2 = or i32 %idx.1, 1 + %gep.a.2 = getelementptr inbounds i32, i32* %a.array, i32 %idx.2 + %a.2 = load i32, i32* %gep.a.2 + %gep.b.2 = getelementptr inbounds i32, i32* %b.array, i32 %idx.2 + %b.2 = load i32, i32* %gep.b.2 + %qadd.2 = call i32 @llvm.arm.qadd(i32 %a.2, i32 %b.2) + %addr.2 = getelementptr inbounds i32, i32* %out.array, i32 %idx.2 + store i32 %qadd.2, i32* %addr.2 + %idx.3 = or i32 %idx.1, 2 + %gep.a.3 = getelementptr inbounds i32, i32* %a.array, i32 %idx.3 + %a.3 = load i32, i32* %gep.a.3 + %gep.b.3 = getelementptr inbounds i32, i32* %b.array, i32 %idx.3 + %b.3 = load i32, i32* %gep.b.3 + %qadd.3 = call i32 @llvm.arm.qadd(i32 %a.3, i32 %b.3) + %addr.3 = getelementptr inbounds i32, i32* %out.array, i32 %idx.3 + store i32 %qadd.3, i32* %addr.3 + %idx.4 = or i32 %idx.1, 3 + %gep.a.4 = getelementptr inbounds i32, i32* %a.array, i32 %idx.4 + %a.4 = load i32, i32* %gep.a.4 + %gep.b.4 = getelementptr inbounds i32, i32* %b.array, i32 %idx.4 + %b.4 = load i32, i32* %gep.b.4 + %qadd.4 = call i32 @llvm.arm.qadd(i32 %a.4, i32 %b.4) + %addr.4 = getelementptr inbounds i32, i32* %out.array, i32 %idx.4 + store i32 %qadd.4, i32* %addr.4 + %i.next = add nsw nuw i32 %i, -4 + %idx.next = add nsw nuw i32 %idx.1, 4 + %cmp = icmp ult i32 %i.next, %N + br i1 %cmp, label %loop, label %exit + +exit: + ret void +} + +; CHECK-LABEL: test_qadd16_2 +; CHECK: @ %loop +; TODO: pre-inc store. + +; CHECK-DEFAULT: ldr{{.*}}, #4] +; CHECK-DEFAULT: ldr{{.*}}, #4] +; CHECK-DEFAULT: str{{.*}}, #8] +; CHECK-DEFAULT: ldr{{.*}}, #8]! +; CHECK-DEFAULT: ldr{{.*}}, #8]! +; CHECK-DEFAULT: str{{.*}}, #16]! + +; CHECK-COMPLEX: ldr{{.*}}, #8]! +; CHECK-COMPLEX: ldr{{.*}}, #8]! +; CHECK-COMPLEX: str{{.*}}, #16]! +; CHECK-COMPLEX: ldr{{.*}}, #4] +; CHECK-COMPLEX: ldr{{.*}}, #4] +; CHECK-COMPLEX: str{{.*}}, #8] + +; DISABLED-NOT: ldr{{.*}}]! +; DISABLED-NOT: str{{.*}}]! 
+ +define void @test_qadd16_2(i16* %a.array, i16* %b.array, i32* %out.array, i32 %N) { +entry: + br label %loop + +loop: + %i = phi i32 [ 0, %entry ], [ %i.next, %loop ] + %idx.1 = phi i32 [ 0, %entry ], [ %idx.next, %loop ] + %gep.a.1 = getelementptr inbounds i16, i16* %a.array, i32 %idx.1 + %cast.a.1 = bitcast i16* %gep.a.1 to i32* + %a.1 = load i32, i32* %cast.a.1 + %gep.b.1 = getelementptr inbounds i16, i16* %b.array, i32 %idx.1 + %cast.b.1 = bitcast i16* %gep.b.1 to i32* + %b.1 = load i32, i32* %cast.b.1 + %qadd.1 = call i32 @llvm.arm.qadd16(i32 %a.1, i32 %b.1) + %addr.1 = getelementptr inbounds i32, i32* %out.array, i32 %idx.1 + store i32 %qadd.1, i32* %addr.1 + %idx.2 = add nsw nuw i32 %idx.1, 2 + %gep.a.2 = getelementptr inbounds i16, i16* %a.array, i32 %idx.2 + %cast.a.2 = bitcast i16* %gep.a.2 to i32* + %a.2 = load i32, i32* %cast.a.2 + %gep.b.2 = getelementptr inbounds i16, i16* %b.array, i32 %idx.2 + %cast.b.2 = bitcast i16* %gep.b.2 to i32* + %b.2 = load i32, i32* %cast.b.2 + %qadd.2 = call i32 @llvm.arm.qadd16(i32 %a.2, i32 %b.2) + %addr.2 = getelementptr inbounds i32, i32* %out.array, i32 %idx.2 + store i32 %qadd.2, i32* %addr.2 + %i.next = add nsw nuw i32 %i, -2 + %idx.next = add nsw nuw i32 %idx.1, 4 + %cmp = icmp ult i32 %i.next, %N + br i1 %cmp, label %loop, label %exit + +exit: + ret void +} + +declare i32 @llvm.arm.qadd(i32, i32) +declare i32 @llvm.arm.qadd16(i32, i32) diff --git a/test/CodeGen/ARM/loop-align-cortex-m.ll b/test/CodeGen/ARM/loop-align-cortex-m.ll index 1b41c1b6c3f..61ba1a6ca2d 100644 --- a/test/CodeGen/ARM/loop-align-cortex-m.ll +++ b/test/CodeGen/ARM/loop-align-cortex-m.ll @@ -1,10 +1,10 @@ ; RUN: llc -mtriple=thumbv7m-none-eabi %s -mcpu=cortex-m3 -o - | FileCheck %s ; RUN: llc -mtriple=thumbv7m-none-eabi %s -mcpu=cortex-m4 -o - | FileCheck %s -; RUN: llc -mtriple=thumbv7m-none-eabi %s -mcpu=cortex-m33 -o - | FileCheck %s +; RUN: llc -mtriple=thumbv8m-none-eabi %s -mcpu=cortex-m33 -o - | FileCheck %s define void @test_loop_alignment(i32* %in, i32* %out) optsize { ; CHECK-LABEL: test_loop_alignment: -; CHECK: movs {{r[0-9]+}}, #0 +; CHECK: mov{{.*}}, #0 ; CHECK: .p2align 2 entry: diff --git a/test/CodeGen/ARM/loop-indexing.ll b/test/CodeGen/ARM/loop-indexing.ll new file mode 100644 index 00000000000..0c364a76969 --- /dev/null +++ b/test/CodeGen/ARM/loop-indexing.ll @@ -0,0 +1,1190 @@ +; RUN: llc -mtriple=thumbv7em -mattr=+fp-armv8 %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-BASE --check-prefix=CHECK-DEFAULT --check-prefix=CHECK-T2 +; RUN: llc -mtriple=thumbv8m.main -mattr=+fp-armv8,+dsp %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-DEFAULT --check-prefix=CHECK-T2 +; RUN: llc -mtriple=thumbv8m.main -mattr=+fp-armv8,+dsp -lsr-backedge-indexing=false %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLED +; RUN: llc -mtriple=thumbv8m.base %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLED +; RUN: llc -mtriple=thumbv8 %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLED +; RUN: llc -mtriple=thumbv8m.main -mattr=+fp-armv8,+dsp -lsr-complexity-limit=2147483647 %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-COMPLEX --check-prefix=CHECK-T2 + +; Tests to check that post increment addressing modes are used instead of +; updating base pointers with add instructions. + +; TODO: I think we should be able to use post inc addressing with VLDM +; instructions. 
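+;
+; For reference: a post-indexed access such as "ldr r0, [r1], #4" loads and
+; then advances the base, while a pre-indexed access such as
+; "ldr r0, [r1, #4]!" advances the base as part of the load; in both cases
+; the pointer update is folded into the memory operation.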
+; CHECK-LABEL: test_fma +; CHECK: @ %loop + +; CHECK-BASE: vldr s{{.*}}, #8] +; CHECK-BASE: vldr s{{.*}}, #8] +; CHECK-BASE: vldr s{{.*}}, #12] +; CHECK-BASE: vldr s{{.*}}, #12] + +; CHECK-COMPLEX: vldr s{{.*}}, #8] +; CHECK-COMPLEX: vldr s{{.*}}, #8] +; CHECK-COMPLEX: vldr s{{.*}}, #12] +; CHECK-COMPLEX: vldr s{{.*}}, #12] + +define float @test_fma(float* %a, float* %b, i32 %N) { +entry: + br label %loop + +loop: + %i = phi i32 [ 0, %entry ], [ %i.next, %loop ] + %idx.1 = phi i32 [ 0, %entry ], [ %idx.next, %loop ] + %res = phi float [ 0.0, %entry ], [ %fma.2, %loop ] + %gep.a.1 = getelementptr inbounds float, float* %a, i32 %idx.1 + %a.1 = load float, float* %gep.a.1 + %gep.b.1 = getelementptr inbounds float, float* %b, i32 %idx.1 + %b.1 = load float, float* %gep.b.1 + %fmul.1 = fmul float %a.1, %b.1 + %fma.1 = fadd float %fmul.1, %res + %idx.2 = or i32 %idx.1, 1 + %gep.a.2 = getelementptr inbounds float, float* %a, i32 %idx.2 + %a.2 = load float, float* %gep.a.2 + %gep.b.2 = getelementptr inbounds float, float* %b, i32 %idx.2 + %b.2 = load float, float* %gep.b.2 + %fmul.2 = fmul float %a.2, %b.2 + %fma.2 = fadd float %fmul.2, %fma.1 + %i.next = add nsw nuw i32 %i, -2 + %idx.next = add nsw nuw i32 %idx.1, 2 + %cmp = icmp ult i32 %i.next, %N + br i1 %cmp, label %loop, label %exit + +exit: + ret float %fma.2 +} + +; CHECK-LABEL: convolve_16bit +; TODO: Both arrays should use indexing +; CHECK-DEFAULT: ldr{{.*}}, #8]! +; CHECK-DEFAULT: ldr{{.*}}, #10] +; CHECK-DEFAULT: ldr{{.*}}, #4] +; CHECK-DEFAULT: ldr{{.*}}, #6] + +; CHECK-COMPLEX: ldr{{.*}}, #8]! +; CHECK-COMPLEX: ldr{{.*}}, #10] +; CHECK-COMPLEX: ldr{{.*}}, #4] +; CHECK-COMPLEX: ldr{{.*}}, #6] + +; DISABLED-NOT: ldr{{.*}}]! +; DISABLED-NOT: str{{.*}}]! + +define void @convolve_16bit(i16** nocapture readonly %input_image, i16** nocapture readonly %filter, + i32 %filter_dim, i32 %out_width, i32 %out_height, + i32** nocapture readonly %convolved) { +entry: + %cmp92 = icmp eq i32 %out_height, 0 + br i1 %cmp92, label %for.cond.cleanup, label %for.cond1.preheader.lr.ph + +for.cond1.preheader.lr.ph: ; preds = %entry + %xtraiter = and i32 %filter_dim, 3 + %unroll_iter = sub i32 %filter_dim, %xtraiter + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %for.cond.cleanup3, %for.cond1.preheader.lr.ph + %res_y.093 = phi i32 [ 0, %for.cond1.preheader.lr.ph ], [ %add28, %for.cond.cleanup3 ] + %arrayidx22 = getelementptr inbounds i32*, i32** %convolved, i32 %res_y.093 + %tmp3 = load i32*, i32** %arrayidx22, align 4 + br label %for.cond9.preheader.us.us.preheader + +for.cond9.preheader.us.us.preheader: ; preds = %for.cond5.for.cond.cleanup7_crit_edge.us, %for.cond5.preheader.lr.ph + %res_x.060.us = phi i32 [ %add25.us, %for.cond5.for.cond.cleanup7_crit_edge.us ], [ 0, %for.cond1.preheader ] + br label %for.cond9.preheader.us.us + +for.cond9.preheader.us.us: ; preds = %for.cond9.for.cond.cleanup11_crit_edge.us.us, %for.cond9.preheader.us.us.preheader + %filter_y.056.us.us = phi i32 [ %inc20.us.us, %for.cond9.for.cond.cleanup11_crit_edge.us.us.unr-lcssa ], [ 0, %for.cond9.preheader.us.us.preheader ] + %result_element.055.us.us = phi i32 [ %add18.us.us.3, %for.cond9.for.cond.cleanup11_crit_edge.us.us.unr-lcssa ], [ 0, %for.cond9.preheader.us.us.preheader ] + %add.us.us = add i32 %filter_y.056.us.us, %res_y.093 + %arrayidx.us.us = getelementptr inbounds i16*, i16** %filter, i32 %filter_y.056.us.us + %tmp5 = load i16*, i16** %arrayidx.us.us, align 4 + %arrayidx15.us.us = getelementptr inbounds i16*, i16** %input_image, i32 %add.us.us + 
%tmp6 = load i16*, i16** %arrayidx15.us.us, align 4 + br label %for.body12.us.us + +for.body12.us.us: ; preds = %for.body12.us.us, %for.cond9.preheader.us.us + %filter_x.053.us.us = phi i32 [ %inc.us.us.3, %for.body12.us.us ], [ 0, %for.cond9.preheader.us.us ] + %result_element.152.us.us = phi i32 [ %add18.us.us.3, %for.body12.us.us ], [ %result_element.055.us.us, %for.cond9.preheader.us.us ] + %niter = phi i32 [ %niter.nsub.3, %for.body12.us.us ], [ %unroll_iter, %for.cond9.preheader.us.us ] + %add13.us.us = add i32 %filter_x.053.us.us, %res_x.060.us + %arrayidx14.us.us = getelementptr inbounds i16, i16* %tmp5, i32 %filter_x.053.us.us + %tmp9 = load i16, i16* %arrayidx14.us.us, align 2 + %conv.us.us = sext i16 %tmp9 to i32 + %arrayidx16.us.us = getelementptr inbounds i16, i16* %tmp6, i32 %add13.us.us + %tmp10 = load i16, i16* %arrayidx16.us.us, align 2 + %conv17.us.us = sext i16 %tmp10 to i32 + %mul.us.us = mul nsw i32 %conv17.us.us, %conv.us.us + %add18.us.us = add nsw i32 %mul.us.us, %result_element.152.us.us + %inc.us.us = or i32 %filter_x.053.us.us, 1 + %add13.us.us.1 = add i32 %inc.us.us, %res_x.060.us + %arrayidx14.us.us.1 = getelementptr inbounds i16, i16* %tmp5, i32 %inc.us.us + %tmp11 = load i16, i16* %arrayidx14.us.us.1, align 2 + %conv.us.us.1 = sext i16 %tmp11 to i32 + %arrayidx16.us.us.1 = getelementptr inbounds i16, i16* %tmp6, i32 %add13.us.us.1 + %tmp12 = load i16, i16* %arrayidx16.us.us.1, align 2 + %conv17.us.us.1 = sext i16 %tmp12 to i32 + %mul.us.us.1 = mul nsw i32 %conv17.us.us.1, %conv.us.us.1 + %add18.us.us.1 = add nsw i32 %mul.us.us.1, %add18.us.us + %inc.us.us.1 = or i32 %filter_x.053.us.us, 2 + %add13.us.us.2 = add i32 %inc.us.us.1, %res_x.060.us + %arrayidx14.us.us.2 = getelementptr inbounds i16, i16* %tmp5, i32 %inc.us.us.1 + %tmp13 = load i16, i16* %arrayidx14.us.us.2, align 2 + %conv.us.us.2 = sext i16 %tmp13 to i32 + %arrayidx16.us.us.2 = getelementptr inbounds i16, i16* %tmp6, i32 %add13.us.us.2 + %tmp14 = load i16, i16* %arrayidx16.us.us.2, align 2 + %conv17.us.us.2 = sext i16 %tmp14 to i32 + %mul.us.us.2 = mul nsw i32 %conv17.us.us.2, %conv.us.us.2 + %add18.us.us.2 = add nsw i32 %mul.us.us.2, %add18.us.us.1 + %inc.us.us.2 = or i32 %filter_x.053.us.us, 3 + %add13.us.us.3 = add i32 %inc.us.us.2, %res_x.060.us + %arrayidx14.us.us.3 = getelementptr inbounds i16, i16* %tmp5, i32 %inc.us.us.2 + %tmp15 = load i16, i16* %arrayidx14.us.us.3, align 2 + %conv.us.us.3 = sext i16 %tmp15 to i32 + %arrayidx16.us.us.3 = getelementptr inbounds i16, i16* %tmp6, i32 %add13.us.us.3 + %tmp16 = load i16, i16* %arrayidx16.us.us.3, align 2 + %conv17.us.us.3 = sext i16 %tmp16 to i32 + %mul.us.us.3 = mul nsw i32 %conv17.us.us.3, %conv.us.us.3 + %add18.us.us.3 = add nsw i32 %mul.us.us.3, %add18.us.us.2 + %inc.us.us.3 = add i32 %filter_x.053.us.us, 4 + %niter.nsub.3 = add i32 %niter, -4 + %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0 + br i1 %niter.ncmp.3, label %for.cond9.for.cond.cleanup11_crit_edge.us.us.unr-lcssa, label %for.body12.us.us + +for.cond9.for.cond.cleanup11_crit_edge.us.us.unr-lcssa: ; preds = %for.body12.us.us, %for.cond9.preheader.us.us + %inc20.us.us = add nuw i32 %filter_y.056.us.us, 1 + %exitcond98 = icmp eq i32 %inc20.us.us, %filter_dim + br i1 %exitcond98, label %for.cond5.for.cond.cleanup7_crit_edge.us, label %for.cond9.preheader.us.us + +for.cond5.for.cond.cleanup7_crit_edge.us: ; preds = %for.cond9.for.cond.cleanup11_crit_edge.us.us + %arrayidx23.us = getelementptr inbounds i32, i32* %tmp3, i32 %res_x.060.us + store i32 %add18.us.us.3, i32* %arrayidx23.us, 
align 4 + %add25.us = add nuw i32 %res_x.060.us, 1 + %exitcond99 = icmp eq i32 %add25.us, %out_width + br i1 %exitcond99, label %for.cond.cleanup3, label %for.cond9.preheader.us.us.preheader + +for.cond.cleanup3: ; preds = %for.cond5.for.cond.cleanup7_crit_edge.us, %for.cond5.preheader.preheader, %for.cond1.preheader + %add28 = add nuw i32 %res_y.093, 1 + %exitcond100 = icmp eq i32 %add28, %out_height + br i1 %exitcond100, label %for.cond.cleanup, label %for.cond1.preheader + +for.cond.cleanup: ; preds = %for.cond.cleanup3, %entry + ret void +} + +; CHECK-LABEL: mul_8x8 +; CHECK: @ %for.body + +; CHECK-DEFAULT: ldrb{{.*}}, #3] +; CHECK-DEFAULT: ldrb{{.*}}, #3] +; CHECK-DEFAULT: str{{.*}}, #16]! +; CHECK-DEFAULT: ldrb{{.*}}, #4]! +; CHECK-DEFAULT: ldrb{{.*}}, #4]! +; CHECK-DEFAULT: str{{.*}}, #4] +; CHECK-DEFAULT: ldrb{{.*}}, #1] +; CHECK-DEFAULT: ldrb{{.*}}, #1] +; CHECK-DEFAULT: str{{.*}}, #8] +; CHECK-DEFAULT: ldrb{{.*}}, #2] +; CHECK-DEFAULT: ldrb{{.*}}, #2] +; CHECK-DEFAULT: str{{.*}}, #12] + +; CHECK-COMPLEX: ldrb{{.*}}, #3] +; CHECK-COMPLEX: ldrb{{.*}}, #3] +; CHECK-COMPLEX: str{{.*}}, #16]! +; CHECK-COMPLEX: ldrb{{.*}}, #4]! +; CHECK-COMPLEX: ldrb{{.*}}, #4]! +; CHECK-COMPLEX: str{{.*}}, #4] +; CHECK-COMPLEX: ldrb{{.*}}, #1] +; CHECK-COMPLEX: ldrb{{.*}}, #1] +; CHECK-COMPLEX: str{{.*}}, #8] +; CHECK-COMPLEX: ldrb{{.*}}, #2] +; CHECK-COMPLEX: ldrb{{.*}}, #2] +; CHECK-COMPLEX: str{{.*}}, #12] + +; DISABLED-NOT: ldr{{.*}}]! +; DISABLED-NOT: str{{.*}}]! + +; CHECK-T2: @ %for.body.epil +; CHECK-T2: ldrb{{.*}}, #1]! +; CHECK-T2: ldrb{{.*}}, #1]! +; CHECK-T2: str{{.*}}, #4]! + +define void @mul_8x8(i8* nocapture readonly %A, i8* nocapture readonly %B, i32* nocapture %C, i32 %N) { +entry: + %cmp9 = icmp eq i32 %N, 0 + br i1 %cmp9, label %for.cond.cleanup, label %for.body.preheader + +for.body.preheader: ; preds = %entry + %tmp = add i32 %N, -1 + %xtraiter = and i32 %N, 3 + %tmp1 = icmp ult i32 %tmp, 3 + br i1 %tmp1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new + +for.body.preheader.new: ; preds = %for.body.preheader + %unroll_iter = sub i32 %N, %xtraiter + br label %for.body + +for.cond.cleanup.loopexit.unr-lcssa: ; preds = %for.body, %for.body.preheader + %i.010.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.3, %for.body ] + %lcmp.mod = icmp eq i32 %xtraiter, 0 + br i1 %lcmp.mod, label %for.cond.cleanup, label %for.body.epil + +for.body.epil: ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa + %i.010.epil = phi i32 [ %inc.epil, %for.body.epil ], [ %i.010.unr, %for.cond.cleanup.loopexit.unr-lcssa ] + %epil.iter = phi i32 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ] + %arrayidx.epil = getelementptr inbounds i8, i8* %A, i32 %i.010.epil + %tmp2 = load i8, i8* %arrayidx.epil, align 1 + %conv.epil = zext i8 %tmp2 to i32 + %arrayidx1.epil = getelementptr inbounds i8, i8* %B, i32 %i.010.epil + %tmp3 = load i8, i8* %arrayidx1.epil, align 1 + %conv2.epil = zext i8 %tmp3 to i32 + %mul.epil = mul nuw nsw i32 %conv2.epil, %conv.epil + %arrayidx3.epil = getelementptr inbounds i32, i32* %C, i32 %i.010.epil + store i32 %mul.epil, i32* %arrayidx3.epil, align 4 + %inc.epil = add nuw i32 %i.010.epil, 1 + %epil.iter.sub = add i32 %epil.iter, -1 + %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0 + br i1 %epil.iter.cmp, label %for.cond.cleanup, label %for.body.epil + +for.cond.cleanup: ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa, %entry + ret void + +for.body: ; preds = %for.body, %for.body.preheader.new + 
%i.010 = phi i32 [ 0, %for.body.preheader.new ], [ %inc.3, %for.body ] + %niter = phi i32 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ] + %arrayidx = getelementptr inbounds i8, i8* %A, i32 %i.010 + %tmp4 = load i8, i8* %arrayidx, align 1 + %conv = zext i8 %tmp4 to i32 + %arrayidx1 = getelementptr inbounds i8, i8* %B, i32 %i.010 + %tmp5 = load i8, i8* %arrayidx1, align 1 + %conv2 = zext i8 %tmp5 to i32 + %mul = mul nuw nsw i32 %conv2, %conv + %arrayidx3 = getelementptr inbounds i32, i32* %C, i32 %i.010 + store i32 %mul, i32* %arrayidx3, align 4 + %inc = or i32 %i.010, 1 + %arrayidx.1 = getelementptr inbounds i8, i8* %A, i32 %inc + %tmp6 = load i8, i8* %arrayidx.1, align 1 + %conv.1 = zext i8 %tmp6 to i32 + %arrayidx1.1 = getelementptr inbounds i8, i8* %B, i32 %inc + %tmp7 = load i8, i8* %arrayidx1.1, align 1 + %conv2.1 = zext i8 %tmp7 to i32 + %mul.1 = mul nuw nsw i32 %conv2.1, %conv.1 + %arrayidx3.1 = getelementptr inbounds i32, i32* %C, i32 %inc + store i32 %mul.1, i32* %arrayidx3.1, align 4 + %inc.1 = or i32 %i.010, 2 + %arrayidx.2 = getelementptr inbounds i8, i8* %A, i32 %inc.1 + %tmp8 = load i8, i8* %arrayidx.2, align 1 + %conv.2 = zext i8 %tmp8 to i32 + %arrayidx1.2 = getelementptr inbounds i8, i8* %B, i32 %inc.1 + %tmp9 = load i8, i8* %arrayidx1.2, align 1 + %conv2.2 = zext i8 %tmp9 to i32 + %mul.2 = mul nuw nsw i32 %conv2.2, %conv.2 + %arrayidx3.2 = getelementptr inbounds i32, i32* %C, i32 %inc.1 + store i32 %mul.2, i32* %arrayidx3.2, align 4 + %inc.2 = or i32 %i.010, 3 + %arrayidx.3 = getelementptr inbounds i8, i8* %A, i32 %inc.2 + %tmp10 = load i8, i8* %arrayidx.3, align 1 + %conv.3 = zext i8 %tmp10 to i32 + %arrayidx1.3 = getelementptr inbounds i8, i8* %B, i32 %inc.2 + %tmp11 = load i8, i8* %arrayidx1.3, align 1 + %conv2.3 = zext i8 %tmp11 to i32 + %mul.3 = mul nuw nsw i32 %conv2.3, %conv.3 + %arrayidx3.3 = getelementptr inbounds i32, i32* %C, i32 %inc.2 + store i32 %mul.3, i32* %arrayidx3.3, align 4 + %inc.3 = add i32 %i.010, 4 + %niter.nsub.3 = add i32 %niter, -4 + %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0 + br i1 %niter.ncmp.3, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body +} + +; CHECK-LABEL: mul_16x8 +; CHECK: @ %for.body + +; CHECK-DEFAULT: ldrsh{{.*}}, #2] +; CHECK-DEFAULT: ldrb{{.*}}, #-1] +; CHECK-DEFAULT: str{{.*}}, #16]! +; CHECK-DEFAULT: ldrb{{.*}}, +; CHECK-DEFAULT: ldrsh{{.*}}, #2] +; CHECK-DEFAULT: str{{.*}}, #4] +; CHECK-DEFAULT: ldrsh{{.*}}, #4] +; CHECK-DEFAULT: ldrb{{.*}}, #1] +; CHECK-DEFAULT: str{{.*}}, #8] +; CHECK-DEFAULT: ldrsh{{.*}}, #8]! +; CHECK-DEFAULT: ldrb{{.*}}, #2] +; CHECK-DEFAULT: str{{.*}}, #12] + +; CHECK-COMPLEX: ldrsh{{.*}}, #8]! +; CHECK-COMPLEX: str{{.*}}, #16]! +; CHECK-COMPLEX: ldrb{{.*}}, #4]! + +; DISABLED-NOT: ldr{{.*}}]! +; DISABLED-NOT: str{{.*}}]! + +; CHECK-T2: @ %for.body.epil +; CHECK-T2: ldrsh{{.*}}, #2]! +; CHECK-T2: ldrb{{.*}}, #1]! +; CHECK-T2: str{{.*}}, #4]! 
+ +define void @mul_16x8(i16* nocapture readonly %A, i8* nocapture readonly %B, i32* nocapture %C, i32 %N) { +entry: + %cmp9 = icmp eq i32 %N, 0 + br i1 %cmp9, label %for.cond.cleanup, label %for.body.preheader + +for.body.preheader: ; preds = %entry + %tmp = add i32 %N, -1 + %xtraiter = and i32 %N, 3 + %tmp1 = icmp ult i32 %tmp, 3 + br i1 %tmp1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new + +for.body.preheader.new: ; preds = %for.body.preheader + %unroll_iter = sub i32 %N, %xtraiter + br label %for.body + +for.cond.cleanup.loopexit.unr-lcssa: ; preds = %for.body, %for.body.preheader + %i.010.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.3, %for.body ] + %lcmp.mod = icmp eq i32 %xtraiter, 0 + br i1 %lcmp.mod, label %for.cond.cleanup, label %for.body.epil + +for.body.epil: ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa + %i.010.epil = phi i32 [ %inc.epil, %for.body.epil ], [ %i.010.unr, %for.cond.cleanup.loopexit.unr-lcssa ] + %epil.iter = phi i32 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ] + %arrayidx.epil = getelementptr inbounds i16, i16* %A, i32 %i.010.epil + %tmp2 = load i16, i16* %arrayidx.epil, align 2 + %conv.epil = sext i16 %tmp2 to i32 + %arrayidx1.epil = getelementptr inbounds i8, i8* %B, i32 %i.010.epil + %tmp3 = load i8, i8* %arrayidx1.epil, align 1 + %conv2.epil = zext i8 %tmp3 to i32 + %mul.epil = mul nsw i32 %conv2.epil, %conv.epil + %arrayidx3.epil = getelementptr inbounds i32, i32* %C, i32 %i.010.epil + store i32 %mul.epil, i32* %arrayidx3.epil, align 4 + %inc.epil = add nuw i32 %i.010.epil, 1 + %epil.iter.sub = add i32 %epil.iter, -1 + %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0 + br i1 %epil.iter.cmp, label %for.cond.cleanup, label %for.body.epil + +for.cond.cleanup: ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa, %entry + ret void + +for.body: ; preds = %for.body, %for.body.preheader.new + %i.010 = phi i32 [ 0, %for.body.preheader.new ], [ %inc.3, %for.body ] + %niter = phi i32 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ] + %arrayidx = getelementptr inbounds i16, i16* %A, i32 %i.010 + %tmp4 = load i16, i16* %arrayidx, align 2 + %conv = sext i16 %tmp4 to i32 + %arrayidx1 = getelementptr inbounds i8, i8* %B, i32 %i.010 + %tmp5 = load i8, i8* %arrayidx1, align 1 + %conv2 = zext i8 %tmp5 to i32 + %mul = mul nsw i32 %conv2, %conv + %arrayidx3 = getelementptr inbounds i32, i32* %C, i32 %i.010 + store i32 %mul, i32* %arrayidx3, align 4 + %inc = or i32 %i.010, 1 + %arrayidx.1 = getelementptr inbounds i16, i16* %A, i32 %inc + %tmp6 = load i16, i16* %arrayidx.1, align 2 + %conv.1 = sext i16 %tmp6 to i32 + %arrayidx1.1 = getelementptr inbounds i8, i8* %B, i32 %inc + %tmp7 = load i8, i8* %arrayidx1.1, align 1 + %conv2.1 = zext i8 %tmp7 to i32 + %mul.1 = mul nsw i32 %conv2.1, %conv.1 + %arrayidx3.1 = getelementptr inbounds i32, i32* %C, i32 %inc + store i32 %mul.1, i32* %arrayidx3.1, align 4 + %inc.1 = or i32 %i.010, 2 + %arrayidx.2 = getelementptr inbounds i16, i16* %A, i32 %inc.1 + %tmp8 = load i16, i16* %arrayidx.2, align 2 + %conv.2 = sext i16 %tmp8 to i32 + %arrayidx1.2 = getelementptr inbounds i8, i8* %B, i32 %inc.1 + %tmp9 = load i8, i8* %arrayidx1.2, align 1 + %conv2.2 = zext i8 %tmp9 to i32 + %mul.2 = mul nsw i32 %conv2.2, %conv.2 + %arrayidx3.2 = getelementptr inbounds i32, i32* %C, i32 %inc.1 + store i32 %mul.2, i32* %arrayidx3.2, align 4 + %inc.2 = or i32 %i.010, 3 + %arrayidx.3 = getelementptr inbounds i16, i16* %A, i32 %inc.2 + 
%tmp10 = load i16, i16* %arrayidx.3, align 2 + %conv.3 = sext i16 %tmp10 to i32 + %arrayidx1.3 = getelementptr inbounds i8, i8* %B, i32 %inc.2 + %tmp11 = load i8, i8* %arrayidx1.3, align 1 + %conv2.3 = zext i8 %tmp11 to i32 + %mul.3 = mul nsw i32 %conv2.3, %conv.3 + %arrayidx3.3 = getelementptr inbounds i32, i32* %C, i32 %inc.2 + store i32 %mul.3, i32* %arrayidx3.3, align 4 + %inc.3 = add i32 %i.010, 4 + %niter.nsub.3 = add i32 %niter, -4 + %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0 + br i1 %niter.ncmp.3, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body +} + +; CHECK-LABEL: mul_16x16 +; CHECK: @ %for.body + +; TODO: pre-inc store +; CHECK-DEFAULT: ldrsh{{.*}}, #2] +; CHECK-DEFAULT: ldrsh{{.*}}, #2] +; CHECK-DEFAULT: str{{.*}}, #16]! +; CHECK-DEFAULT: ldrsh{{.*}}, #2] +; CHECK-DEFAULT: ldrsh{{.*}}, #2] +; CHECK-DEFAULT: str{{.*}}, #4] +; CHECK-DEFAULT: ldrsh{{.*}}, #4] +; CHECK-DEFAULT: ldrsh{{.*}}, #4] +; CHECK-DEFAULT: str{{.*}}, #8] +; CHECK-DEFAULT: ldrsh{{.*}}, #8] +; CHECK-DEFAULT: ldrsh{{.*}}, #8] +; CHECK-DEFAULT: str{{.*}}, #12] + +; CHECK-COMPLEX: ldrsh +; CHECK-COMPLEX: ldrsh +; CHECK-COMPLEX: str +; CHECK-COMPLEX: ldrsh{{.*}}, #2] +; CHECK-COMPLEX: ldrsh{{.*}}, #2] +; CHECK-COMPLEX: str{{.*}}, #4] +; CHECK-COMPLEX: ldrsh{{.*}}, #4] +; CHECK-COMPLEX: ldrsh{{.*}}, #4] +; CHECK-COMPLEX: str{{.*}}, #8] +; CHECK-COMPLEX: ldrsh{{.*}}, #6] +; CHECK-COMPLEX: ldrsh{{.*}}, #6] +; CHECK-COMPLEX: str{{.*}}, #12] + +; DISABLED-NOT: ldr{{.*}}]! +; DISABLED-NOT: str{{.*}}]! + +; CHECK-T2: @ %for.body.epil +; CHECK-T2: ldrsh{{.*}}, #2]! +; CHECK-T2: ldrsh{{.*}}, #2]! +; CHECK-T2: str{{.*}}, #4]! + +define void @mul_16x16(i16* nocapture readonly %A, i16* nocapture readonly %B, i32* nocapture %C, i32 %N) { +entry: + %cmp9 = icmp eq i32 %N, 0 + br i1 %cmp9, label %for.cond.cleanup, label %for.body.preheader + +for.body.preheader: ; preds = %entry + %tmp = add i32 %N, -1 + %xtraiter = and i32 %N, 3 + %tmp1 = icmp ult i32 %tmp, 3 + br i1 %tmp1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new + +for.body.preheader.new: ; preds = %for.body.preheader + %unroll_iter = sub i32 %N, %xtraiter + br label %for.body + +for.cond.cleanup.loopexit.unr-lcssa: ; preds = %for.body, %for.body.preheader + %i.010.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.3, %for.body ] + %lcmp.mod = icmp eq i32 %xtraiter, 0 + br i1 %lcmp.mod, label %for.cond.cleanup, label %for.body.epil + +for.body.epil: ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa + %i.010.epil = phi i32 [ %inc.epil, %for.body.epil ], [ %i.010.unr, %for.cond.cleanup.loopexit.unr-lcssa ] + %epil.iter = phi i32 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ] + %arrayidx.epil = getelementptr inbounds i16, i16* %A, i32 %i.010.epil + %tmp2 = load i16, i16* %arrayidx.epil, align 2 + %conv.epil = sext i16 %tmp2 to i32 + %arrayidx1.epil = getelementptr inbounds i16, i16* %B, i32 %i.010.epil + %tmp3 = load i16, i16* %arrayidx1.epil, align 2 + %conv2.epil = sext i16 %tmp3 to i32 + %mul.epil = mul nsw i32 %conv2.epil, %conv.epil + %arrayidx3.epil = getelementptr inbounds i32, i32* %C, i32 %i.010.epil + store i32 %mul.epil, i32* %arrayidx3.epil, align 4 + %inc.epil = add nuw i32 %i.010.epil, 1 + %epil.iter.sub = add i32 %epil.iter, -1 + %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0 + br i1 %epil.iter.cmp, label %for.cond.cleanup, label %for.body.epil + +for.cond.cleanup: ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa, %entry + ret void + +for.body: ; preds = 
%for.body, %for.body.preheader.new + %i.010 = phi i32 [ 0, %for.body.preheader.new ], [ %inc.3, %for.body ] + %niter = phi i32 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ] + %arrayidx = getelementptr inbounds i16, i16* %A, i32 %i.010 + %tmp4 = load i16, i16* %arrayidx, align 2 + %conv = sext i16 %tmp4 to i32 + %arrayidx1 = getelementptr inbounds i16, i16* %B, i32 %i.010 + %tmp5 = load i16, i16* %arrayidx1, align 2 + %conv2 = sext i16 %tmp5 to i32 + %mul = mul nsw i32 %conv2, %conv + %arrayidx3 = getelementptr inbounds i32, i32* %C, i32 %i.010 + store i32 %mul, i32* %arrayidx3, align 4 + %inc = or i32 %i.010, 1 + %arrayidx.1 = getelementptr inbounds i16, i16* %A, i32 %inc + %tmp6 = load i16, i16* %arrayidx.1, align 2 + %conv.1 = sext i16 %tmp6 to i32 + %arrayidx1.1 = getelementptr inbounds i16, i16* %B, i32 %inc + %tmp7 = load i16, i16* %arrayidx1.1, align 2 + %conv2.1 = sext i16 %tmp7 to i32 + %mul.1 = mul nsw i32 %conv2.1, %conv.1 + %arrayidx3.1 = getelementptr inbounds i32, i32* %C, i32 %inc + store i32 %mul.1, i32* %arrayidx3.1, align 4 + %inc.1 = or i32 %i.010, 2 + %arrayidx.2 = getelementptr inbounds i16, i16* %A, i32 %inc.1 + %tmp8 = load i16, i16* %arrayidx.2, align 2 + %conv.2 = sext i16 %tmp8 to i32 + %arrayidx1.2 = getelementptr inbounds i16, i16* %B, i32 %inc.1 + %tmp9 = load i16, i16* %arrayidx1.2, align 2 + %conv2.2 = sext i16 %tmp9 to i32 + %mul.2 = mul nsw i32 %conv2.2, %conv.2 + %arrayidx3.2 = getelementptr inbounds i32, i32* %C, i32 %inc.1 + store i32 %mul.2, i32* %arrayidx3.2, align 4 + %inc.2 = or i32 %i.010, 3 + %arrayidx.3 = getelementptr inbounds i16, i16* %A, i32 %inc.2 + %tmp10 = load i16, i16* %arrayidx.3, align 2 + %conv.3 = sext i16 %tmp10 to i32 + %arrayidx1.3 = getelementptr inbounds i16, i16* %B, i32 %inc.2 + %tmp11 = load i16, i16* %arrayidx1.3, align 2 + %conv2.3 = sext i16 %tmp11 to i32 + %mul.3 = mul nsw i32 %conv2.3, %conv.3 + %arrayidx3.3 = getelementptr inbounds i32, i32* %C, i32 %inc.2 + store i32 %mul.3, i32* %arrayidx3.3, align 4 + %inc.3 = add i32 %i.010, 4 + %niter.nsub.3 = add i32 %niter, -4 + %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0 + br i1 %niter.ncmp.3, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body +} + +; CHECK-LABEL: mul_8x8_2d +; CHECK: @ %for.body4.us + +; CHECK-DEFAULT: ldr{{.*}}, #16]! +; CHECK-DEFAULT: ldrb{{.*}}, #4]! + +; DISABLED-NOT: ldr{{.*}}]! +; DISABLED-NOT: str{{.*}}]! + +; CHECK-T2: @ %for.body4.us.epil +; CHECK-T2: ldrb{{.*}}, #1]! +; CHECK-T2: ldr{{.*}}, #4]! 
+ +define void @mul_8x8_2d(i8* nocapture readonly %A, i8** nocapture readonly %B, i32** nocapture readonly %C, i32 %N, i32 %M) { +entry: + %cmp24 = icmp eq i32 %N, 0 + %cmp222 = icmp eq i32 %M, 0 + %or.cond = or i1 %cmp24, %cmp222 + br i1 %or.cond, label %for.cond.cleanup, label %for.cond1.preheader.us.preheader + +for.cond1.preheader.us.preheader: ; preds = %entry + %tmp = add i32 %M, -1 + %xtraiter = and i32 %M, 3 + %tmp1 = icmp ult i32 %tmp, 3 + %unroll_iter = sub i32 %M, %xtraiter + %lcmp.mod = icmp eq i32 %xtraiter, 0 + br label %for.cond1.preheader.us + +for.cond1.preheader.us: ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %for.cond1.preheader.us.preheader + %i.025.us = phi i32 [ %inc11.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %for.cond1.preheader.us.preheader ] + %arrayidx.us = getelementptr inbounds i8, i8* %A, i32 %i.025.us + %arrayidx5.us = getelementptr inbounds i8*, i8** %B, i32 %i.025.us + %arrayidx8.us = getelementptr inbounds i32*, i32** %C, i32 %i.025.us + %.pre = load i8*, i8** %arrayidx5.us, align 4 + %.pre30 = load i32*, i32** %arrayidx8.us, align 4 + br i1 %tmp1, label %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa, label %for.body4.us + +for.body4.us: ; preds = %for.body4.us, %for.cond1.preheader.us + %j.023.us = phi i32 [ %inc.us.3, %for.body4.us ], [ 0, %for.cond1.preheader.us ] + %niter = phi i32 [ %niter.nsub.3, %for.body4.us ], [ %unroll_iter, %for.cond1.preheader.us ] + %tmp2 = load i8, i8* %arrayidx.us, align 1 + %conv.us = zext i8 %tmp2 to i32 + %arrayidx6.us = getelementptr inbounds i8, i8* %.pre, i32 %j.023.us + %tmp3 = load i8, i8* %arrayidx6.us, align 1 + %conv7.us = zext i8 %tmp3 to i32 + %mul.us = mul nuw nsw i32 %conv7.us, %conv.us + %arrayidx9.us = getelementptr inbounds i32, i32* %.pre30, i32 %j.023.us + %tmp4 = load i32, i32* %arrayidx9.us, align 4 + %add.us = add nsw i32 %tmp4, %mul.us + store i32 %add.us, i32* %arrayidx9.us, align 4 + %inc.us = or i32 %j.023.us, 1 + %tmp5 = load i8, i8* %arrayidx.us, align 1 + %conv.us.1 = zext i8 %tmp5 to i32 + %arrayidx6.us.1 = getelementptr inbounds i8, i8* %.pre, i32 %inc.us + %tmp6 = load i8, i8* %arrayidx6.us.1, align 1 + %conv7.us.1 = zext i8 %tmp6 to i32 + %mul.us.1 = mul nuw nsw i32 %conv7.us.1, %conv.us.1 + %arrayidx9.us.1 = getelementptr inbounds i32, i32* %.pre30, i32 %inc.us + %tmp7 = load i32, i32* %arrayidx9.us.1, align 4 + %add.us.1 = add nsw i32 %tmp7, %mul.us.1 + store i32 %add.us.1, i32* %arrayidx9.us.1, align 4 + %inc.us.1 = or i32 %j.023.us, 2 + %tmp8 = load i8, i8* %arrayidx.us, align 1 + %conv.us.2 = zext i8 %tmp8 to i32 + %arrayidx6.us.2 = getelementptr inbounds i8, i8* %.pre, i32 %inc.us.1 + %tmp9 = load i8, i8* %arrayidx6.us.2, align 1 + %conv7.us.2 = zext i8 %tmp9 to i32 + %mul.us.2 = mul nuw nsw i32 %conv7.us.2, %conv.us.2 + %arrayidx9.us.2 = getelementptr inbounds i32, i32* %.pre30, i32 %inc.us.1 + %tmp10 = load i32, i32* %arrayidx9.us.2, align 4 + %add.us.2 = add nsw i32 %tmp10, %mul.us.2 + store i32 %add.us.2, i32* %arrayidx9.us.2, align 4 + %inc.us.2 = or i32 %j.023.us, 3 + %tmp11 = load i8, i8* %arrayidx.us, align 1 + %conv.us.3 = zext i8 %tmp11 to i32 + %arrayidx6.us.3 = getelementptr inbounds i8, i8* %.pre, i32 %inc.us.2 + %tmp12 = load i8, i8* %arrayidx6.us.3, align 1 + %conv7.us.3 = zext i8 %tmp12 to i32 + %mul.us.3 = mul nuw nsw i32 %conv7.us.3, %conv.us.3 + %arrayidx9.us.3 = getelementptr inbounds i32, i32* %.pre30, i32 %inc.us.2 + %tmp13 = load i32, i32* %arrayidx9.us.3, align 4 + %add.us.3 = add nsw i32 %tmp13, %mul.us.3 + store i32 %add.us.3, i32* 
%arrayidx9.us.3, align 4 + %inc.us.3 = add i32 %j.023.us, 4 + %niter.nsub.3 = add i32 %niter, -4 + %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0 + br i1 %niter.ncmp.3, label %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa, label %for.body4.us + +for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa: ; preds = %for.body4.us, %for.cond1.preheader.us + %j.023.us.unr = phi i32 [ 0, %for.cond1.preheader.us ], [ %inc.us.3, %for.body4.us ] + br i1 %lcmp.mod, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us.epil + +for.body4.us.epil: ; preds = %for.body4.us.epil, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa + %j.023.us.epil = phi i32 [ %inc.us.epil, %for.body4.us.epil ], [ %j.023.us.unr, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ] + %epil.iter = phi i32 [ %epil.iter.sub, %for.body4.us.epil ], [ %xtraiter, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ] + %tmp14 = load i8, i8* %arrayidx.us, align 1 + %conv.us.epil = zext i8 %tmp14 to i32 + %arrayidx6.us.epil = getelementptr inbounds i8, i8* %.pre, i32 %j.023.us.epil + %tmp15 = load i8, i8* %arrayidx6.us.epil, align 1 + %conv7.us.epil = zext i8 %tmp15 to i32 + %mul.us.epil = mul nuw nsw i32 %conv7.us.epil, %conv.us.epil + %arrayidx9.us.epil = getelementptr inbounds i32, i32* %.pre30, i32 %j.023.us.epil + %tmp16 = load i32, i32* %arrayidx9.us.epil, align 4 + %add.us.epil = add nsw i32 %tmp16, %mul.us.epil + store i32 %add.us.epil, i32* %arrayidx9.us.epil, align 4 + %inc.us.epil = add nuw i32 %j.023.us.epil, 1 + %epil.iter.sub = add i32 %epil.iter, -1 + %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0 + br i1 %epil.iter.cmp, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us.epil + +for.cond1.for.cond.cleanup3_crit_edge.us: ; preds = %for.body4.us.epil, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa + %inc11.us = add nuw i32 %i.025.us, 1 + %exitcond28 = icmp eq i32 %inc11.us, %N + br i1 %exitcond28, label %for.cond.cleanup, label %for.cond1.preheader.us + +for.cond.cleanup: ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %entry + ret void +} + +; CHECK-LABEL: mul_16x16_2d +; CHECK: @ %for.body4.us + +; CHECK-DEFAULT: ldr{{.*}}, #16]! +; CHECK-DEFAULT: ldrsh{{.*}}, #8]! + +; DISABLED-NOT: ldr{{.*}}]! +; DISABLED-NOT: str{{.*}}]! + +; CHECK-T2: @ %for.body4.us.epil +; CHECK-T2: ldrsh{{.*}}, #2]! +; CHECK-T2: ldr{{.*}}, #4]! 
+ +define void @mul_16x16_2d(i16* nocapture readonly %A, i16** nocapture readonly %B, i32** nocapture readonly %C, i32 %N, i32 %M) { +entry: + %cmp24 = icmp eq i32 %N, 0 + %cmp222 = icmp eq i32 %M, 0 + %or.cond = or i1 %cmp24, %cmp222 + br i1 %or.cond, label %for.cond.cleanup, label %for.cond1.preheader.us.preheader + +for.cond1.preheader.us.preheader: ; preds = %entry + %tmp = add i32 %M, -1 + %xtraiter = and i32 %M, 3 + %tmp1 = icmp ult i32 %tmp, 3 + %unroll_iter = sub i32 %M, %xtraiter + %lcmp.mod = icmp eq i32 %xtraiter, 0 + br label %for.cond1.preheader.us + +for.cond1.preheader.us: ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %for.cond1.preheader.us.preheader + %i.025.us = phi i32 [ %inc11.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %for.cond1.preheader.us.preheader ] + %arrayidx.us = getelementptr inbounds i16, i16* %A, i32 %i.025.us + %tmp2 = load i16, i16* %arrayidx.us, align 2 + %conv.us = sext i16 %tmp2 to i32 + %arrayidx5.us = getelementptr inbounds i16*, i16** %B, i32 %i.025.us + %tmp3 = load i16*, i16** %arrayidx5.us, align 4 + %arrayidx8.us = getelementptr inbounds i32*, i32** %C, i32 %i.025.us + %tmp4 = load i32*, i32** %arrayidx8.us, align 4 + br i1 %tmp1, label %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa, label %for.body4.us + +for.body4.us: ; preds = %for.body4.us, %for.cond1.preheader.us + %j.023.us = phi i32 [ %inc.us.3, %for.body4.us ], [ 0, %for.cond1.preheader.us ] + %niter = phi i32 [ %niter.nsub.3, %for.body4.us ], [ %unroll_iter, %for.cond1.preheader.us ] + %arrayidx6.us = getelementptr inbounds i16, i16* %tmp3, i32 %j.023.us + %tmp5 = load i16, i16* %arrayidx6.us, align 2 + %conv7.us = sext i16 %tmp5 to i32 + %mul.us = mul nsw i32 %conv7.us, %conv.us + %arrayidx9.us = getelementptr inbounds i32, i32* %tmp4, i32 %j.023.us + %tmp6 = load i32, i32* %arrayidx9.us, align 4 + %add.us = add nsw i32 %tmp6, %mul.us + store i32 %add.us, i32* %arrayidx9.us, align 4 + %inc.us = or i32 %j.023.us, 1 + %arrayidx6.us.1 = getelementptr inbounds i16, i16* %tmp3, i32 %inc.us + %tmp7 = load i16, i16* %arrayidx6.us.1, align 2 + %conv7.us.1 = sext i16 %tmp7 to i32 + %mul.us.1 = mul nsw i32 %conv7.us.1, %conv.us + %arrayidx9.us.1 = getelementptr inbounds i32, i32* %tmp4, i32 %inc.us + %tmp8 = load i32, i32* %arrayidx9.us.1, align 4 + %add.us.1 = add nsw i32 %tmp8, %mul.us.1 + store i32 %add.us.1, i32* %arrayidx9.us.1, align 4 + %inc.us.1 = or i32 %j.023.us, 2 + %arrayidx6.us.2 = getelementptr inbounds i16, i16* %tmp3, i32 %inc.us.1 + %tmp9 = load i16, i16* %arrayidx6.us.2, align 2 + %conv7.us.2 = sext i16 %tmp9 to i32 + %mul.us.2 = mul nsw i32 %conv7.us.2, %conv.us + %arrayidx9.us.2 = getelementptr inbounds i32, i32* %tmp4, i32 %inc.us.1 + %tmp10 = load i32, i32* %arrayidx9.us.2, align 4 + %add.us.2 = add nsw i32 %tmp10, %mul.us.2 + store i32 %add.us.2, i32* %arrayidx9.us.2, align 4 + %inc.us.2 = or i32 %j.023.us, 3 + %arrayidx6.us.3 = getelementptr inbounds i16, i16* %tmp3, i32 %inc.us.2 + %tmp11 = load i16, i16* %arrayidx6.us.3, align 2 + %conv7.us.3 = sext i16 %tmp11 to i32 + %mul.us.3 = mul nsw i32 %conv7.us.3, %conv.us + %arrayidx9.us.3 = getelementptr inbounds i32, i32* %tmp4, i32 %inc.us.2 + %tmp12 = load i32, i32* %arrayidx9.us.3, align 4 + %add.us.3 = add nsw i32 %tmp12, %mul.us.3 + store i32 %add.us.3, i32* %arrayidx9.us.3, align 4 + %inc.us.3 = add i32 %j.023.us, 4 + %niter.nsub.3 = add i32 %niter, -4 + %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0 + br i1 %niter.ncmp.3, label %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa, label %for.body4.us 
+ +for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa: ; preds = %for.body4.us, %for.cond1.preheader.us + %j.023.us.unr = phi i32 [ 0, %for.cond1.preheader.us ], [ %inc.us.3, %for.body4.us ] + br i1 %lcmp.mod, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us.epil + +for.body4.us.epil: ; preds = %for.body4.us.epil, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa + %j.023.us.epil = phi i32 [ %inc.us.epil, %for.body4.us.epil ], [ %j.023.us.unr, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ] + %epil.iter = phi i32 [ %epil.iter.sub, %for.body4.us.epil ], [ %xtraiter, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ] + %arrayidx6.us.epil = getelementptr inbounds i16, i16* %tmp3, i32 %j.023.us.epil + %tmp13 = load i16, i16* %arrayidx6.us.epil, align 2 + %conv7.us.epil = sext i16 %tmp13 to i32 + %mul.us.epil = mul nsw i32 %conv7.us.epil, %conv.us + %arrayidx9.us.epil = getelementptr inbounds i32, i32* %tmp4, i32 %j.023.us.epil + %tmp14 = load i32, i32* %arrayidx9.us.epil, align 4 + %add.us.epil = add nsw i32 %tmp14, %mul.us.epil + store i32 %add.us.epil, i32* %arrayidx9.us.epil, align 4 + %inc.us.epil = add nuw i32 %j.023.us.epil, 1 + %epil.iter.sub = add i32 %epil.iter, -1 + %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0 + br i1 %epil.iter.cmp, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us.epil + +for.cond1.for.cond.cleanup3_crit_edge.us: ; preds = %for.body4.us.epil, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa + %inc11.us = add nuw i32 %i.025.us, 1 + %exitcond28 = icmp eq i32 %inc11.us, %N + br i1 %exitcond28, label %for.cond.cleanup, label %for.cond1.preheader.us + +for.cond.cleanup: ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %entry + ret void +} + +; CHECK-LABEL: mac_8x8_2d +; CHECK: @ %for.body4.us + +; CHECK-BASE: ldrb{{.*}} +; CHECK-BASE: ldrb{{.*}}, #3] +; CHECK-BASE: str{{.*}}, lsl #2] +; CHECK-BASE: ldrb{{.*}} +; CHECK-BASE: ldrb{{.*}}, #4]! +; CHECK-BASE: str{{.*}}, lsl #2] +; CHECK-BASE: ldrb{{.*}} +; CHECK-BASE: ldrb{{.*}}, #1] +; CHECK-BASE: str{{.*}}, lsl #2] +; CHECK-BASE: ldrb{{.*}} +; CHECK-BASE: ldrb{{.*}}, #2] +; CHECK-BASE: str{{.*}}, lsl #2] + +; CHECK-COMPLEX: ldrb{{.*}} +; CHECK-COMPLEX: ldrb{{.*}} +; CHECK-COMPLEX: str{{.*}}, lsl #2] +; CHECK-COMPLEX: ldrb{{.*}} +; CHECK-COMPLEX: ldrb{{.*}}, #1] +; CHECK-COMPLEX: str{{.*}}, lsl #2] +; CHECK-COMPLEX: ldrb{{.*}} +; CHECK-COMPLEX: ldrb{{.*}}, #2] +; CHECK-COMPLEX: str{{.*}}, lsl #2] +; CHECK-COMPLEX: ldrb{{.*}} +; CHECK-COMPLEX: ldrb{{.*}}, #3] +; CHECK-COMPLEX: str{{.*}}, lsl #2] + +; DISABLED-NOT: ldr{{.*}}]! +; DISABLED-NOT: str{{.*}}]! + +; CHECK-T2: @ %for.body4.us.epil +; CHECK-T2: ldrb{{.*}}, #1]! 
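+; In the checks above, a trailing '!' matches ARM writeback addressing; as an
+; illustration (not an expected-output line), 'ldrb r0, [r1, #4]!' adds 4 to
+; r1, loads from the updated address and writes it back to r1, so no separate
+; add is needed to step the pointer for the next iteration.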
+ +define void @mac_8x8_2d(i8* nocapture readonly %A, i8** nocapture readonly %B, i32* nocapture %C, i32 %N, i32 %M) { +entry: + %cmp22 = icmp eq i32 %N, 0 + %cmp220 = icmp eq i32 %M, 0 + %or.cond = or i1 %cmp22, %cmp220 + br i1 %or.cond, label %for.cond.cleanup, label %for.cond1.preheader.us.preheader + +for.cond1.preheader.us.preheader: ; preds = %entry + %tmp = add i32 %M, -1 + %xtraiter = and i32 %M, 3 + %tmp1 = icmp ult i32 %tmp, 3 + %unroll_iter = sub i32 %M, %xtraiter + %lcmp.mod = icmp eq i32 %xtraiter, 0 + br label %for.cond1.preheader.us + +for.cond1.preheader.us: ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %for.cond1.preheader.us.preheader + %i.023.us = phi i32 [ %inc10.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %for.cond1.preheader.us.preheader ] + %arrayidx.us = getelementptr inbounds i8, i8* %A, i32 %i.023.us + %arrayidx5.us = getelementptr inbounds i8*, i8** %B, i32 %i.023.us + %arrayidx8.us = getelementptr inbounds i32, i32* %C, i32 %i.023.us + %.pre = load i8*, i8** %arrayidx5.us, align 4 + %.pre28 = load i32, i32* %arrayidx8.us, align 4 + br i1 %tmp1, label %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa, label %for.body4.us + +for.body4.us: ; preds = %for.body4.us, %for.cond1.preheader.us + %tmp2 = phi i32 [ %add.us.3, %for.body4.us ], [ %.pre28, %for.cond1.preheader.us ] + %j.021.us = phi i32 [ %inc.us.3, %for.body4.us ], [ 0, %for.cond1.preheader.us ] + %niter = phi i32 [ %niter.nsub.3, %for.body4.us ], [ %unroll_iter, %for.cond1.preheader.us ] + %tmp3 = load i8, i8* %arrayidx.us, align 1 + %conv.us = zext i8 %tmp3 to i32 + %arrayidx6.us = getelementptr inbounds i8, i8* %.pre, i32 %j.021.us + %tmp4 = load i8, i8* %arrayidx6.us, align 1 + %conv7.us = zext i8 %tmp4 to i32 + %mul.us = mul nuw nsw i32 %conv7.us, %conv.us + %add.us = add nsw i32 %mul.us, %tmp2 + store i32 %add.us, i32* %arrayidx8.us, align 4 + %inc.us = or i32 %j.021.us, 1 + %tmp5 = load i8, i8* %arrayidx.us, align 1 + %conv.us.1 = zext i8 %tmp5 to i32 + %arrayidx6.us.1 = getelementptr inbounds i8, i8* %.pre, i32 %inc.us + %tmp6 = load i8, i8* %arrayidx6.us.1, align 1 + %conv7.us.1 = zext i8 %tmp6 to i32 + %mul.us.1 = mul nuw nsw i32 %conv7.us.1, %conv.us.1 + %add.us.1 = add nsw i32 %mul.us.1, %add.us + store i32 %add.us.1, i32* %arrayidx8.us, align 4 + %inc.us.1 = or i32 %j.021.us, 2 + %tmp7 = load i8, i8* %arrayidx.us, align 1 + %conv.us.2 = zext i8 %tmp7 to i32 + %arrayidx6.us.2 = getelementptr inbounds i8, i8* %.pre, i32 %inc.us.1 + %tmp8 = load i8, i8* %arrayidx6.us.2, align 1 + %conv7.us.2 = zext i8 %tmp8 to i32 + %mul.us.2 = mul nuw nsw i32 %conv7.us.2, %conv.us.2 + %add.us.2 = add nsw i32 %mul.us.2, %add.us.1 + store i32 %add.us.2, i32* %arrayidx8.us, align 4 + %inc.us.2 = or i32 %j.021.us, 3 + %tmp9 = load i8, i8* %arrayidx.us, align 1 + %conv.us.3 = zext i8 %tmp9 to i32 + %arrayidx6.us.3 = getelementptr inbounds i8, i8* %.pre, i32 %inc.us.2 + %tmp10 = load i8, i8* %arrayidx6.us.3, align 1 + %conv7.us.3 = zext i8 %tmp10 to i32 + %mul.us.3 = mul nuw nsw i32 %conv7.us.3, %conv.us.3 + %add.us.3 = add nsw i32 %mul.us.3, %add.us.2 + store i32 %add.us.3, i32* %arrayidx8.us, align 4 + %inc.us.3 = add i32 %j.021.us, 4 + %niter.nsub.3 = add i32 %niter, -4 + %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0 + br i1 %niter.ncmp.3, label %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa, label %for.body4.us + +for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa: ; preds = %for.body4.us, %for.cond1.preheader.us + %.unr = phi i32 [ %.pre28, %for.cond1.preheader.us ], [ %add.us.3, 
%for.body4.us ] + %j.021.us.unr = phi i32 [ 0, %for.cond1.preheader.us ], [ %inc.us.3, %for.body4.us ] + br i1 %lcmp.mod, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us.epil + +for.body4.us.epil: ; preds = %for.body4.us.epil, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa + %tmp11 = phi i32 [ %add.us.epil, %for.body4.us.epil ], [ %.unr, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ] + %j.021.us.epil = phi i32 [ %inc.us.epil, %for.body4.us.epil ], [ %j.021.us.unr, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ] + %epil.iter = phi i32 [ %epil.iter.sub, %for.body4.us.epil ], [ %xtraiter, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ] + %tmp12 = load i8, i8* %arrayidx.us, align 1 + %conv.us.epil = zext i8 %tmp12 to i32 + %arrayidx6.us.epil = getelementptr inbounds i8, i8* %.pre, i32 %j.021.us.epil + %tmp13 = load i8, i8* %arrayidx6.us.epil, align 1 + %conv7.us.epil = zext i8 %tmp13 to i32 + %mul.us.epil = mul nuw nsw i32 %conv7.us.epil, %conv.us.epil + %add.us.epil = add nsw i32 %mul.us.epil, %tmp11 + store i32 %add.us.epil, i32* %arrayidx8.us, align 4 + %inc.us.epil = add nuw i32 %j.021.us.epil, 1 + %epil.iter.sub = add i32 %epil.iter, -1 + %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0 + br i1 %epil.iter.cmp, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us.epil + +for.cond1.for.cond.cleanup3_crit_edge.us: ; preds = %for.body4.us.epil, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa + %inc10.us = add nuw i32 %i.023.us, 1 + %exitcond26 = icmp eq i32 %inc10.us, %N + br i1 %exitcond26, label %for.cond.cleanup, label %for.cond1.preheader.us + +for.cond.cleanup: ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %entry + ret void +} + +; CHECK-LABEL: mac_16x16_2d +; CHECK: @ %for.body4.us + +; CHECK-BASE: ldrsh{{.*}}, #8]! +; CHECK-BASE: ldrsh{{.*}}, #2] +; CHECK-BASE: ldrsh{{.*}}, #4] +; CHECK-BASE: ldrsh{{.*}}, #6] + +; CHECK-COMPLEX: ldrsh{{.*}}, lsl #1] +; CHECK-COMPLEX: ldrsh{{.*}}, #2] +; CHECK-COMPLEX: ldrsh{{.*}}, #4] +; CHECK-COMPLEX: ldrsh{{.*}}, #6] + +; DISABLED-NOT: ldr{{.*}}]! + +; CHECK-T2: @ %for.body4.us.epil +; CHECK-T2: ldrsh{{.*}}, #2]! 
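+; For the base configuration, the i16 pointer is expected to advance once per
+; unrolled iteration via a pre-indexed load of #8 (4 elements x 2 bytes),
+; with the other three loads at immediate offsets #2, #4 and #6 from the
+; updated base.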
+ +define void @mac_16x16_2d(i16* nocapture readonly %A, i16** nocapture readonly %B, i32* nocapture %C, i32 %N, i32 %M) { +entry: + %cmp23 = icmp eq i32 %N, 0 + %cmp220 = icmp eq i32 %M, 0 + %or.cond = or i1 %cmp23, %cmp220 + br i1 %or.cond, label %for.cond.cleanup, label %for.cond1.preheader.us.preheader + +for.cond1.preheader.us.preheader: ; preds = %entry + %tmp = add i32 %M, -1 + %xtraiter = and i32 %M, 3 + %tmp1 = icmp ult i32 %tmp, 3 + %unroll_iter = sub i32 %M, %xtraiter + %lcmp.mod = icmp eq i32 %xtraiter, 0 + br label %for.cond1.preheader.us + +for.cond1.preheader.us: ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %for.cond1.preheader.us.preheader + %i.024.us = phi i32 [ %inc10.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %for.cond1.preheader.us.preheader ] + %arrayidx.us = getelementptr inbounds i16, i16* %A, i32 %i.024.us + %tmp2 = load i16, i16* %arrayidx.us, align 2 + %conv.us = sext i16 %tmp2 to i32 + %arrayidx5.us = getelementptr inbounds i16*, i16** %B, i32 %i.024.us + %tmp3 = load i16*, i16** %arrayidx5.us, align 4 + %arrayidx8.us = getelementptr inbounds i32, i32* %C, i32 %i.024.us + %arrayidx8.promoted.us = load i32, i32* %arrayidx8.us, align 4 + br i1 %tmp1, label %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa, label %for.body4.us + +for.body4.us: ; preds = %for.body4.us, %for.cond1.preheader.us + %add22.us = phi i32 [ %add.us.3, %for.body4.us ], [ %arrayidx8.promoted.us, %for.cond1.preheader.us ] + %j.021.us = phi i32 [ %inc.us.3, %for.body4.us ], [ 0, %for.cond1.preheader.us ] + %niter = phi i32 [ %niter.nsub.3, %for.body4.us ], [ %unroll_iter, %for.cond1.preheader.us ] + %arrayidx6.us = getelementptr inbounds i16, i16* %tmp3, i32 %j.021.us + %tmp4 = load i16, i16* %arrayidx6.us, align 2 + %conv7.us = sext i16 %tmp4 to i32 + %mul.us = mul nsw i32 %conv7.us, %conv.us + %add.us = add nsw i32 %mul.us, %add22.us + %inc.us = or i32 %j.021.us, 1 + %arrayidx6.us.1 = getelementptr inbounds i16, i16* %tmp3, i32 %inc.us + %tmp5 = load i16, i16* %arrayidx6.us.1, align 2 + %conv7.us.1 = sext i16 %tmp5 to i32 + %mul.us.1 = mul nsw i32 %conv7.us.1, %conv.us + %add.us.1 = add nsw i32 %mul.us.1, %add.us + %inc.us.1 = or i32 %j.021.us, 2 + %arrayidx6.us.2 = getelementptr inbounds i16, i16* %tmp3, i32 %inc.us.1 + %tmp6 = load i16, i16* %arrayidx6.us.2, align 2 + %conv7.us.2 = sext i16 %tmp6 to i32 + %mul.us.2 = mul nsw i32 %conv7.us.2, %conv.us + %add.us.2 = add nsw i32 %mul.us.2, %add.us.1 + %inc.us.2 = or i32 %j.021.us, 3 + %arrayidx6.us.3 = getelementptr inbounds i16, i16* %tmp3, i32 %inc.us.2 + %tmp7 = load i16, i16* %arrayidx6.us.3, align 2 + %conv7.us.3 = sext i16 %tmp7 to i32 + %mul.us.3 = mul nsw i32 %conv7.us.3, %conv.us + %add.us.3 = add nsw i32 %mul.us.3, %add.us.2 + %inc.us.3 = add i32 %j.021.us, 4 + %niter.nsub.3 = add i32 %niter, -4 + %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0 + br i1 %niter.ncmp.3, label %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa, label %for.body4.us + +for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa: ; preds = %for.body4.us, %for.cond1.preheader.us + %add.us.lcssa.ph = phi i32 [ undef, %for.cond1.preheader.us ], [ %add.us.3, %for.body4.us ] + %add22.us.unr = phi i32 [ %arrayidx8.promoted.us, %for.cond1.preheader.us ], [ %add.us.3, %for.body4.us ] + %j.021.us.unr = phi i32 [ 0, %for.cond1.preheader.us ], [ %inc.us.3, %for.body4.us ] + br i1 %lcmp.mod, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us.epil + +for.body4.us.epil: ; preds = %for.body4.us.epil, 
%for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa + %add22.us.epil = phi i32 [ %add.us.epil, %for.body4.us.epil ], [ %add22.us.unr, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ] + %j.021.us.epil = phi i32 [ %inc.us.epil, %for.body4.us.epil ], [ %j.021.us.unr, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ] + %epil.iter = phi i32 [ %epil.iter.sub, %for.body4.us.epil ], [ %xtraiter, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ] + %arrayidx6.us.epil = getelementptr inbounds i16, i16* %tmp3, i32 %j.021.us.epil + %tmp8 = load i16, i16* %arrayidx6.us.epil, align 2 + %conv7.us.epil = sext i16 %tmp8 to i32 + %mul.us.epil = mul nsw i32 %conv7.us.epil, %conv.us + %add.us.epil = add nsw i32 %mul.us.epil, %add22.us.epil + %inc.us.epil = add nuw i32 %j.021.us.epil, 1 + %epil.iter.sub = add i32 %epil.iter, -1 + %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0 + br i1 %epil.iter.cmp, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us.epil + +for.cond1.for.cond.cleanup3_crit_edge.us: ; preds = %for.body4.us.epil, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa + %add.us.lcssa = phi i32 [ %add.us.lcssa.ph, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ], [ %add.us.epil, %for.body4.us.epil ] + store i32 %add.us.lcssa, i32* %arrayidx8.us, align 4 + %inc10.us = add nuw i32 %i.024.us, 1 + %exitcond27 = icmp eq i32 %inc10.us, %N + br i1 %exitcond27, label %for.cond.cleanup, label %for.cond1.preheader.us + +for.cond.cleanup: ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %entry + ret void +} + +; CHECK-LABEL: mul32x32_backwards +; CHECK: @ %for.body + +; TODO: post increments for decreasing addresses +; CHECK-DEFAULT-NOT: ldr{{.*}}]! +; CHECK-DEFAULT-NOT: str{{.*}}]! + +; CHECK-COMPLEX-NOT: ldr{{.*}}]! +; CHECK-COMPLEX-NOT: str{{.*}}]! 
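+; This loop walks the arrays backwards; the CHECK-NOT lines above confirm
+; that no writeback ('!') accesses are currently generated for decreasing
+; addresses, which is what the TODO records.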
+ +define void @mul32x32_backwards(i32* nocapture %a, i32* nocapture readonly %b, i32* nocapture readonly %c, i32 %N) { +entry: + %i.08 = add i32 %N, -1 + %cmp9 = icmp sgt i32 %i.08, -1 + br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %xtraiter = and i32 %N, 3 + %lcmp.mod = icmp eq i32 %xtraiter, 0 + br i1 %lcmp.mod, label %for.body.prol.loopexit, label %for.body.prol + +for.body.prol: ; preds = %for.body.prol, %for.body.preheader + %i.010.prol = phi i32 [ %i.0.prol, %for.body.prol ], [ %i.08, %for.body.preheader ] + %prol.iter = phi i32 [ %prol.iter.sub, %for.body.prol ], [ %xtraiter, %for.body.preheader ] + %arrayidx.prol = getelementptr inbounds i32, i32* %b, i32 %i.010.prol + %tmp = load i32, i32* %arrayidx.prol, align 4 + %arrayidx1.prol = getelementptr inbounds i32, i32* %c, i32 %i.010.prol + %tmp1 = load i32, i32* %arrayidx1.prol, align 4 + %mul.prol = mul nsw i32 %tmp1, %tmp + %arrayidx2.prol = getelementptr inbounds i32, i32* %a, i32 %i.010.prol + store i32 %mul.prol, i32* %arrayidx2.prol, align 4 + %i.0.prol = add i32 %i.010.prol, -1 + %prol.iter.sub = add i32 %prol.iter, -1 + %prol.iter.cmp = icmp eq i32 %prol.iter.sub, 0 + br i1 %prol.iter.cmp, label %for.body.prol.loopexit, label %for.body.prol + +for.body.prol.loopexit: ; preds = %for.body.prol, %for.body.preheader + %i.010.unr = phi i32 [ %i.08, %for.body.preheader ], [ %i.0.prol, %for.body.prol ] + %tmp2 = icmp ult i32 %i.08, 3 + br i1 %tmp2, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %for.body.prol.loopexit, %entry + ret void + +for.body: ; preds = %for.body, %for.body.prol.loopexit + %i.010 = phi i32 [ %i.0.3, %for.body ], [ %i.010.unr, %for.body.prol.loopexit ] + %arrayidx = getelementptr inbounds i32, i32* %b, i32 %i.010 + %tmp3 = load i32, i32* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds i32, i32* %c, i32 %i.010 + %tmp4 = load i32, i32* %arrayidx1, align 4 + %mul = mul nsw i32 %tmp4, %tmp3 + %arrayidx2 = getelementptr inbounds i32, i32* %a, i32 %i.010 + store i32 %mul, i32* %arrayidx2, align 4 + %i.0 = add i32 %i.010, -1 + %arrayidx.1 = getelementptr inbounds i32, i32* %b, i32 %i.0 + %tmp5 = load i32, i32* %arrayidx.1, align 4 + %arrayidx1.1 = getelementptr inbounds i32, i32* %c, i32 %i.0 + %tmp6 = load i32, i32* %arrayidx1.1, align 4 + %mul.1 = mul nsw i32 %tmp6, %tmp5 + %arrayidx2.1 = getelementptr inbounds i32, i32* %a, i32 %i.0 + store i32 %mul.1, i32* %arrayidx2.1, align 4 + %i.0.1 = add i32 %i.010, -2 + %arrayidx.2 = getelementptr inbounds i32, i32* %b, i32 %i.0.1 + %tmp7 = load i32, i32* %arrayidx.2, align 4 + %arrayidx1.2 = getelementptr inbounds i32, i32* %c, i32 %i.0.1 + %tmp8 = load i32, i32* %arrayidx1.2, align 4 + %mul.2 = mul nsw i32 %tmp8, %tmp7 + %arrayidx2.2 = getelementptr inbounds i32, i32* %a, i32 %i.0.1 + store i32 %mul.2, i32* %arrayidx2.2, align 4 + %i.0.2 = add i32 %i.010, -3 + %arrayidx.3 = getelementptr inbounds i32, i32* %b, i32 %i.0.2 + %tmp9 = load i32, i32* %arrayidx.3, align 4 + %arrayidx1.3 = getelementptr inbounds i32, i32* %c, i32 %i.0.2 + %tmp10 = load i32, i32* %arrayidx1.3, align 4 + %mul.3 = mul nsw i32 %tmp10, %tmp9 + %arrayidx2.3 = getelementptr inbounds i32, i32* %a, i32 %i.0.2 + store i32 %mul.3, i32* %arrayidx2.3, align 4 + %i.0.3 = add i32 %i.010, -4 + %cmp.3 = icmp sgt i32 %i.0.3, -1 + br i1 %cmp.3, label %for.body, label %for.cond.cleanup +} + +; CHECK-LABEL: mul32x32_forwards +; CHECK: @ %for.body + +; CHECK-DEFAULT: ldr{{.*}}, #4] +; CHECK-DEFAULT: ldr{{.*}}, 
#4] +; CHECK-DEFAULT: str{{.*}}, #4] +; CHECK-DEFAULT: ldr{{.*}}, #8] +; CHECK-DEFAULT: ldr{{.*}}, #8] +; CHECK-DEFAULT: str{{.*}}, #8] +; CHECK-DEFAULT: ldr{{.*}}, #12] +; CHECK-DEFAULT: ldr{{.*}}, #12] +; CHECK-DEFAULT: str{{.*}}, #12] + +; CHECK-COMPLEX: ldr{{.*}}, #16]! +; CHECK-COMPLEX: ldr{{.*}}, #16]! +; CHECK-COMPLEX: str{{.*}}, #16]! + +; CHECK-T2: @ %for.body.epil +; CHECK-T2: ldr{{.*}}, #4]! +; CHECK-T2: ldr{{.*}}, #4]! +; CHECK-T2: str{{.*}}, #4]! + +define void @mul32x32_forwards(i32* nocapture %a, i32* nocapture readonly %b, i32* nocapture readonly %c, i32 %N) { +entry: + %cmp8 = icmp eq i32 %N, 0 + br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader + +for.body.preheader: ; preds = %entry + %tmp = add i32 %N, -1 + %xtraiter = and i32 %N, 3 + %tmp1 = icmp ult i32 %tmp, 3 + br i1 %tmp1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new + +for.body.preheader.new: ; preds = %for.body.preheader + %unroll_iter = sub i32 %N, %xtraiter + br label %for.body + +for.cond.cleanup.loopexit.unr-lcssa: ; preds = %for.body, %for.body.preheader + %i.09.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.3, %for.body ] + %lcmp.mod = icmp eq i32 %xtraiter, 0 + br i1 %lcmp.mod, label %for.cond.cleanup, label %for.body.epil + +for.body.epil: ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa + %i.09.epil = phi i32 [ %inc.epil, %for.body.epil ], [ %i.09.unr, %for.cond.cleanup.loopexit.unr-lcssa ] + %epil.iter = phi i32 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ] + %arrayidx.epil = getelementptr inbounds i32, i32* %b, i32 %i.09.epil + %tmp2 = load i32, i32* %arrayidx.epil, align 4 + %arrayidx1.epil = getelementptr inbounds i32, i32* %c, i32 %i.09.epil + %tmp3 = load i32, i32* %arrayidx1.epil, align 4 + %mul.epil = mul nsw i32 %tmp3, %tmp2 + %arrayidx2.epil = getelementptr inbounds i32, i32* %a, i32 %i.09.epil + store i32 %mul.epil, i32* %arrayidx2.epil, align 4 + %inc.epil = add nuw nsw i32 %i.09.epil, 1 + %epil.iter.sub = add i32 %epil.iter, -1 + %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0 + br i1 %epil.iter.cmp, label %for.cond.cleanup, label %for.body.epil + +for.cond.cleanup: ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa, %entry + ret void + +for.body: ; preds = %for.body, %for.body.preheader.new + %i.09 = phi i32 [ 0, %for.body.preheader.new ], [ %inc.3, %for.body ] + %niter = phi i32 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %b, i32 %i.09 + %tmp4 = load i32, i32* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds i32, i32* %c, i32 %i.09 + %tmp5 = load i32, i32* %arrayidx1, align 4 + %mul = mul nsw i32 %tmp5, %tmp4 + %arrayidx2 = getelementptr inbounds i32, i32* %a, i32 %i.09 + store i32 %mul, i32* %arrayidx2, align 4 + %inc = or i32 %i.09, 1 + %arrayidx.1 = getelementptr inbounds i32, i32* %b, i32 %inc + %tmp6 = load i32, i32* %arrayidx.1, align 4 + %arrayidx1.1 = getelementptr inbounds i32, i32* %c, i32 %inc + %tmp7 = load i32, i32* %arrayidx1.1, align 4 + %mul.1 = mul nsw i32 %tmp7, %tmp6 + %arrayidx2.1 = getelementptr inbounds i32, i32* %a, i32 %inc + store i32 %mul.1, i32* %arrayidx2.1, align 4 + %inc.1 = or i32 %i.09, 2 + %arrayidx.2 = getelementptr inbounds i32, i32* %b, i32 %inc.1 + %tmp8 = load i32, i32* %arrayidx.2, align 4 + %arrayidx1.2 = getelementptr inbounds i32, i32* %c, i32 %inc.1 + %tmp9 = load i32, i32* %arrayidx1.2, align 4 + %mul.2 = mul nsw i32 %tmp9, %tmp8 + %arrayidx2.2 = 
getelementptr inbounds i32, i32* %a, i32 %inc.1 + store i32 %mul.2, i32* %arrayidx2.2, align 4 + %inc.2 = or i32 %i.09, 3 + %arrayidx.3 = getelementptr inbounds i32, i32* %b, i32 %inc.2 + %tmp10 = load i32, i32* %arrayidx.3, align 4 + %arrayidx1.3 = getelementptr inbounds i32, i32* %c, i32 %inc.2 + %tmp11 = load i32, i32* %arrayidx1.3, align 4 + %mul.3 = mul nsw i32 %tmp11, %tmp10 + %arrayidx2.3 = getelementptr inbounds i32, i32* %a, i32 %inc.2 + store i32 %mul.3, i32* %arrayidx2.3, align 4 + %inc.3 = add nuw nsw i32 %i.09, 4 + %niter.nsub.3 = add i32 %niter, -4 + %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0 + br i1 %niter.ncmp.3, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body +} diff --git a/test/Transforms/LoopStrengthReduce/ARM/complexity.ll b/test/Transforms/LoopStrengthReduce/ARM/complexity.ll index f2cc0a5a6f8..197bb53ab51 100644 --- a/test/Transforms/LoopStrengthReduce/ARM/complexity.ll +++ b/test/Transforms/LoopStrengthReduce/ARM/complexity.ll @@ -1,21 +1,15 @@ target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" -; RUN: opt -mtriple=thumbv7em %s -S -loop-reduce -lsr-complexity-limit=65536 -o - | FileCheck %s --check-prefix=CHECK-DEFAULT -; RUN: opt -mtriple=thumbv7em %s -S -loop-reduce -lsr-complexity-limit=2147483647 -o - | FileCheck %s --check-prefix=CHECK-COMPLEX +; RUN: opt -mtriple=thumbv7em %s -S -loop-reduce -lsr-complexity-limit=65536 -o - | FileCheck %s +; RUN: opt -mtriple=thumbv7em %s -S -loop-reduce -lsr-complexity-limit=2147483647 -o - | FileCheck %s -; CHECK-DEFAULT-LABEL: for.body12.us.us: -; CHECK-DEFAULT: phi i32 -; CHECK-DEFAULT: [[LSR_IV:%[^ ]+]] = phi i32 [ [[LSR_IV_NEXT:%[^ ]+]], %for.body12.us.us ], [ 0, %for.cond9.preheader.us.us ] -; CHECK-DEFAULT: phi i32 -; CHECK-DEFAULT: [[LSR_IV_NEXT]] = add i32 [[LSR_IV]], 8 - -; CHECK-COMPLEX-LABEL: for.body12.us.us: -; CHECK-COMPLEX: phi i32 -; CHECK-COMPLEX: [[LSR_IV6:%[^ ]+]] = phi i16* [ [[SCEVGEP7:%[^ ]+]], %for.body12.us.us ], [ [[SCEVGEP5:%[^ ]+]], %for.cond9.preheader.us.us ] -; CHECK-COMPLEX: [[LSR_IV:%[^ ]+]] = phi i16* [ [[SCEVGEP1:%[^ ]+]], %for.body12.us.us ], [ [[SCEVGEP:%[^ ]+]], %for.cond9.preheader.us.us ] -; CHECK-COMPLEX: phi i32 -; CHECK-COMPLEX: [[SCEVGEP1]] = getelementptr i16, i16* [[LSR_IV]], i32 4 -; CHECK-COMPLEX: [[SCEVGEP7]] = getelementptr i16, i16* [[LSR_IV6]], i32 4 +; CHECK-LABEL: for.body12.us.us: +; CHECK: [[LSR_IV6:%[^ ]+]] = phi i16* [ [[SCEVGEP7:%[^ ]+]], %for.body12.us.us ], [ [[SCEVGEP5:%[^ ]+]], %for.cond9.preheader.us.us ] +; CHECK: phi i32 +; CHECK: [[LSR_IV:%[^ ]+]] = phi i16* [ [[SCEVGEP1:%[^ ]+]], %for.body12.us.us ], [ [[SCEVGEP:%[^ ]+]], %for.cond9.preheader.us.us ] +; CHECK: phi i32 +; CHECK: [[SCEVGEP1]] = getelementptr i16, i16* [[LSR_IV]], i32 4 +; CHECK: [[SCEVGEP7]] = getelementptr i16, i16* [[LSR_IV6]], i32 4 define void @convolve(i16** nocapture readonly %input_image, i16** nocapture readonly %filter, i32 %filter_dim, i32 %out_width, i32 %out_height, i32** nocapture readonly %convolved) { entry: -- 2.40.0