From 132df1d1cd7e972bebf265956881c59751550cb0 Mon Sep 17 00:00:00 2001
From: Sam Parker
Date: Mon, 30 Sep 2019 08:03:23 +0000
Subject: [PATCH] [ARM][MVE] Change VCTP operand

The VCTP instruction will calculate the predicate mask based upon the
number of elements that need to be processed. I had inserted the sub
before the vctp intrinsic and supplied it as the operand, but this is
incorrect: the phi should feed the vctp directly, as the sub is
calculating the value for the next iteration.

Differential Revision: https://reviews.llvm.org/D67921

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@373188 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/ARM/MVETailPredication.cpp         |  6 ++--
 .../LowOverheadLoops/basic-tail-pred.ll       | 14 ++++----
 .../cond-vector-reduce-mve-codegen.ll         | 28 ++++++++-------
 .../LowOverheadLoops/mve-tail-data-types.ll   | 21 ++++++------
 .../CodeGen/Thumb2/LowOverheadLoops/nested.ll |  2 +-
 .../LowOverheadLoops/tail-pred-widen.ll       |  2 +-
 .../Thumb2/LowOverheadLoops/tail-reduce.ll    |  4 +--
 .../LowOverheadLoops/vector-arith-codegen.ll  | 34 +++++++++++++------
 .../vector-reduce-mve-tail.ll                 |  4 +--
 9 files changed, 66 insertions(+), 49 deletions(-)

diff --git a/lib/Target/ARM/MVETailPredication.cpp b/lib/Target/ARM/MVETailPredication.cpp
index 844eafbcb38..4db8ab17c49 100644
--- a/lib/Target/ARM/MVETailPredication.cpp
+++ b/lib/Target/ARM/MVETailPredication.cpp
@@ -491,13 +491,13 @@ bool MVETailPredication::TryConvert(Value *TripCount) {
     case 16: VCTPID = Intrinsic::arm_vctp8; break;
     }
     Function *VCTP = Intrinsic::getDeclaration(M, VCTPID);
-    // TODO: This add likely already exists in the loop.
-    Value *Remaining = Builder.CreateSub(Processed, Factor);
-    Value *TailPredicate = Builder.CreateCall(VCTP, Remaining);
+    Value *TailPredicate = Builder.CreateCall(VCTP, Processed);
     Predicate->replaceAllUsesWith(TailPredicate);
     NewPredicates[Predicate] = cast<CallInst>(TailPredicate);
 
     // Add the incoming value to the new phi.
+    // TODO: This add likely already exists in the loop.
+    Value *Remaining = Builder.CreateSub(Processed, Factor);
     Processed->addIncoming(Remaining, L->getLoopLatch());
     LLVM_DEBUG(dbgs() << "TP: Insert processed elements phi: "
                << *Processed << "\n"
diff --git a/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll b/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll
index f01e6ab1918..79c81ca7a44 100644
--- a/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll
@@ -4,8 +4,8 @@
 ; CHECK: vector.body:
 ; CHECK: %index = phi i32
 ; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
+; CHECK: [[VCTP:%[^ ]+]] = call <16 x i1> @llvm.arm.vctp8(i32 [[ELEMS]])
 ; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 16
-; CHECK: [[VCTP:%[^ ]+]] = call <16 x i1> @llvm.arm.vctp8(i32 [[REMAINING]])
 ; CHECK: [[LD0:%[^ ]+]] = tail call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* {{.*}}, i32 4, <16 x i1> [[VCTP]], <16 x i8> undef)
 ; CHECK: [[LD1:%[^ ]+]] = tail call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* {{.*}}, i32 4, <16 x i1> [[VCTP]], <16 x i8> undef)
 ; CHECK: tail call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> {{.*}}, <16 x i8>* {{.*}}, i32 4, <16 x i1> [[VCTP]])
@@ -57,8 +57,8 @@ for.cond.cleanup: ; preds = %vector.body, %entry
 ; CHECK: vector.body:
 ; CHECK: %index = phi i32
 ; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
+; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.vctp16(i32 [[ELEMS]])
 ; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 8
-; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.vctp16(i32 [[REMAINING]])
 ; CHECK: [[LD0:%[^ ]+]] = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* {{.*}}, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef)
 ; CHECK: [[LD1:%[^ ]+]] = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* {{.*}}, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef)
 ; CHECK: tail call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> {{.*}}, <8 x i16>* {{.*}}, i32 4, <8 x i1> [[VCTP]])
@@ -109,8 +109,8 @@ for.cond.cleanup: ; preds = %vector.body, %entry
 ; CHECK-LABEL: mul_v4i32
 ; CHECK: vector.body:
 ; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
+; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[ELEMS]])
 ; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 4
-; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[REMAINING]])
 ; CHECK: [[LD0:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
 ; CHECK: [[LD1:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
 ; CHECK: tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> {{.*}}, <4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]])
@@ -162,8 +162,8 @@ for.cond.cleanup: ; preds = %vector.body, %entry
 ; CHECK: vector.body:
 ; CHECK: %index = phi i32
 ; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
+; CHECK: [[VCTP:%[^ ]+]] = call <2 x i1> @llvm.arm.vctp64(i32 [[ELEMS]])
 ; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 2
-; CHECK: [[VCTP:%[^ ]+]] = call <2 x i1> @llvm.arm.vctp64(i32 [[REMAINING]])
 ; CHECK: [[LD0:%[^ ]+]] = tail call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* {{.*}}, i32 4, <2 x i1> [[VCTP]], <2 x i64> undef)
 ; CHECK: tail call void @llvm.masked.store.v2i64.p0v2i64(<2 x i64> [[LD0]], <2 x i64>* {{.*}}, i32 4, <2 x i1> [[VCTP]])
 define void @copy_v2i64(i64* %a, i64* %b, i32 %N) {
@@ -210,8 +210,8 @@ for.cond.cleanup: ; preds = %vector.body, %entry
 ; CHECK: vector.body:
 ; CHECK: %index = phi i32
 ; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
+; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[ELEMS]])
 ; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 4
-; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[REMAINING]])
 ; CHECK: [[LD0:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
 ; CHECK: [[LD1:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
 ; CHECK: tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> {{.*}}, <4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]])
@@ -268,8 +268,8 @@ for.cond.cleanup: ; preds = %vector.body, %entry
 ; One of the loads now uses ult predicate.
 ; CHECK-LABEL: mismatch_load_pred
 ; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
+; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[ELEMS]])
 ; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 4
-; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[REMAINING]])
 ; CHECK: [[LD0:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
 ; CHECK: [[LD1:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> %wrong, <4 x i32> undef)
 ; CHECK: tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> {{.*}}, <4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]])
@@ -322,8 +322,8 @@ for.cond.cleanup: ; preds = %vector.body, %entry
 ; CHECK-LABEL: mismatch_store_pred
 ; CHECK: %index = phi i32
 ; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
+; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[ELEMS]])
 ; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 4
-; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[REMAINING]])
 ; CHECK: [[LD0:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
 ; CHECK: [[LD1:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
 ; CHECK: tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> {{.*}}, <4 x i32>* {{.*}}, i32 4, <4 x i1> %wrong)
diff --git a/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll b/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll
index d701e0f1b57..5900dd9ac66 100644
--- a/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll
@@ -3,8 +3,8 @@
 ; CHECK-LABEL: vpsel_mul_reduce_add
 ; CHECK: dls lr, lr
 ; CHECK: [[LOOP:.LBB[0-9_]+]]:
-; CHECK: sub{{.*}} [[ELEMS:r[0-9]+]], #4
-; CHECK: vctp.32 [[ELEMS]]
+; CHECK: vctp.32 [[ELEMS:r[0-9]+]]
+; CHECK: mov [[ELEMS_OUT:r[0-9]+]], [[ELEMS]]
 ; CHECK: vstr p0, [sp
 ; CHECK: vpstt
 ; CHECK-NEXT: vldrwt.u32
@@ -14,8 +14,9 @@
 ; CHECK: vldr p0, [sp
 ; CHECK: vpst
 ; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r0]
+; CHECK: sub{{.*}} [[ELEMS]], [[ELEMS_OUT]], #4
 ; CHECK: le lr, [[LOOP]]
-; CHECK: vctp.32 [[ELEMS]]
+; CHECK: vctp.32 [[ELEMS_OUT]]
 ; CHECK-NEXT: vpsel
 ; CHECK-NEXT: vaddv.u32
 define dso_local i32 @vpsel_mul_reduce_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture readonly %c, i32 %N) {
@@ -71,8 +72,8 @@ for.cond.cleanup: ; preds = %middle.block, %entr
 ; CHECK-LABEL: vpsel_mul_reduce_add_2
 ; CHECK: dls lr, lr
 ; CHECK: [[LOOP:.LBB[0-9_]+]]:
-; CHECK: sub{{.*}} [[ELEMS:r[0-9]+]], #4
-; CHECK: vctp.32 [[ELEMS]]
+; CHECK: vctp.32 [[ELEMS:r[0-9]+]]
+; CHECK: mov [[ELEMS_OUT:r[0-9]+]], [[ELEMS]]
 ; CHECK: vstr p0, [sp
 ; CHECK: vpstt
 ; CHECK-NEXT: vldrwt.u32
@@ -85,8 +86,9 @@ for.cond.cleanup: ; preds = %middle.block, %entr
 ; CHECK: vldr p0, [sp
 ; CHECK: vpst
 ; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r0]
+; CHECK: sub{{.*}} [[ELEMS]], [[ELEMS_OUT]], #4
 ; CHECK: le lr, [[LOOP]]
-; CHECK: vctp.32 [[ELEMS]]
+; CHECK: vctp.32 [[ELEMS_OUT]]
 ; CHECK-NEXT: vpsel
 ; CHECK-NEXT: vaddv.u32
 define dso_local i32 @vpsel_mul_reduce_add_2(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b,
@@ -147,17 +149,18 @@ for.cond.cleanup: ; preds = %middle.block, %entr
 ; CHECK-LABEL: and_mul_reduce_add
 ; CHECK: dls lr, lr
 ; CHECK: [[LOOP:.LBB[0-9_]+]]:
-; CHECK: sub{{.*}} [[ELEMS:r[0-9]+]],{{.*}}#4
-; CHECK: vctp.32 [[ELEMS]]
+; CHECK: vctp.32 [[ELEMS:r[0-9]+]]
 ; CHECK: vpstt
 ; CHECK-NEXT: vldrwt.u32
 ; CHECK-NEXT: vldrwt.u32
+; CHECK: mov [[ELEMS_OUT:r[0-9]+]], [[ELEMS]]
 ; CHECK: vpsttt
 ; CHECK-NEXT: vcmpt.i32 eq, {{.*}}, zr
 ; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r3]
 ; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r2]
+; CHECK: sub{{.*}} [[ELEMS]],{{.*}}#4
 ; CHECK: le lr, [[LOOP]]
-; CHECK: vctp.32 [[ELEMS]]
+; CHECK: vctp.32 [[ELEMS_OUT]]
 ; CHECK: vpsel
 define dso_local i32 @and_mul_reduce_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b,
                                          i32* noalias nocapture readonly %c, i32* noalias nocapture readonly %d, i32 %N) {
@@ -215,9 +218,9 @@ for.cond.cleanup: ; preds = %middle.block, %entr
 ; CHECK-LABEL: or_mul_reduce_add
 ; CHECK: dls lr, lr
 ; CHECK: [[LOOP:.LBB[0-9_]+]]:
-; CHECK: sub{{.*}} [[ELEMS:r[0-9]+]],{{.*}}#4
-; CHECK: vctp.32 [[ELEMS]]
+; CHECK: vctp.32 [[ELEMS:r[0-9]+]]
 ; CHECK: vstr p0, [sp
+; CHECK: mov [[ELEMS_OUT:r[0-9]+]], [[ELEMS]]
 ; CHECK: vpstt
 ; CHECK-NEXT: vldrwt.u32
 ; CHECK-NEXT: vldrwt.u32
@@ -226,12 +229,13 @@ for.cond.cleanup: ; preds = %middle.block, %entr
 ; CHECK: vldr p0, [sp
 ; CHECK: vmrs [[VCTP:r[0-9]+]], p0
 ; CHECK: orr{{.*}} [[VCMP]], [[VCTP]]
+; CHECK: sub{{.*}} [[ELEMS:r[0-9]+]], [[ELEMS_OUT]], #4
 ; CHECK-NEXT: vmsr p0
 ; CHECK-NEXT: vpstt
 ; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r3]
 ; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r2]
 ; CHECK: le lr, [[LOOP]]
-; CHECK: vctp.32 [[ELEMS]]
+; CHECK: vctp.32 [[ELEMS_OUT]]
 ; CHECK: vpsel
 define dso_local i32 @or_mul_reduce_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b,
                                         i32* noalias nocapture readonly %c, i32* noalias nocapture readonly %d, i32 %N) {
diff --git a/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll b/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll
index 45ca7ce5172..17f9d26ec03 100644
--- a/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll
@@ -500,16 +500,17 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_int(i32 %a, i32* nocapture readonly
 ; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB4_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    subs r2, #4
-; CHECK-NEXT:    vmov q1, q0
 ; CHECK-NEXT:    vctp.32 r2
+; CHECK-NEXT:    mov r3, r2
 ; CHECK-NEXT:    vpst
 ; CHECK-NEXT:    vldrwt.u32 q2, [r1]
 ; CHECK-NEXT:    adds r1, #16
+; CHECK-NEXT:    subs r2, #4
+; CHECK-NEXT:    vmov q1, q0
 ; CHECK-NEXT:    vmla.u32 q0, q2, r0
 ; CHECK-NEXT:    le lr, .LBB4_1
 ; CHECK-NEXT:  @ %bb.2: @ %middle.block
-; CHECK-NEXT:    vctp.32 r2
+; CHECK-NEXT:    vctp.32 r3
 ; CHECK-NEXT:    vpsel q0, q0, q1
 ; CHECK-NEXT:    vaddv.u32 r0, q0
 ; CHECK-NEXT:    pop {r7, pc}
@@ -607,7 +608,6 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_char(i8* nocapture readonly
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vadd.i32 q4, q1, r4
 ; CHECK-NEXT:    @ implicit-def: $q5
-; CHECK-NEXT:    sub.w r12, r12, #4
 ; CHECK-NEXT:    vcmp.u32 cs, q0, q4
 ; CHECK-NEXT:    @ implicit-def: $q4
 ; CHECK-NEXT:    vmrs r6, p0
@@ -681,6 +681,7 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_char(i8* nocapture readonly
 ; CHECK-NEXT:    vpst
 ; CHECK-NEXT:    vstrwt.32 q4, [r3]
 ; CHECK-NEXT:    adds r3, #16
+; CHECK-NEXT:    sub.w r12, r12, #4
 ; CHECK-NEXT:    le lr, .LBB5_5
 ; CHECK-NEXT:    b .LBB5_12
 ; CHECK-NEXT:  .LBB5_6: @ %for.body.preheader.new
@@ -903,10 +904,9 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_short(i16* nocapture readon
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vadd.i32 q2, q1, r4
 ; CHECK-NEXT:    @ implicit-def: $q3
-; CHECK-NEXT:    sub.w r12, r12, #4
+; CHECK-NEXT:    adds r4, #4
 ; CHECK-NEXT:    vcmp.u32 cs, q0, q2
 ; CHECK-NEXT:    @ implicit-def: $q2
-; CHECK-NEXT:    adds r4, #4
 ; CHECK-NEXT:    vmrs r6, p0
 ; CHECK-NEXT:    and r5, r6, #1
 ; CHECK-NEXT:    rsbs r7, r5, #0
@@ -977,6 +977,7 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_short(i16* nocapture readon
 ; CHECK-NEXT:    vpst
 ; CHECK-NEXT:    vstrwt.32 q2, [r3]
 ; CHECK-NEXT:    adds r3, #16
+; CHECK-NEXT:    sub.w r12, r12, #4
 ; CHECK-NEXT:    le lr, .LBB6_2
 ; CHECK-NEXT:  .LBB6_3: @ %for.cond.cleanup
 ; CHECK-NEXT:    add sp, #8
@@ -1084,7 +1085,6 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_uchar(i8* nocapture readonl
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vadd.i32 q4, q1, r4
 ; CHECK-NEXT:    @ implicit-def: $q5
-; CHECK-NEXT:    sub.w r12, r12, #4
 ; CHECK-NEXT:    vcmp.u32 cs, q0, q4
 ; CHECK-NEXT:    @ implicit-def: $q4
 ; CHECK-NEXT:    vmrs r6, p0
@@ -1158,6 +1158,7 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_uchar(i8* nocapture readonl
 ; CHECK-NEXT:    vpst
 ; CHECK-NEXT:    vstrwt.32 q4, [r3]
 ; CHECK-NEXT:    adds r3, #16
+; CHECK-NEXT:    sub.w r12, r12, #4
 ; CHECK-NEXT:    le lr, .LBB7_5
 ; CHECK-NEXT:    b .LBB7_12
 ; CHECK-NEXT:  .LBB7_6: @ %for.body.preheader.new
@@ -1380,10 +1381,9 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_ushort(i16* nocapture reado
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vadd.i32 q2, q1, r4
 ; CHECK-NEXT:    @ implicit-def: $q3
-; CHECK-NEXT:    sub.w r12, r12, #4
+; CHECK-NEXT:    adds r4, #4
 ; CHECK-NEXT:    vcmp.u32 cs, q0, q2
 ; CHECK-NEXT:    @ implicit-def: $q2
-; CHECK-NEXT:    adds r4, #4
 ; CHECK-NEXT:    vmrs r6, p0
 ; CHECK-NEXT:    and r5, r6, #1
 ; CHECK-NEXT:    rsbs r7, r5, #0
@@ -1454,6 +1454,7 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_ushort(i16* nocapture reado
 ; CHECK-NEXT:    vpst
 ; CHECK-NEXT:    vstrwt.32 q2, [r3]
 ; CHECK-NEXT:    adds r3, #16
+; CHECK-NEXT:    sub.w r12, r12, #4
 ; CHECK-NEXT:    le lr, .LBB8_2
 ; CHECK-NEXT:  .LBB8_3: @ %for.cond.cleanup
 ; CHECK-NEXT:    add sp, #8
@@ -1550,8 +1551,8 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_int(i32* nocapture readonly
 ; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB9_5: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    sub.w r12, r12, #4
 ; CHECK-NEXT:    vctp.32 r12
+; CHECK-NEXT:    sub.w r12, r12, #4
 ; CHECK-NEXT:    vpstt
 ; CHECK-NEXT:    vldrwt.u32 q0, [r0]
 ; CHECK-NEXT:    vldrwt.u32 q1, [r1]
diff --git a/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll b/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll
index a5e922858f2..6d5516249e2 100644
--- a/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll
@@ -69,8 +69,8 @@ for.cond.cleanup: ; preds = %middle.block, %entr
 ; CHECK: phi
 ; CHECK: phi
 ; CHECK: [[IV:%[^ ]+]] = phi i32 [ %N, %for.cond1.preheader.us ], [ [[REM:%[^ ]+]], %vector.body ]
+; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[IV]])
 ; CHECK: [[REM]] = sub i32 [[IV]], 4
-; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[REM]])
 ; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
 ; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
 define void @mat_vec_i32(i32** nocapture readonly %A, i32* nocapture readonly %B, i32* noalias nocapture %C, i32 %N) {
diff --git a/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-widen.ll b/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-widen.ll
index c1b957b9657..70e272ffc0d 100644
--- a/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-widen.ll
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-widen.ll
@@ -50,8 +50,8 @@ for.cond.cleanup: ; preds = %vector.body, %entry
 
 ; CHECK-LABEL: expand_v8i16_v4i32
 ; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[ELEMS_REM:%[^ ]+]], %vector.body ]
+; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.vctp16(i32 [[ELEMS]])
 ; CHECK: [[ELEMS_REM]] = sub i32 [[ELEMS]], 8
-; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.vctp16(i32 [[ELEMS_REM]])
 ; CHECK: tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* {{.*}}, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef)
 ; CHECK: %store.pred = icmp ule <4 x i32> %induction.store
 ; CHECK: tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> {{.*}}, <4 x i32>* {{.*}}, i32 4, <4 x i1> %store.pred)
diff --git a/test/CodeGen/Thumb2/LowOverheadLoops/tail-reduce.ll b/test/CodeGen/Thumb2/LowOverheadLoops/tail-reduce.ll
index b30806001c1..7cdd28fd0f3 100644
--- a/test/CodeGen/Thumb2/LowOverheadLoops/tail-reduce.ll
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/tail-reduce.ll
@@ -5,8 +5,8 @@
 ; CHECK: phi <8 x i16> [ zeroinitializer, %entry ]
 ; CHECK: phi i32
 ; CHECK: [[PHI:%[^ ]+]] = phi i32 [ %N, %entry ], [ [[ELEMS:%[^ ]+]], %vector.body ]
+; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.vctp16(i32 [[PHI]])
 ; CHECK: [[ELEMS]] = sub i32 [[PHI]], 8
-; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.vctp16(i32 [[ELEMS]])
 ; CHECK: call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp4, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef)
 ; CHECK: call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp6, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef)
 define i16 @reduction_i32(i16* nocapture readonly %A, i16* nocapture readonly %B, i32 %N) {
@@ -63,8 +63,8 @@ middle.block: ; preds = %vector.body
 ; CHECK: phi <8 x i16> [ zeroinitializer, %entry ]
 ; CHECK: phi i32
 ; CHECK: [[PHI:%[^ ]+]] = phi i32 [ %N, %entry ], [ [[ELEMS:%[^ ]+]], %vector.body ]
+; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.vctp16(i32 [[PHI]])
 ; CHECK: [[ELEMS]] = sub i32 [[PHI]], 8
-; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.vctp16(i32 [[ELEMS]])
 ; CHECK: call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp4, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef)
 define i16 @reduction_i32_with_scalar(i16* nocapture readonly %A, i16 %B, i32 %N) local_unnamed_addr {
 entry:
diff --git a/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll b/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll
index 1612e26e3f7..bd691963ad3 100644
--- a/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll
@@ -3,13 +3,14 @@
 ; CHECK-LABEL: mul_reduce_add
 ; CHECK: dls lr,
 ; CHECK: [[LOOP:.LBB[0-9_]+]]:
-; CHECK: sub{{.*}} [[ELEMS:r[0-9]+]], #4
-; CHECK: vctp.32 [[ELEMS]]
+; CHECK: vctp.32 [[ELEMS:r[0-9]+]]
 ; CHECK: vpstt
 ; CHECK-NEXT: vldrwt.u32
 ; CHECK-NEXT: vldrwt.u32
+; CHECK: mov [[ELEMS_OUT:r[0-9]+]], [[ELEMS]]
+; CHECK: sub{{.*}} [[ELEMS]], #4
 ; CHECK: le lr, [[LOOP]]
-; CHECK: vctp.32 [[ELEMS]]
+; CHECK: vctp.32 [[ELEMS_OUT]]
 ; CHECK: vpsel
 ; CHECK: vaddv.u32 r0
 define dso_local i32 @mul_reduce_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32 %N) {
@@ -54,7 +55,17 @@ for.cond.cleanup: ; preds = %middle.block, %entr
   ret i32 %res.0.lcssa
 }
 
-; Function Attrs: norecurse nounwind readonly
+; CHECK-LABEL: mul_reduce_add_const
+; CHECK: dls lr
+; CHECK: [[LOOP:.LBB[0-9_]+]]:
+; CHECK: vctp.32 [[ELEMS:r[0-9]+]]
+; CHECK: vpst
+; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r0]
+; CHECK: mov [[ELEMS_OUT:r[0-9]+]], [[ELEMS]]
+; CHECK: sub{{.*}} [[ELEMS]], #4
+; CHECK: le lr, [[LOOP]]
+; CHECK: vctp.32 [[ELEMS_OUT]]
+; CHECK: vpsel
 define dso_local i32 @mul_reduce_add_const(i32* noalias nocapture readonly %a, i32 %b, i32 %N) {
 entry:
   %cmp6 = icmp eq i32 %N, 0
@@ -96,13 +107,14 @@ for.cond.cleanup: ; preds = %middle.block, %entr
 ; CHECK-LABEL: add_reduce_add_const
 ; CHECK: dls lr, lr
 ; CHECK: [[LOOP:.LBB[0-9_]+]]:
-; CHECK: subs [[ELEMS:r[0-9]+]], #4
-; CHECK: vctp.32 [[ELEMS]]
+; CHECK: vctp.32 [[ELEMS:r[0-9]+]]
 ; CHECK: vpst
 ; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r0]
+; CHECK: mov [[ELEMS_OUT:r[0-9]+]], [[ELEMS]]
+; CHECK: sub{{.*}} [[ELEMS]], #4
 ; CHECK: vadd.i32
 ; CHECK: le lr, [[LOOP]]
-; CHECK: vctp.32 [[ELEMS]]
+; CHECK: vctp.32 [[ELEMS_OUT]]
 ; CHECK: vpsel
 define dso_local i32 @add_reduce_add_const(i32* noalias nocapture readonly %a, i32 %b, i32 %N) {
 entry:
@@ -145,8 +157,8 @@ for.cond.cleanup: ; preds = %middle.block, %entr
 ; CHECK-LABEL: vector_mul_const
 ; CHECK: dls lr, lr
 ; CHECK: [[LOOP:.LBB[0-9_]+]]:
-; CHECK: subs [[ELEMS:r[0-9]+]], #4
-; CHECK: vctp.32 [[ELEMS]]
+; CHECK: vctp.32 [[ELEMS:r[0-9]+]]
+; CHECK: sub{{.*}} [[ELEMS]], #4
 ; CHECK: vpst
 ; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r1]
 ; CHECK: vmul.i32
@@ -192,8 +204,8 @@ for.cond.cleanup: ; preds = %vector.body, %entry
 ; CHECK-LABEL: vector_add_const
 ; CHECK: dls lr, lr
 ; CHECK: [[LOOP:.LBB[0-9_]+]]:
-; CHECK: subs [[ELEMS:r[0-9]+]], #4
-; CHECK: vctp.32 [[ELEMS]]
+; CHECK: vctp.32 [[ELEMS:r[0-9]+]]
+; CHECK: sub{{.*}} [[ELEMS]], #4
 ; CHECK: vpst
 ; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r1]
 ; CHECK: vadd.i32
diff --git a/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll b/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll
index 824f1d5790d..dbf40f60cbd 100644
--- a/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll
@@ -6,13 +6,13 @@
 ; CHECK: vector.body:
 ; CHECK-NOT: phi i32 [ 0, %vector.ph ]
 ; CHECK: [[ELTS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[SUB:%[^ ]+]], %vector.body ]
+; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[ELTS]])
 ; CHECK: [[SUB]] = sub i32 [[ELTS]], 4
-; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[SUB]])
 ; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]]
 ; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]],
 
 ; CHECK: middle.block:
-; CHECK: [[VCTP_CLONE:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[SUB]])
+; CHECK: [[VCTP_CLONE:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[ELTS]])
 ; CHECK: [[VPSEL:%[^ ]+]] = select <4 x i1> [[VCTP_CLONE]],
 ; CHECK: call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[VPSEL]])
-- 
2.50.1
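Editor's note (illustrative, not part of the patch): the sketch below shows the vector.body dataflow the pass now produces, reconstructed from the CHECK lines above for the <4 x i32> case. The function and value names (@sketch_copy, %elems, %pred, %remaining) are hypothetical, and the low-overhead-loop setup the pass also involves is omitted; the point is that the elements-remaining phi feeds the vctp directly, while the sub only defines the phi's back-edge value.

declare <4 x i1> @llvm.arm.vctp32(i32)
declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)
declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>)

define void @sketch_copy(i32* noalias %a, i32* readonly %b, i32 %N) {
vector.ph:
  br label %vector.body

vector.body:
  ; Elements still to be processed; starts at %N.
  %elems = phi i32 [ %N, %vector.ph ], [ %remaining, %vector.body ]
  %addr.a = phi i32* [ %a, %vector.ph ], [ %addr.a.next, %vector.body ]
  %addr.b = phi i32* [ %b, %vector.ph ], [ %addr.b.next, %vector.body ]
  ; The phi feeds the vctp directly, so the mask covers the current
  ; iteration's remaining elements.
  %pred = call <4 x i1> @llvm.arm.vctp32(i32 %elems)
  ; The sub only computes the phi's incoming value for the next
  ; iteration; before this patch it was (incorrectly) the vctp operand.
  %remaining = sub i32 %elems, 4
  %cast.b = bitcast i32* %addr.b to <4 x i32>*
  %wide = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %cast.b, i32 4, <4 x i1> %pred, <4 x i32> undef)
  %cast.a = bitcast i32* %addr.a to <4 x i32>*
  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %wide, <4 x i32>* %cast.a, i32 4, <4 x i1> %pred)
  %addr.a.next = getelementptr i32, i32* %addr.a, i32 4
  %addr.b.next = getelementptr i32, i32* %addr.b, i32 4
  %cmp = icmp sgt i32 %remaining, 0
  br i1 %cmp, label %vector.body, label %exit

exit:
  ret void
}

Had the freshly subtracted value been passed to the vctp instead (the old behaviour), the mask would have been computed from the next iteration's element count, under-predicating the current iteration.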