case 16: VCTPID = Intrinsic::arm_vctp8; break;
}
Function *VCTP = Intrinsic::getDeclaration(M, VCTPID);
- // TODO: This add likely already exists in the loop.
- Value *Remaining = Builder.CreateSub(Processed, Factor);
- Value *TailPredicate = Builder.CreateCall(VCTP, Remaining);
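+ // The vctp must be given the number of elements still to be processed at
+ // the start of this iteration (the value of the phi), so that the first
+ // iteration predicates on the full element count.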
+ Value *TailPredicate = Builder.CreateCall(VCTP, Processed);
Predicate->replaceAllUsesWith(TailPredicate);
NewPredicates[Predicate] = cast<Instruction>(TailPredicate);
// Add the incoming value to the new phi.
+ // TODO: This add likely already exists in the loop.
+ Value *Remaining = Builder.CreateSub(Processed, Factor);
Processed->addIncoming(Remaining, L->getLoopLatch());
LLVM_DEBUG(dbgs() << "TP: Insert processed elements phi: "
<< *Processed << "\n"
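+; Note: the vctp is now expected before the sub, taking the loop-carried
+; element count rather than the decremented value.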
; CHECK: vector.body:
; CHECK: %index = phi i32
; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
+; CHECK: [[VCTP:%[^ ]+]] = call <16 x i1> @llvm.arm.vctp8(i32 [[ELEMS]])
; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 16
-; CHECK: [[VCTP:%[^ ]+]] = call <16 x i1> @llvm.arm.vctp8(i32 [[REMAINING]])
; CHECK: [[LD0:%[^ ]+]] = tail call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* {{.*}}, i32 4, <16 x i1> [[VCTP]], <16 x i8> undef)
; CHECK: [[LD1:%[^ ]+]] = tail call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* {{.*}}, i32 4, <16 x i1> [[VCTP]], <16 x i8> undef)
; CHECK: tail call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> {{.*}}, <16 x i8>* {{.*}}, i32 4, <16 x i1> [[VCTP]])
; CHECK: vector.body:
; CHECK: %index = phi i32
; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
+; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.vctp16(i32 [[ELEMS]])
; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 8
-; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.vctp16(i32 [[REMAINING]])
; CHECK: [[LD0:%[^ ]+]] = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* {{.*}}, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef)
; CHECK: [[LD1:%[^ ]+]] = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* {{.*}}, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef)
; CHECK: tail call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> {{.*}}, <8 x i16>* {{.*}}, i32 4, <8 x i1> [[VCTP]])
; CHECK-LABEL: mul_v4i32
; CHECK: vector.body:
; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
+; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[ELEMS]])
; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 4
-; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[REMAINING]])
; CHECK: [[LD0:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
; CHECK: [[LD1:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
; CHECK: tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> {{.*}}, <4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]])
; CHECK: vector.body:
; CHECK: %index = phi i32
; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
+; CHECK: [[VCTP:%[^ ]+]] = call <2 x i1> @llvm.arm.vctp64(i32 [[ELEMS]])
; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 2
-; CHECK: [[VCTP:%[^ ]+]] = call <2 x i1> @llvm.arm.vctp64(i32 [[REMAINING]])
; CHECK: [[LD0:%[^ ]+]] = tail call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* {{.*}}, i32 4, <2 x i1> [[VCTP]], <2 x i64> undef)
; CHECK: tail call void @llvm.masked.store.v2i64.p0v2i64(<2 x i64> [[LD0]], <2 x i64>* {{.*}}, i32 4, <2 x i1> [[VCTP]])
define void @copy_v2i64(i64* %a, i64* %b, i32 %N) {
; CHECK: vector.body:
; CHECK: %index = phi i32
; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
+; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[ELEMS]])
; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 4
-; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[REMAINING]])
; CHECK: [[LD0:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
; CHECK: [[LD1:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
; CHECK: tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> {{.*}}, <4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]])
; One of the loads now uses an ult predicate (%wrong), so it keeps its
; original mask instead of the vctp.
; CHECK-LABEL: mismatch_load_pred
; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
+; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[ELEMS]])
; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 4
-; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[REMAINING]])
; CHECK: [[LD0:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
; CHECK: [[LD1:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> %wrong, <4 x i32> undef)
; CHECK: tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> {{.*}}, <4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]])
; CHECK-LABEL: mismatch_store_pred
; CHECK: %index = phi i32
; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
+; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[ELEMS]])
; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 4
-; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[REMAINING]])
; CHECK: [[LD0:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
; CHECK: [[LD1:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
; CHECK: tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> {{.*}}, <4 x i32>* {{.*}}, i32 4, <4 x i1> %wrong)
; CHECK-LABEL: vpsel_mul_reduce_add
; CHECK: dls lr, lr
; CHECK: [[LOOP:.LBB[0-9_]+]]:
-; CHECK: sub{{.*}} [[ELEMS:r[0-9]+]], #4
-; CHECK: vctp.32 [[ELEMS]]
+; CHECK: vctp.32 [[ELEMS:r[0-9]+]]
+; CHECK: mov [[ELEMS_OUT:r[0-9]+]], [[ELEMS]]
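+; The element count is copied before it is decremented so that the middle
+; block can rebuild the final iteration's mask with vctp.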
; CHECK: vstr p0, [sp
; CHECK: vpstt
; CHECK-NEXT: vldrwt.u32
; CHECK: vldr p0, [sp
; CHECK: vpst
; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r0]
+; CHECK: sub{{.*}} [[ELEMS]], [[ELEMS_OUT]], #4
; CHECK: le lr, [[LOOP]]
-; CHECK: vctp.32 [[ELEMS]]
+; CHECK: vctp.32 [[ELEMS_OUT]]
; CHECK-NEXT: vpsel
; CHECK-NEXT: vaddv.u32
define dso_local i32 @vpsel_mul_reduce_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture readonly %c, i32 %N) {
; CHECK-LABEL: vpsel_mul_reduce_add_2
; CHECK: dls lr, lr
; CHECK: [[LOOP:.LBB[0-9_]+]]:
-; CHECK: sub{{.*}} [[ELEMS:r[0-9]+]], #4
-; CHECK: vctp.32 [[ELEMS]]
+; CHECK: vctp.32 [[ELEMS:r[0-9]+]]
+; CHECK: mov [[ELEMS_OUT:r[0-9]+]], [[ELEMS]]
; CHECK: vstr p0, [sp
; CHECK: vpstt
; CHECK-NEXT: vldrwt.u32
; CHECK: vldr p0, [sp
; CHECK: vpst
; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r0]
+; CHECK: sub{{.*}} [[ELEMS]], [[ELEMS_OUT]], #4
; CHECK: le lr, [[LOOP]]
-; CHECK: vctp.32 [[ELEMS]]
+; CHECK: vctp.32 [[ELEMS_OUT]]
; CHECK-NEXT: vpsel
; CHECK-NEXT: vaddv.u32
define dso_local i32 @vpsel_mul_reduce_add_2(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b,
; CHECK-LABEL: and_mul_reduce_add
; CHECK: dls lr, lr
; CHECK: [[LOOP:.LBB[0-9_]+]]:
-; CHECK: sub{{.*}} [[ELEMS:r[0-9]+]],{{.*}}#4
-; CHECK: vctp.32 [[ELEMS]]
+; CHECK: vctp.32 [[ELEMS:r[0-9]+]]
; CHECK: vpstt
; CHECK-NEXT: vldrwt.u32
; CHECK-NEXT: vldrwt.u32
+; CHECK: mov [[ELEMS_OUT:r[0-9]+]], [[ELEMS]]
; CHECK: vpsttt
; CHECK-NEXT: vcmpt.i32 eq, {{.*}}, zr
; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r3]
; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r2]
+; CHECK: sub{{.*}} [[ELEMS]],{{.*}}#4
; CHECK: le lr, [[LOOP]]
-; CHECK: vctp.32 [[ELEMS]]
+; CHECK: vctp.32 [[ELEMS_OUT]]
; CHECK: vpsel
define dso_local i32 @and_mul_reduce_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b,
i32* noalias nocapture readonly %c, i32* noalias nocapture readonly %d, i32 %N) {
; CHECK-LABEL: or_mul_reduce_add
; CHECK: dls lr, lr
; CHECK: [[LOOP:.LBB[0-9_]+]]:
-; CHECK: sub{{.*}} [[ELEMS:r[0-9]+]],{{.*}}#4
-; CHECK: vctp.32 [[ELEMS]]
+; CHECK: vctp.32 [[ELEMS:r[0-9]+]]
; CHECK: vstr p0, [sp
+; CHECK: mov [[ELEMS_OUT:r[0-9]+]], [[ELEMS]]
; CHECK: vpstt
; CHECK-NEXT: vldrwt.u32
; CHECK-NEXT: vldrwt.u32
; CHECK: vldr p0, [sp
; CHECK: vmrs [[VCTP:r[0-9]+]], p0
; CHECK: orr{{.*}} [[VCMP]], [[VCTP]]
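+; The vcmp and vctp masks are combined through GPRs: vmrs moves p0 out,
+; orr merges the two, and vmsr moves the result back into p0.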
+; CHECK: sub{{.*}} [[ELEMS:r[0-9]+]], [[ELEMS_OUT]], #4
; CHECK-NEXT: vmsr p0
; CHECK-NEXT: vpstt
; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r3]
; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r2]
; CHECK: le lr, [[LOOP]]
-; CHECK: vctp.32 [[ELEMS]]
+; CHECK: vctp.32 [[ELEMS_OUT]]
; CHECK: vpsel
define dso_local i32 @or_mul_reduce_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b,
i32* noalias nocapture readonly %c, i32* noalias nocapture readonly %d, i32 %N) {
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB4_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: subs r2, #4
-; CHECK-NEXT: vmov q1, q0
; CHECK-NEXT: vctp.32 r2
+; CHECK-NEXT: mov r3, r2
; CHECK-NEXT: vpst
; CHECK-NEXT: vldrwt.u32 q2, [r1]
; CHECK-NEXT: adds r1, #16
+; CHECK-NEXT: subs r2, #4
+; CHECK-NEXT: vmov q1, q0
; CHECK-NEXT: vmla.u32 q0, q2, r0
; CHECK-NEXT: le lr, .LBB4_1
; CHECK-NEXT: @ %bb.2: @ %middle.block
-; CHECK-NEXT: vctp.32 r2
+; CHECK-NEXT: vctp.32 r3
; CHECK-NEXT: vpsel q0, q0, q1
; CHECK-NEXT: vaddv.u32 r0, q0
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vadd.i32 q4, q1, r4
; CHECK-NEXT: @ implicit-def: $q5
-; CHECK-NEXT: sub.w r12, r12, #4
; CHECK-NEXT: vcmp.u32 cs, q0, q4
; CHECK-NEXT: @ implicit-def: $q4
; CHECK-NEXT: vmrs r6, p0
; CHECK-NEXT: vpst
; CHECK-NEXT: vstrwt.32 q4, [r3]
; CHECK-NEXT: adds r3, #16
+; CHECK-NEXT: sub.w r12, r12, #4
; CHECK-NEXT: le lr, .LBB5_5
; CHECK-NEXT: b .LBB5_12
; CHECK-NEXT: .LBB5_6: @ %for.body.preheader.new
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vadd.i32 q2, q1, r4
; CHECK-NEXT: @ implicit-def: $q3
-; CHECK-NEXT: sub.w r12, r12, #4
+; CHECK-NEXT: adds r4, #4
; CHECK-NEXT: vcmp.u32 cs, q0, q2
; CHECK-NEXT: @ implicit-def: $q2
-; CHECK-NEXT: adds r4, #4
; CHECK-NEXT: vmrs r6, p0
; CHECK-NEXT: and r5, r6, #1
; CHECK-NEXT: rsbs r7, r5, #0
; CHECK-NEXT: vpst
; CHECK-NEXT: vstrwt.32 q2, [r3]
; CHECK-NEXT: adds r3, #16
+; CHECK-NEXT: sub.w r12, r12, #4
; CHECK-NEXT: le lr, .LBB6_2
; CHECK-NEXT: .LBB6_3: @ %for.cond.cleanup
; CHECK-NEXT: add sp, #8
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vadd.i32 q4, q1, r4
; CHECK-NEXT: @ implicit-def: $q5
-; CHECK-NEXT: sub.w r12, r12, #4
; CHECK-NEXT: vcmp.u32 cs, q0, q4
; CHECK-NEXT: @ implicit-def: $q4
; CHECK-NEXT: vmrs r6, p0
; CHECK-NEXT: vpst
; CHECK-NEXT: vstrwt.32 q4, [r3]
; CHECK-NEXT: adds r3, #16
+; CHECK-NEXT: sub.w r12, r12, #4
; CHECK-NEXT: le lr, .LBB7_5
; CHECK-NEXT: b .LBB7_12
; CHECK-NEXT: .LBB7_6: @ %for.body.preheader.new
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vadd.i32 q2, q1, r4
; CHECK-NEXT: @ implicit-def: $q3
-; CHECK-NEXT: sub.w r12, r12, #4
+; CHECK-NEXT: adds r4, #4
; CHECK-NEXT: vcmp.u32 cs, q0, q2
; CHECK-NEXT: @ implicit-def: $q2
-; CHECK-NEXT: adds r4, #4
; CHECK-NEXT: vmrs r6, p0
; CHECK-NEXT: and r5, r6, #1
; CHECK-NEXT: rsbs r7, r5, #0
; CHECK-NEXT: vpst
; CHECK-NEXT: vstrwt.32 q2, [r3]
; CHECK-NEXT: adds r3, #16
+; CHECK-NEXT: sub.w r12, r12, #4
; CHECK-NEXT: le lr, .LBB8_2
; CHECK-NEXT: .LBB8_3: @ %for.cond.cleanup
; CHECK-NEXT: add sp, #8
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB9_5: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: sub.w r12, r12, #4
; CHECK-NEXT: vctp.32 r12
+; CHECK-NEXT: sub.w r12, r12, #4
; CHECK-NEXT: vpstt
; CHECK-NEXT: vldrwt.u32 q0, [r0]
; CHECK-NEXT: vldrwt.u32 q1, [r1]
; CHECK: phi
; CHECK: phi
; CHECK: [[IV:%[^ ]+]] = phi i32 [ %N, %for.cond1.preheader.us ], [ [[REM:%[^ ]+]], %vector.body ]
+; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[IV]])
; CHECK: [[REM]] = sub i32 [[IV]], 4
-; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[REM]])
; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
define void @mat_vec_i32(i32** nocapture readonly %A, i32* nocapture readonly %B, i32* noalias nocapture %C, i32 %N) {
; CHECK-LABEL: expand_v8i16_v4i32
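+; The v8i16 load mask is converted to a vctp16; the v4i32 store keeps its
+; icmp ule predicate and is not converted.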
; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[ELEMS_REM:%[^ ]+]], %vector.body ]
+; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.vctp16(i32 [[ELEMS]])
; CHECK: [[ELEMS_REM]] = sub i32 [[ELEMS]], 8
-; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.vctp16(i32 [[ELEMS_REM]])
; CHECK: tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* {{.*}}, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef)
; CHECK: %store.pred = icmp ule <4 x i32> %induction.store
; CHECK: tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> {{.*}}, <4 x i32>* {{.*}}, i32 4, <4 x i1> %store.pred)
; CHECK: phi <8 x i16> [ zeroinitializer, %entry ]
; CHECK: phi i32
; CHECK: [[PHI:%[^ ]+]] = phi i32 [ %N, %entry ], [ [[ELEMS:%[^ ]+]], %vector.body ]
+; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.vctp16(i32 [[PHI]])
; CHECK: [[ELEMS]] = sub i32 [[PHI]], 8
-; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.vctp16(i32 [[ELEMS]])
; CHECK: call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp4, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef)
; CHECK: call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp6, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef)
define i16 @reduction_i32(i16* nocapture readonly %A, i16* nocapture readonly %B, i32 %N) {
; CHECK: phi <8 x i16> [ zeroinitializer, %entry ]
; CHECK: phi i32
; CHECK: [[PHI:%[^ ]+]] = phi i32 [ %N, %entry ], [ [[ELEMS:%[^ ]+]], %vector.body ]
+; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.vctp16(i32 [[PHI]])
; CHECK: [[ELEMS]] = sub i32 [[PHI]], 8
-; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.vctp16(i32 [[ELEMS]])
; CHECK: call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp4, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef)
define i16 @reduction_i32_with_scalar(i16* nocapture readonly %A, i16 %B, i32 %N) local_unnamed_addr {
entry:
; CHECK-LABEL: mul_reduce_add
; CHECK: dls lr,
; CHECK: [[LOOP:.LBB[0-9_]+]]:
-; CHECK: sub{{.*}} [[ELEMS:r[0-9]+]], #4
-; CHECK: vctp.32 [[ELEMS]]
+; CHECK: vctp.32 [[ELEMS:r[0-9]+]]
; CHECK: vpstt
; CHECK-NEXT: vldrwt.u32
; CHECK-NEXT: vldrwt.u32
+; CHECK: mov [[ELEMS_OUT:r[0-9]+]], [[ELEMS]]
+; CHECK: sub{{.*}} [[ELEMS]], #4
; CHECK: le lr, [[LOOP]]
-; CHECK: vctp.32 [[ELEMS]]
+; CHECK: vctp.32 [[ELEMS_OUT]]
; CHECK: vpsel
; CHECK: vaddv.u32 r0
define dso_local i32 @mul_reduce_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32 %N) {
ret i32 %res.0.lcssa
}
-; Function Attrs: norecurse nounwind readonly
+; CHECK-LABEL: mul_reduce_add_const
+; CHECK: dls lr
+; CHECK: [[LOOP:.LBB[0-9_]+]]:
+; CHECK: vctp.32 [[ELEMS:r[0-9]+]]
+; CHECK: vpst
+; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r0]
+; CHECK: mov [[ELEMS_OUT:r[0-9]+]], [[ELEMS]]
+; CHECK: sub{{.*}} [[ELEMS]], #4
+; CHECK: le lr, [[LOOP]]
+; CHECK: vctp.32 [[ELEMS_OUT]]
+; CHECK: vpsel
define dso_local i32 @mul_reduce_add_const(i32* noalias nocapture readonly %a, i32 %b, i32 %N) {
entry:
%cmp6 = icmp eq i32 %N, 0
; CHECK-LABEL: add_reduce_add_const
; CHECK: dls lr, lr
; CHECK: [[LOOP:.LBB[0-9_]+]]:
-; CHECK: subs [[ELEMS:r[0-9]+]], #4
-; CHECK: vctp.32 [[ELEMS]]
+; CHECK: vctp.32 [[ELEMS:r[0-9]+]]
; CHECK: vpst
; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r0]
+; CHECK: mov [[ELEMS_OUT:r[0-9]+]], [[ELEMS]]
+; CHECK: sub{{.*}} [[ELEMS]], #4
; CHECK: vadd.i32
; CHECK: le lr, [[LOOP]]
-; CHECK: vctp.32 [[ELEMS]]
+; CHECK: vctp.32 [[ELEMS_OUT]]
; CHECK: vpsel
define dso_local i32 @add_reduce_add_const(i32* noalias nocapture readonly %a, i32 %b, i32 %N) {
entry:
; CHECK-LABEL: vector_mul_const
; CHECK: dls lr, lr
; CHECK: [[LOOP:.LBB[0-9_]+]]:
-; CHECK: subs [[ELEMS:r[0-9]+]], #4
-; CHECK: vctp.32 [[ELEMS]]
+; CHECK: vctp.32 [[ELEMS:r[0-9]+]]
+; CHECK: sub{{.*}} [[ELEMS]], #4
; CHECK: vpst
; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r1]
; CHECK: vmul.i32
; CHECK-LABEL: vector_add_const
; CHECK: dls lr, lr
; CHECK: [[LOOP:.LBB[0-9_]+]]:
-; CHECK: subs [[ELEMS:r[0-9]+]], #4
-; CHECK: vctp.32 [[ELEMS]]
+; CHECK: vctp.32 [[ELEMS:r[0-9]+]]
+; CHECK: sub{{.*}} [[ELEMS]], #4
; CHECK: vpst
; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r1]
; CHECK: vadd.i32
; CHECK: vector.body:
; CHECK-NOT: phi i32 [ 0, %vector.ph ]
; CHECK: [[ELTS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[SUB:%[^ ]+]], %vector.body ]
+; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[ELTS]])
; CHECK: [[SUB]] = sub i32 [[ELTS]], 4
-; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[SUB]])
; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]]
; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]],
; CHECK: middle.block:
-; CHECK: [[VCTP_CLONE:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[SUB]])
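+; The cloned vctp now takes the element count that was live into the final
+; iteration, matching the mask used inside the loop.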
+; CHECK: [[VCTP_CLONE:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[ELTS]])
; CHECK: [[VPSEL:%[^ ]+]] = select <4 x i1> [[VCTP_CLONE]],
; CHECK: call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[VPSEL]])