[ARM][MVE] Change VCTP operand

author Sam Parker <sam.parker@arm.com>
Mon, 30 Sep 2019 08:03:23 +0000 (08:03 +0000)
committer Sam Parker <sam.parker@arm.com>
Mon, 30 Sep 2019 08:03:23 +0000 (08:03 +0000)
diff --git a/lib/Target/ARM/MVETailPredication.cpp b/lib/Target/ARM/MVETailPredication.cpp

index 844eafbcb38e495604811b6ea9db447f8c21ccca..4db8ab17c49b563ba6fc558a32942f2aba2cc4dc 100644 (file)
--- a/lib/Target/ARM/MVETailPredication.cpp
+++ b/lib/Target/ARM/MVETailPredication.cpp
@@ -491,13 +491,13 @@ bool MVETailPredication::TryConvert(Value *TripCount) {
      case 16: VCTPID = Intrinsic::arm_vctp8; break;
      }
      Function *VCTP = Intrinsic::getDeclaration(M, VCTPID);
-    // TODO: This add likely already exists in the loop.
-    Value *Remaining = Builder.CreateSub(Processed, Factor);
-    Value *TailPredicate = Builder.CreateCall(VCTP, Remaining);
+    Value *TailPredicate = Builder.CreateCall(VCTP, Processed);
      Predicate->replaceAllUsesWith(TailPredicate);
      NewPredicates[Predicate] = cast<Instruction>(TailPredicate);
  
      // Add the incoming value to the new phi.
+    // TODO: This add likely already exists in the loop.
+    Value *Remaining = Builder.CreateSub(Processed, Factor);
      Processed->addIncoming(Remaining, L->getLoopLatch());
      LLVM_DEBUG(dbgs() << "TP: Insert processed elements phi: "
                 << *Processed << "\n"
diff --git a/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll b/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll

index f01e6ab19184b52fe4e6efc5b0677f7d5c0ef691..79c81ca7a449c0c3a4af8a88293a5e2f348c95d3 100644 (file)
--- a/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll
@@ -4,8 +4,8 @@
  ; CHECK: vector.body:
  ; CHECK: %index = phi i32
  ; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
+; CHECK: [[VCTP:%[^ ]+]] = call <16 x i1> @llvm.arm.vctp8(i32 [[ELEMS]])
  ; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 16
-; CHECK: [[VCTP:%[^ ]+]] = call <16 x i1> @llvm.arm.vctp8(i32 [[REMAINING]])
  ; CHECK: [[LD0:%[^ ]+]] = tail call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* {{.*}}, i32 4, <16 x i1> [[VCTP]], <16 x i8> undef)
  ; CHECK: [[LD1:%[^ ]+]] = tail call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* {{.*}}, i32 4, <16 x i1> [[VCTP]], <16 x i8> undef)
  ; CHECK: tail call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> {{.*}}, <16 x i8>* {{.*}}, i32 4, <16 x i1> [[VCTP]])
@@ -57,8 +57,8 @@ for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ; CHECK: vector.body:
  ; CHECK: %index = phi i32
  ; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
+; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.vctp16(i32 [[ELEMS]])
  ; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 8
-; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.vctp16(i32 [[REMAINING]])
  ; CHECK: [[LD0:%[^ ]+]] = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* {{.*}}, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef)
  ; CHECK: [[LD1:%[^ ]+]] = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* {{.*}}, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef)
  ; CHECK: tail call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> {{.*}}, <8 x i16>* {{.*}}, i32 4, <8 x i1> [[VCTP]])
@@ -109,8 +109,8 @@ for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ; CHECK-LABEL: mul_v4i32
  ; CHECK: vector.body:
  ; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
+; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[ELEMS]])
  ; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 4
-; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[REMAINING]])
  ; CHECK: [[LD0:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
  ; CHECK: [[LD1:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
  ; CHECK: tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> {{.*}}, <4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]])
@@ -162,8 +162,8 @@ for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ; CHECK: vector.body:
  ; CHECK: %index = phi i32
  ; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
+; CHECK: [[VCTP:%[^ ]+]] = call <2 x i1> @llvm.arm.vctp64(i32 [[ELEMS]])
  ; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 2
-; CHECK: [[VCTP:%[^ ]+]] = call <2 x i1> @llvm.arm.vctp64(i32 [[REMAINING]])
  ; CHECK: [[LD0:%[^ ]+]] = tail call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* {{.*}}, i32 4, <2 x i1> [[VCTP]], <2 x i64> undef)
  ; CHECK: tail call void @llvm.masked.store.v2i64.p0v2i64(<2 x i64> [[LD0]], <2 x i64>* {{.*}}, i32 4, <2 x i1> [[VCTP]])
  define void @copy_v2i64(i64* %a, i64* %b, i32 %N) {
@@ -210,8 +210,8 @@ for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ; CHECK: vector.body:
  ; CHECK: %index = phi i32
  ; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
+; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[ELEMS]])
  ; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 4
-; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[REMAINING]])
  ; CHECK: [[LD0:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
  ; CHECK: [[LD1:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
  ; CHECK: tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> {{.*}}, <4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]])
@@ -268,8 +268,8 @@ for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ; One of the loads now uses ult predicate.
  ; CHECK-LABEL: mismatch_load_pred
  ; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
+; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[ELEMS]])
  ; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 4
-; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[REMAINING]])
  ; CHECK: [[LD0:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
  ; CHECK: [[LD1:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> %wrong, <4 x i32> undef)
  ; CHECK: tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> {{.*}}, <4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]])
@@ -322,8 +322,8 @@ for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ; CHECK-LABEL: mismatch_store_pred
  ; CHECK: %index = phi i32
  ; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
+; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[ELEMS]])
  ; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 4
-; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[REMAINING]])
  ; CHECK: [[LD0:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
  ; CHECK: [[LD1:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
  ; CHECK: tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> {{.*}}, <4 x i32>* {{.*}}, i32 4, <4 x i1> %wrong)
diff --git a/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll b/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll

index d701e0f1b57e307b04b7aea770189c574c8eaa60..5900dd9ac66a9aa3f75231c05beed6c6800ce90e 100644 (file)
--- a/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll
@@ -3,8 +3,8 @@
  ; CHECK-LABEL: vpsel_mul_reduce_add
  ; CHECK:      dls lr, lr
  ; CHECK:  [[LOOP:.LBB[0-9_]+]]:
-; CHECK:      sub{{.*}} [[ELEMS:r[0-9]+]], #4
-; CHECK:      vctp.32 [[ELEMS]]
+; CHECK:      vctp.32 [[ELEMS:r[0-9]+]]
+; CHECK:      mov [[ELEMS_OUT:r[0-9]+]], [[ELEMS]]
  ; CHECK:      vstr p0, [sp
  ; CHECK:      vpstt    
  ; CHECK-NEXT: vldrwt.u32
@@ -14,8 +14,9 @@
  ; CHECK:      vldr p0, [sp
  ; CHECK:      vpst     
  ; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r0]
+; CHECK:      sub{{.*}} [[ELEMS]], [[ELEMS_OUT]], #4
  ; CHECK:      le lr, [[LOOP]]
-; CHECK:      vctp.32  [[ELEMS]]
+; CHECK:      vctp.32  [[ELEMS_OUT]]
  ; CHECK-NEXT: vpsel
  ; CHECK-NEXT: vaddv.u32
  define dso_local i32 @vpsel_mul_reduce_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture readonly %c, i32 %N) {
@@ -71,8 +72,8 @@ for.cond.cleanup:                                 ; preds = %middle.block, %entr
  ; CHECK-LABEL: vpsel_mul_reduce_add_2
  ; CHECK:      dls lr, lr
  ; CHECK:  [[LOOP:.LBB[0-9_]+]]:
-; CHECK:      sub{{.*}} [[ELEMS:r[0-9]+]], #4
-; CHECK:      vctp.32 [[ELEMS]]
+; CHECK:      vctp.32 [[ELEMS:r[0-9]+]]
+; CHECK:      mov [[ELEMS_OUT:r[0-9]+]], [[ELEMS]]
  ; CHECK:      vstr p0, [sp
  ; CHECK:      vpstt
  ; CHECK-NEXT: vldrwt.u32
@@ -85,8 +86,9 @@ for.cond.cleanup:                                 ; preds = %middle.block, %entr
  ; CHECK:      vldr p0, [sp
  ; CHECK:      vpst     
  ; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r0]
+; CHECK:      sub{{.*}} [[ELEMS]], [[ELEMS_OUT]], #4
  ; CHECK:      le lr, [[LOOP]]
-; CHECK:      vctp.32  [[ELEMS]]
+; CHECK:      vctp.32  [[ELEMS_OUT]]
  ; CHECK-NEXT: vpsel
  ; CHECK-NEXT: vaddv.u32
  define dso_local i32 @vpsel_mul_reduce_add_2(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b,
@@ -147,17 +149,18 @@ for.cond.cleanup:                                 ; preds = %middle.block, %entr
  ; CHECK-LABEL: and_mul_reduce_add
  ; CHECK:      dls lr, lr
  ; CHECK:  [[LOOP:.LBB[0-9_]+]]:
-; CHECK:      sub{{.*}} [[ELEMS:r[0-9]+]],{{.*}}#4
-; CHECK:      vctp.32 [[ELEMS]]
+; CHECK:      vctp.32 [[ELEMS:r[0-9]+]]
  ; CHECK:      vpstt    
  ; CHECK-NEXT: vldrwt.u32
  ; CHECK-NEXT: vldrwt.u32
+; CHECK:      mov [[ELEMS_OUT:r[0-9]+]], [[ELEMS]]
  ; CHECK:      vpsttt
  ; CHECK-NEXT: vcmpt.i32        eq, {{.*}}, zr
  ; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r3]
  ; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r2]
+; CHECK:      sub{{.*}} [[ELEMS]],{{.*}}#4
  ; CHECK:      le lr, [[LOOP]]
-; CHECK:      vctp.32 [[ELEMS]]
+; CHECK:      vctp.32 [[ELEMS_OUT]]
  ; CHECK:      vpsel
  define dso_local i32 @and_mul_reduce_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b,
                                           i32* noalias nocapture readonly %c, i32* noalias nocapture readonly %d, i32 %N) {
@@ -215,9 +218,9 @@ for.cond.cleanup:                                 ; preds = %middle.block, %entr
  ; CHECK-LABEL: or_mul_reduce_add
  ; CHECK:      dls lr, lr
  ; CHECK:  [[LOOP:.LBB[0-9_]+]]:
-; CHECK:      sub{{.*}} [[ELEMS:r[0-9]+]],{{.*}}#4
-; CHECK:      vctp.32 [[ELEMS]]
+; CHECK:      vctp.32 [[ELEMS:r[0-9]+]]
  ; CHECK:      vstr p0, [sp
+; CHECK:      mov [[ELEMS_OUT:r[0-9]+]], [[ELEMS]]
  ; CHECK:      vpstt    
  ; CHECK-NEXT: vldrwt.u32
  ; CHECK-NEXT: vldrwt.u32
@@ -226,12 +229,13 @@ for.cond.cleanup:                                 ; preds = %middle.block, %entr
  ; CHECK:      vldr p0, [sp
  ; CHECK:      vmrs [[VCTP:r[0-9]+]], p0
  ; CHECK:      orr{{.*}} [[VCMP]], [[VCTP]]
+; CHECK:      sub{{.*}} [[ELEMS:r[0-9]+]], [[ELEMS_OUT]], #4
  ; CHECK-NEXT: vmsr p0
  ; CHECK-NEXT: vpstt
  ; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r3]
  ; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r2]
  ; CHECK:      le lr, [[LOOP]]
-; CHECK:      vctp.32 [[ELEMS]]
+; CHECK:      vctp.32 [[ELEMS_OUT]]
  ; CHECK:      vpsel
  define dso_local i32 @or_mul_reduce_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b,
                                          i32* noalias nocapture readonly %c, i32* noalias nocapture readonly %d, i32 %N) {
diff --git a/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll b/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll

index 45ca7ce517261b9e96e4dbffe6432c26cfddfc3b..17f9d26ec033518659c1def6547d51b703e5d435 100644 (file)
--- a/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll
@@ -500,16 +500,17 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_int(i32 %a, i32* nocapture readonly
  ; CHECK-NEXT:    dls lr, lr
  ; CHECK-NEXT:  .LBB4_1: @ %vector.body
  ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    subs r2, #4
-; CHECK-NEXT:    vmov q1, q0
  ; CHECK-NEXT:    vctp.32 r2
+; CHECK-NEXT:    mov r3, r2
  ; CHECK-NEXT:    vpst
  ; CHECK-NEXT:    vldrwt.u32 q2, [r1]
  ; CHECK-NEXT:    adds r1, #16
+; CHECK-NEXT:    subs r2, #4
+; CHECK-NEXT:    vmov q1, q0
  ; CHECK-NEXT:    vmla.u32 q0, q2, r0
  ; CHECK-NEXT:    le lr, .LBB4_1
  ; CHECK-NEXT:  @ %bb.2: @ %middle.block
-; CHECK-NEXT:    vctp.32 r2
+; CHECK-NEXT:    vctp.32 r3
  ; CHECK-NEXT:    vpsel q0, q0, q1
  ; CHECK-NEXT:    vaddv.u32 r0, q0
  ; CHECK-NEXT:    pop {r7, pc}
@@ -607,7 +608,6 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_char(i8* nocapture readonly
  ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
  ; CHECK-NEXT:    vadd.i32 q4, q1, r4
  ; CHECK-NEXT:    @ implicit-def: $q5
-; CHECK-NEXT:    sub.w r12, r12, #4
  ; CHECK-NEXT:    vcmp.u32 cs, q0, q4
  ; CHECK-NEXT:    @ implicit-def: $q4
  ; CHECK-NEXT:    vmrs r6, p0
@@ -681,6 +681,7 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_char(i8* nocapture readonly
  ; CHECK-NEXT:    vpst
  ; CHECK-NEXT:    vstrwt.32 q4, [r3]
  ; CHECK-NEXT:    adds r3, #16
+; CHECK-NEXT:    sub.w r12, r12, #4
  ; CHECK-NEXT:    le lr, .LBB5_5
  ; CHECK-NEXT:    b .LBB5_12
  ; CHECK-NEXT:  .LBB5_6: @ %for.body.preheader.new
@@ -903,10 +904,9 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_short(i16* nocapture readon
  ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
  ; CHECK-NEXT:    vadd.i32 q2, q1, r4
  ; CHECK-NEXT:    @ implicit-def: $q3
-; CHECK-NEXT:    sub.w r12, r12, #4
+; CHECK-NEXT:    adds r4, #4
  ; CHECK-NEXT:    vcmp.u32 cs, q0, q2
  ; CHECK-NEXT:    @ implicit-def: $q2
-; CHECK-NEXT:    adds r4, #4
  ; CHECK-NEXT:    vmrs r6, p0
  ; CHECK-NEXT:    and r5, r6, #1
  ; CHECK-NEXT:    rsbs r7, r5, #0
@@ -977,6 +977,7 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_short(i16* nocapture readon
  ; CHECK-NEXT:    vpst
  ; CHECK-NEXT:    vstrwt.32 q2, [r3]
  ; CHECK-NEXT:    adds r3, #16
+; CHECK-NEXT:    sub.w r12, r12, #4
  ; CHECK-NEXT:    le lr, .LBB6_2
  ; CHECK-NEXT:  .LBB6_3: @ %for.cond.cleanup
  ; CHECK-NEXT:    add sp, #8
@@ -1084,7 +1085,6 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_uchar(i8* nocapture readonl
  ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
  ; CHECK-NEXT:    vadd.i32 q4, q1, r4
  ; CHECK-NEXT:    @ implicit-def: $q5
-; CHECK-NEXT:    sub.w r12, r12, #4
  ; CHECK-NEXT:    vcmp.u32 cs, q0, q4
  ; CHECK-NEXT:    @ implicit-def: $q4
  ; CHECK-NEXT:    vmrs r6, p0
@@ -1158,6 +1158,7 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_uchar(i8* nocapture readonl
  ; CHECK-NEXT:    vpst
  ; CHECK-NEXT:    vstrwt.32 q4, [r3]
  ; CHECK-NEXT:    adds r3, #16
+; CHECK-NEXT:    sub.w r12, r12, #4
  ; CHECK-NEXT:    le lr, .LBB7_5
  ; CHECK-NEXT:    b .LBB7_12
  ; CHECK-NEXT:  .LBB7_6: @ %for.body.preheader.new
@@ -1380,10 +1381,9 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_ushort(i16* nocapture reado
  ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
  ; CHECK-NEXT:    vadd.i32 q2, q1, r4
  ; CHECK-NEXT:    @ implicit-def: $q3
-; CHECK-NEXT:    sub.w r12, r12, #4
+; CHECK-NEXT:    adds r4, #4
  ; CHECK-NEXT:    vcmp.u32 cs, q0, q2
  ; CHECK-NEXT:    @ implicit-def: $q2
-; CHECK-NEXT:    adds r4, #4
  ; CHECK-NEXT:    vmrs r6, p0
  ; CHECK-NEXT:    and r5, r6, #1
  ; CHECK-NEXT:    rsbs r7, r5, #0
@@ -1454,6 +1454,7 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_ushort(i16* nocapture reado
  ; CHECK-NEXT:    vpst
  ; CHECK-NEXT:    vstrwt.32 q2, [r3]
  ; CHECK-NEXT:    adds r3, #16
+; CHECK-NEXT:    sub.w r12, r12, #4
  ; CHECK-NEXT:    le lr, .LBB8_2
  ; CHECK-NEXT:  .LBB8_3: @ %for.cond.cleanup
  ; CHECK-NEXT:    add sp, #8
@@ -1550,8 +1551,8 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_int(i32* nocapture readonly
  ; CHECK-NEXT:    dls lr, lr
  ; CHECK-NEXT:  .LBB9_5: @ %vector.body
  ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    sub.w r12, r12, #4
  ; CHECK-NEXT:    vctp.32 r12
+; CHECK-NEXT:    sub.w r12, r12, #4
  ; CHECK-NEXT:    vpstt
  ; CHECK-NEXT:    vldrwt.u32 q0, [r0]
  ; CHECK-NEXT:    vldrwt.u32 q1, [r1]
diff --git a/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll b/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll

index a5e922858f2d0caae1e8c09d47bc8fb0200fa3e7..6d5516249e239311cee6ee864210e1060f0b86d3 100644 (file)
--- a/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll
@@ -69,8 +69,8 @@ for.cond.cleanup:                                 ; preds = %middle.block, %entr
  ; CHECK: phi
  ; CHECK: phi
  ; CHECK: [[IV:%[^ ]+]] = phi i32 [ %N, %for.cond1.preheader.us ], [ [[REM:%[^ ]+]], %vector.body ]
+; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[IV]])
  ; CHECK: [[REM]] = sub i32 [[IV]], 4
-; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[REM]])
  ; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
  ; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
  define void @mat_vec_i32(i32** nocapture readonly %A, i32* nocapture readonly %B, i32* noalias nocapture %C, i32 %N) {
diff --git a/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-widen.ll b/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-widen.ll

index c1b957b9657b224ca78e721e0b484f4b3f9613f5..70e272ffc0dce15e06f37c1b4da91912e3234ecf 100644 (file)
--- a/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-widen.ll
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-widen.ll
@@ -50,8 +50,8 @@ for.cond.cleanup:                                 ; preds = %vector.body, %entry
  
  ; CHECK-LABEL: expand_v8i16_v4i32
  ; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[ELEMS_REM:%[^ ]+]], %vector.body ]
+; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.vctp16(i32 [[ELEMS]])
  ; CHECK: [[ELEMS_REM]] = sub i32 [[ELEMS]], 8
-; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.vctp16(i32 [[ELEMS_REM]])
  ; CHECK: tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* {{.*}}, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef)
  ; CHECK: %store.pred = icmp ule <4 x i32> %induction.store
  ; CHECK: tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> {{.*}}, <4 x i32>* {{.*}}, i32 4, <4 x i1> %store.pred)
diff --git a/test/CodeGen/Thumb2/LowOverheadLoops/tail-reduce.ll b/test/CodeGen/Thumb2/LowOverheadLoops/tail-reduce.ll

index b30806001c1aaef6af31ee362f1ee4c729f14e14..7cdd28fd0f3cfb2b79bd52a7eb479769690475a3 100644 (file)
--- a/test/CodeGen/Thumb2/LowOverheadLoops/tail-reduce.ll
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/tail-reduce.ll
@@ -5,8 +5,8 @@
  ; CHECK: phi <8 x i16> [ zeroinitializer, %entry ]
  ; CHECK: phi i32
  ; CHECK: [[PHI:%[^ ]+]] = phi i32 [ %N, %entry ], [ [[ELEMS:%[^ ]+]], %vector.body ]
+; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.vctp16(i32 [[PHI]])
  ; CHECK: [[ELEMS]] = sub i32 [[PHI]], 8
-; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.vctp16(i32 [[ELEMS]])
  ; CHECK: call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp4, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef)
  ; CHECK: call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp6, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef)
  define i16 @reduction_i32(i16* nocapture readonly %A, i16* nocapture readonly %B, i32 %N) {
@@ -63,8 +63,8 @@ middle.block:                                     ; preds = %vector.body
  ; CHECK: phi <8 x i16> [ zeroinitializer, %entry ]
  ; CHECK: phi i32
  ; CHECK: [[PHI:%[^ ]+]] = phi i32 [ %N, %entry ], [ [[ELEMS:%[^ ]+]], %vector.body ]
+; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.vctp16(i32 [[PHI]])
  ; CHECK: [[ELEMS]] = sub i32 [[PHI]], 8
-; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.vctp16(i32 [[ELEMS]])
  ; CHECK: call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp4, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef)
  define i16 @reduction_i32_with_scalar(i16* nocapture readonly %A, i16 %B, i32 %N) local_unnamed_addr {
  entry:
diff --git a/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll b/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll

index 1612e26e3f7d0d2bc409556e74969ad706dd5668..bd691963ad32c0730398ba86fb20c164e91c9118 100644 (file)
--- a/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll
@@ -3,13 +3,14 @@
  ; CHECK-LABEL: mul_reduce_add
  ; CHECK:      dls lr,
  ; CHECK: [[LOOP:.LBB[0-9_]+]]:
-; CHECK:      sub{{.*}} [[ELEMS:r[0-9]+]], #4
-; CHECK:      vctp.32  [[ELEMS]]
+; CHECK:      vctp.32  [[ELEMS:r[0-9]+]]
  ; CHECK:      vpstt    
  ; CHECK-NEXT: vldrwt.u32
  ; CHECK-NEXT: vldrwt.u32
+; CHECK:      mov [[ELEMS_OUT:r[0-9]+]], [[ELEMS]]
+; CHECK:      sub{{.*}} [[ELEMS]], #4
  ; CHECK:      le       lr, [[LOOP]]
-; CHECK:      vctp.32  [[ELEMS]]
+; CHECK:      vctp.32  [[ELEMS_OUT]]
  ; CHECK:      vpsel
  ; CHECK:      vaddv.u32        r0
  define dso_local i32 @mul_reduce_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32 %N) {
@@ -54,7 +55,17 @@ for.cond.cleanup:                                 ; preds = %middle.block, %entr
    ret i32 %res.0.lcssa
  }
  
-; Function Attrs: norecurse nounwind readonly
+; CHECK-LABEL: mul_reduce_add_const
+; CHECK:    dls lr
+; CHECK: [[LOOP:.LBB[0-9_]+]]:
+; CHECK:      vctp.32 [[ELEMS:r[0-9]+]]
+; CHECK:      vpst     
+; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r0]
+; CHECK:      mov [[ELEMS_OUT:r[0-9]+]], [[ELEMS]]
+; CHECK:      sub{{.*}} [[ELEMS]], #4
+; CHECK:      le lr, [[LOOP]]
+; CHECK:      vctp.32 [[ELEMS_OUT]]
+; CHECK:      vpsel
  define dso_local i32 @mul_reduce_add_const(i32* noalias nocapture readonly %a, i32 %b, i32 %N) {
  entry:
    %cmp6 = icmp eq i32 %N, 0
@@ -96,13 +107,14 @@ for.cond.cleanup:                                 ; preds = %middle.block, %entr
  ; CHECK-LABEL: add_reduce_add_const
  ; CHECK:      dls lr, lr
  ; CHECK: [[LOOP:.LBB[0-9_]+]]:
-; CHECK:      subs [[ELEMS:r[0-9]+]], #4
-; CHECK:      vctp.32 [[ELEMS]]
+; CHECK:      vctp.32 [[ELEMS:r[0-9]+]]
  ; CHECK:      vpst     
  ; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r0]
+; CHECK:      mov [[ELEMS_OUT:r[0-9]+]], [[ELEMS]]
+; CHECK:      sub{{.*}} [[ELEMS]], #4
  ; CHECK:      vadd.i32
  ; CHECK:      le lr, [[LOOP]]
-; CHECK:      vctp.32 [[ELEMS]]
+; CHECK:      vctp.32 [[ELEMS_OUT]]
  ; CHECK:      vpsel
  define dso_local i32 @add_reduce_add_const(i32* noalias nocapture readonly %a, i32 %b, i32 %N) {
  entry:
@@ -145,8 +157,8 @@ for.cond.cleanup:                                 ; preds = %middle.block, %entr
  ; CHECK-LABEL: vector_mul_const
  ; CHECK:      dls lr, lr
  ; CHECK: [[LOOP:.LBB[0-9_]+]]:
-; CHECK:      subs [[ELEMS:r[0-9]+]], #4
-; CHECK:      vctp.32 [[ELEMS]]
+; CHECK:      vctp.32 [[ELEMS:r[0-9]+]]
+; CHECK:      sub{{.*}} [[ELEMS]], #4
  ; CHECK:      vpst     
  ; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r1]
  ; CHECK:      vmul.i32
@@ -192,8 +204,8 @@ for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ; CHECK-LABEL: vector_add_const
  ; CHECK:      dls lr, lr
  ; CHECK: [[LOOP:.LBB[0-9_]+]]:
-; CHECK:      subs [[ELEMS:r[0-9]+]], #4
-; CHECK:      vctp.32 [[ELEMS]]
+; CHECK:      vctp.32 [[ELEMS:r[0-9]+]]
+; CHECK:      sub{{.*}} [[ELEMS]], #4
  ; CHECK:      vpst     
  ; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r1]
  ; CHECK:      vadd.i32
diff --git a/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll b/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll

index 824f1d5790d2832dba9f881e7884e45a721b6a6d..dbf40f60cbd9a17323aeb4fb87bda4105c47956c 100644 (file)
--- a/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll
+++ b/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll
@@ -6,13 +6,13 @@
  ; CHECK: vector.body:
  ; CHECK-NOT: phi i32 [ 0, %vector.ph ]
  ; CHECK: [[ELTS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[SUB:%[^ ]+]], %vector.body ]
+; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[ELTS]])
  ; CHECK: [[SUB]] = sub i32 [[ELTS]], 4
-; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[SUB]])
  ; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]]
  ; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]],
  
  ; CHECK: middle.block:
-; CHECK: [[VCTP_CLONE:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[SUB]])
+; CHECK: [[VCTP_CLONE:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[ELTS]])
  ; CHECK: [[VPSEL:%[^ ]+]] = select <4 x i1> [[VCTP_CLONE]],
  ; CHECK: call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[VPSEL]])
author	Sam Parker <sam.parker@arm.com>
	Mon, 30 Sep 2019 08:03:23 +0000 (08:03 +0000)
committer	Sam Parker <sam.parker@arm.com>
	Mon, 30 Sep 2019 08:03:23 +0000 (08:03 +0000)
lib/Target/ARM/MVETailPredication.cpp		patch \| blob \| history
test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll		patch \| blob \| history
test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll		patch \| blob \| history
test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll		patch \| blob \| history
test/CodeGen/Thumb2/LowOverheadLoops/nested.ll		patch \| blob \| history
test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-widen.ll		patch \| blob \| history
test/CodeGen/Thumb2/LowOverheadLoops/tail-reduce.ll		patch \| blob \| history
test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll		patch \| blob \| history
test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll		patch \| blob \| history