[LV] fold-tail predication should be respected even with assume_safety

author Dorit Nuzman <dorit.nuzman@intel.com>

Thu, 15 Aug 2019 07:12:14 +0000 (07:12 +0000)

committer Dorit Nuzman <dorit.nuzman@intel.com>

Thu, 15 Aug 2019 07:12:14 +0000 (07:12 +0000)
author Dorit Nuzman <dorit.nuzman@intel.com>
Thu, 15 Aug 2019 07:12:14 +0000 (07:12 +0000)
committer Dorit Nuzman <dorit.nuzman@intel.com>
Thu, 15 Aug 2019 07:12:14 +0000 (07:12 +0000)
diff --git a/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h

index 3b0b1eec5b8fa32f8f6e7fab6c1f187b75ee135e..62ecd6d6aa343c136d3b6674e7138b0915b5160f 100644 (file)
--- a/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -228,8 +228,8 @@ public:
    bool canVectorize(bool UseVPlanNativePath);
  
    /// Return true if we can vectorize this loop while folding its tail by
-  /// masking.
-  bool canFoldTailByMasking();
+  /// masking, and mark all respective loads/stores for masking.
+  bool prepareToFoldTailByMasking();
  
    /// Returns the primary induction variable.
    PHINode *getPrimaryInduction() { return PrimaryInduction; }
@@ -355,9 +355,16 @@ private:
    bool canVectorizeOuterLoop();
  
    /// Return true if all of the instructions in the block can be speculatively
-  /// executed. \p SafePtrs is a list of addresses that are known to be legal
-  /// and we know that we can read from them without segfault.
-  bool blockCanBePredicated(BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs);
+  /// executed, and record the loads/stores that require masking. Ifs that
+  /// guard loads can be ignored under "assume safety" unless \p PreserveGuards
+  /// is true. This can happen when we introduce guards for which the original
+  /// "unguarded-loads are safe" assumption does not hold. For example, the
+  /// vectorizer's fold-tail transformation changes the loop to execute beyond
+  /// its original trip-count, under a proper guard, which should be preserved.
+  /// \p SafePtrs is a list of addresses that are known to be legal and we know
+  /// that we can read from them without segfault.
+  bool blockCanBePredicated(BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs,
+                            bool PreserveGuards = false);
  
    /// Updates the vectorization state by adding \p Phi to the inductions list.
    /// This can set \p Phi as the main induction of the loop if \p Phi is a
diff --git a/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp

index 8b311fa8b4c64781bdc2aab8f2f6838a0bcf4977..3ac1234446ca8a8b7eb8fa2321ce25b82d8b1b1d 100644 (file)
--- a/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -869,7 +869,7 @@ bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) {
  }
  
  bool LoopVectorizationLegality::blockCanBePredicated(
-    BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs) {
+    BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs, bool PreserveGuards) {
    const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel();
  
    for (Instruction &I : *BB) {
@@ -888,7 +888,7 @@ bool LoopVectorizationLegality::blockCanBePredicated(
          // !llvm.mem.parallel_loop_access implies if-conversion safety.
          // Otherwise, record that the load needs (real or emulated) masking
          // and let the cost model decide.
-        if (!IsAnnotatedParallel)
+        if (!IsAnnotatedParallel || PreserveGuards)
            MaskedOp.insert(LI);
          continue;
        }
@@ -1159,7 +1159,7 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) {
    return Result;
  }
  
-bool LoopVectorizationLegality::canFoldTailByMasking() {
+bool LoopVectorizationLegality::prepareToFoldTailByMasking() {
  
    LLVM_DEBUG(dbgs() << "LV: checking if tail can be folded by masking.\n");
  
@@ -1202,7 +1202,7 @@ bool LoopVectorizationLegality::canFoldTailByMasking() {
    // Check and mark all blocks for predication, including those that ordinarily
    // do not need predication such as the header block.
    for (BasicBlock *BB : TheLoop->blocks()) {
-    if (!blockCanBePredicated(BB, SafePointers)) {
+    if (!blockCanBePredicated(BB, SafePointers, /* PreserveGuards= */ true)) {
        reportVectorizationFailure(
            "Cannot fold tail by masking as required",
            "control flow cannot be substituted for a select",
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp

index 2030541607977520bc91014f9b807759784f35d0..dac48a1d8142b879db0cd7691a1212f084a4a6e4 100644 (file)
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4853,7 +4853,7 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() {
    // found modulo the vectorization factor is not zero, try to fold the tail
    // by masking.
    // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
-  if (Legal->canFoldTailByMasking()) {
+  if (Legal->prepareToFoldTailByMasking()) {
      FoldTailByMasking = true;
      return MaxVF;
    }
diff --git a/test/Transforms/LoopVectorize/X86/tail_folding_and_assume_safety.ll b/test/Transforms/LoopVectorize/X86/tail_folding_and_assume_safety.ll

new file mode 100644 (file)

index 0000000..98ca496
--- /dev/null
+++ b/test/Transforms/LoopVectorize/X86/tail_folding_and_assume_safety.ll
@@ -0,0 +1,166 @@
+; RUN: opt -mcpu=skx -S -loop-vectorize -instcombine -force-vector-width=8 -force-vector-interleave=1 < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-linux-gnu"
+
+; Case1: With pragma predicate to force tail-folding.
+; All memory operations are masked.
+;void fold_tail(int * restrict p, int * restrict q1, int * restrict q2, int guard) {
+;   #pragma clang loop vectorize_predicate(enable)
+;   for(int ix=0; ix < 1021; ++ix) {
+;     if (ix > guard) {
+;       p[ix] = q1[ix] + q2[ix];
+;     }
+;   }
+;}
+
+;CHECK-LABEL: @fold_tail
+;CHECK: vector.body:
+;CHECK: call <8 x i32> @llvm.masked.load
+;CHECK: call <8 x i32> @llvm.masked.load
+;CHECK: call void @llvm.masked.store
+
+; Function Attrs: nofree norecurse nounwind uwtable
+define dso_local void @fold_tail(i32* noalias nocapture %p, i32* noalias nocapture readonly %q1, i32* noalias nocapture readonly %q2, 
+i32 %guard) local_unnamed_addr #0 {
+entry:
+  %0 = sext i32 %guard to i64
+  br label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
+  %cmp1 = icmp sgt i64 %indvars.iv, %0
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:
+  %arrayidx = getelementptr inbounds i32, i32* %q1, i64 %indvars.iv
+  %1 = load i32, i32* %arrayidx, align 4, !tbaa !2
+  %arrayidx3 = getelementptr inbounds i32, i32* %q2, i64 %indvars.iv
+  %2 = load i32, i32* %arrayidx3, align 4, !tbaa !2
+  %add = add nsw i32 %2, %1
+  %arrayidx5 = getelementptr inbounds i32, i32* %p, i64 %indvars.iv
+  store i32 %add, i32* %arrayidx5, align 4, !tbaa !2
+  br label %for.inc
+
+for.inc:
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1021
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !8
+}
+
+; Case2: With pragma assume_safety only the store is masked.
+; void assume_safety(int * p, int * q1, int * q2, int guard) {
+;   #pragma clang loop vectorize(assume_safety)
+;   for(int ix=0; ix < 1021; ++ix) {
+;     if (ix > guard) {
+;       p[ix] = q1[ix] + q2[ix];
+;     }
+;   }
+;}
+
+;CHECK-LABEL: @assume_safety
+;CHECK: vector.body:
+;CHECK-NOT: @llvm.masked.load
+;CHECK:  call void @llvm.masked.store
+
+; Function Attrs: norecurse nounwind uwtable
+define void @assume_safety(i32* nocapture, i32* nocapture readonly, i32* nocapture readonly, i32) local_unnamed_addr #0 {
+  %5 = sext i32 %3 to i64
+  br label %7
+
+; <label>:6:
+  ret void
+
+; <label>:7:
+  %8 = phi i64 [ 0, %4 ], [ %18, %17 ]
+  %9 = icmp sgt i64 %8, %5
+  br i1 %9, label %10, label %17
+
+; <label>:10:
+  %11 = getelementptr inbounds i32, i32* %1, i64 %8
+  %12 = load i32, i32* %11, align 4, !tbaa !2, !llvm.mem.parallel_loop_access !6
+  %13 = getelementptr inbounds i32, i32* %2, i64 %8
+  %14 = load i32, i32* %13, align 4, !tbaa !2, !llvm.mem.parallel_loop_access !6
+  %15 = add nsw i32 %14, %12
+  %16 = getelementptr inbounds i32, i32* %0, i64 %8
+  store i32 %15, i32* %16, align 4, !tbaa !2, !llvm.mem.parallel_loop_access !6
+  br label %17
+
+; <label>:17:
+  %18 = add nuw nsw i64 %8, 1
+  %19 = icmp eq i64 %18, 1021
+  br i1 %19, label %6, label %7, !llvm.loop !6
+}
+
+; Case3: With pragma assume_safety and pragma predicate both the store and the
+; loads are masked.
+; void fold_tail_and_assume_safety(int * p, int * q1, int * q2, int guard) {
+;   #pragma clang loop vectorize(assume_safety) vectorize_predicate(enable)
+;   for(int ix=0; ix < 1021; ++ix) {
+;     if (ix > guard) {
+;       p[ix] = q1[ix] + q2[ix];
+;     }
+;   }
+;}
+
+;CHECK-LABEL: @fold_tail_and_assume_safety
+;CHECK: vector.body:
+;CHECK: call <8 x i32> @llvm.masked.load
+;CHECK: call <8 x i32> @llvm.masked.load
+;CHECK: call void @llvm.masked.store
+
+; Function Attrs: nofree norecurse nounwind uwtable
+define dso_local void @fold_tail_and_assume_safety(i32* noalias nocapture %p, i32* noalias nocapture readonly %q1, i32* noalias nocapture readonly %q2, 
+i32 %guard) local_unnamed_addr #0 {
+entry:
+  %0 = sext i32 %guard to i64
+  br label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
+  %cmp1 = icmp sgt i64 %indvars.iv, %0
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:
+  %arrayidx = getelementptr inbounds i32, i32* %q1, i64 %indvars.iv
+  %1 = load i32, i32* %arrayidx, align 4, !tbaa !2, !llvm.access.group !10
+  %arrayidx3 = getelementptr inbounds i32, i32* %q2, i64 %indvars.iv
+  %2 = load i32, i32* %arrayidx3, align 4, !tbaa !2, !llvm.access.group !10
+  %add = add nsw i32 %2, %1
+  %arrayidx5 = getelementptr inbounds i32, i32* %p, i64 %indvars.iv
+  store i32 %add, i32* %arrayidx5, align 4, !tbaa !2, !llvm.access.group !10
+  br label %for.inc
+
+for.inc:
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1021
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !11
+}
+
+attributes #0 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version 6.0.0-1ubuntu2 (tags/RELEASE_600/final)"}
+!2 = !{!3, !3, i64 0}
+!3 = !{!"int", !4, i64 0}
+!4 = !{!"omnipotent char", !5, i64 0}
+!5 = !{!"Simple C/C++ TBAA"}
+!6 = distinct !{!6, !7}
+!7 = !{!"llvm.loop.vectorize.enable", i1 true}
+
+!8 = distinct !{!8, !9}
+!9 = !{!"llvm.loop.vectorize.predicate.enable", i1 true}
+
+!10 = distinct !{}
+!11 = distinct !{!11, !12, !13}
+!12 = !{!"llvm.loop.parallel_accesses", !10}
+!13 = !{!"llvm.loop.vectorize.predicate.enable", i1 true}
diff --git a/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll b/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll

index 7c249a1b4228b2bcfb33aa988ecce6702160120d..1e8f1409dfb3783ce49c1195a8ddbdb214cd4aa8 100644 (file)
--- a/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll
+++ b/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll
@@ -102,17 +102,17 @@ define void @vectorized1(float* noalias nocapture %A, float* noalias nocapture r
  ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
  ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
  ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[TMP2]] to <8 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP3]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast float* [[TMP5]] to <8 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>* [[TMP6]], align 4
-; CHECK-NEXT:    [[TMP7:%.*]] = fadd fast <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp ule <8 x i64> [[INDUCTION]], <i64 19, i64 19, i64 19, i64 19, i64 19, i64 19, i64 19, i64 19>
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast float* [[TMP5]] to <8 x float>*
-; CHECK-NEXT:    call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP7]], <8 x float>* [[TMP9]], i32 4, <8 x i1> [[TMP8]])
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ule <8 x i64> [[INDUCTION]], <i64 19, i64 19, i64 19, i64 19, i64 19, i64 19, i64 19, i64 19>
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast float* [[TMP3]] to <8 x float>*
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP4]], i32 4, <8 x i1> [[TMP2]], <8 x float> undef), !llvm.access.group !6
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast float* [[TMP6]] to <8 x float>*
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP7]], i32 4, <8 x i1> [[TMP2]], <8 x float> undef), !llvm.access.group !6
+; CHECK-NEXT:    [[TMP8:%.*]] = fadd fast <8 x float> [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD1]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast float* [[TMP6]] to <8 x float>*
+; CHECK-NEXT:    call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP8]], <8 x float>* [[TMP9]], i32 4, <8 x i1> [[TMP2]])
  ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 8
  ; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24
  ; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !7
author	Dorit Nuzman <dorit.nuzman@intel.com>
	Thu, 15 Aug 2019 07:12:14 +0000 (07:12 +0000)
committer	Dorit Nuzman <dorit.nuzman@intel.com>
	Thu, 15 Aug 2019 07:12:14 +0000 (07:12 +0000)
include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h		patch \| blob \| history
lib/Transforms/Vectorize/LoopVectorizationLegality.cpp		patch \| blob \| history
lib/Transforms/Vectorize/LoopVectorize.cpp		patch \| blob \| history
test/Transforms/LoopVectorize/X86/tail_folding_and_assume_safety.ll	[new file with mode: 0644]	patch \| blob
test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll		patch \| blob \| history