[LV] Sink casts to unravel first order recurrence

author Ayal Zaks <ayal.zaks@intel.com>

Fri, 30 Jun 2017 21:05:06 +0000 (21:05 +0000)

committer Ayal Zaks <ayal.zaks@intel.com>

Fri, 30 Jun 2017 21:05:06 +0000 (21:05 +0000)
author Ayal Zaks <ayal.zaks@intel.com>
Fri, 30 Jun 2017 21:05:06 +0000 (21:05 +0000)
committer Ayal Zaks <ayal.zaks@intel.com>
Fri, 30 Jun 2017 21:05:06 +0000 (21:05 +0000)
diff --git a/include/llvm/Transforms/Utils/LoopUtils.h b/include/llvm/Transforms/Utils/LoopUtils.h

index 0397eb95e7632a6c61ffd7670f605df2503283b0..1344285917ba0c0c9b6056669d5a92615ec045cc 100644 (file)
--- a/include/llvm/Transforms/Utils/LoopUtils.h
+++ b/include/llvm/Transforms/Utils/LoopUtils.h
@@ -184,9 +184,14 @@ public:
    /// Returns true if Phi is a first-order recurrence. A first-order recurrence
    /// is a non-reduction recurrence relation in which the value of the
    /// recurrence in the current loop iteration equals a value defined in the
-  /// previous iteration.
-  static bool isFirstOrderRecurrence(PHINode *Phi, Loop *TheLoop,
-                                     DominatorTree *DT);
+  /// previous iteration. \p SinkAfter includes pairs of instructions where the
+  /// first will be rescheduled to appear after the second if/when the loop is
+  /// vectorized. It may be augmented with additional pairs if needed in order
+  /// to handle Phi as a first-order recurrence.
+  static bool
+  isFirstOrderRecurrence(PHINode *Phi, Loop *TheLoop,
+                         DenseMap<Instruction *, Instruction *> &SinkAfter,
+                         DominatorTree *DT);
  
    RecurrenceKind getRecurrenceKind() { return Kind; }
  
diff --git a/lib/Transforms/Utils/LoopUtils.cpp b/lib/Transforms/Utils/LoopUtils.cpp

index 0ed33945ef4074672ef22079a4d3bcf3c740b7c0..58b70be95d9971d133b59eb8fd907f137a583d73 100644 (file)
--- a/lib/Transforms/Utils/LoopUtils.cpp
+++ b/lib/Transforms/Utils/LoopUtils.cpp
@@ -528,8 +528,9 @@ bool RecurrenceDescriptor::isReductionPHI(PHINode *Phi, Loop *TheLoop,
    return false;
  }
  
-bool RecurrenceDescriptor::isFirstOrderRecurrence(PHINode *Phi, Loop *TheLoop,
-                                                  DominatorTree *DT) {
+bool RecurrenceDescriptor::isFirstOrderRecurrence(
+    PHINode *Phi, Loop *TheLoop,
+    DenseMap<Instruction *, Instruction *> &SinkAfter, DominatorTree *DT) {
  
    // Ensure the phi node is in the loop header and has two incoming values.
    if (Phi->getParent() != TheLoop->getHeader() ||
@@ -551,12 +552,24 @@ bool RecurrenceDescriptor::isFirstOrderRecurrence(PHINode *Phi, Loop *TheLoop,
    // Get the previous value. The previous value comes from the latch edge while
    // the initial value comes from the preheader edge.
    auto *Previous = dyn_cast<Instruction>(Phi->getIncomingValueForBlock(Latch));
-  if (!Previous || !TheLoop->contains(Previous) || isa<PHINode>(Previous))
+  if (!Previous || !TheLoop->contains(Previous) || isa<PHINode>(Previous) ||
+      SinkAfter.count(Previous)) // Cannot rely on dominance due to motion.
      return false;
  
    // Ensure every user of the phi node is dominated by the previous value.
    // The dominance requirement ensures the loop vectorizer will not need to
    // vectorize the initial value prior to the first iteration of the loop.
+  // TODO: Consider extending this sinking to handle other kinds of instructions
+  // and expressions, beyond sinking a single cast past Previous.
+  if (Phi->hasOneUse()) {
+    auto *I = Phi->user_back();
+    if (I->isCast() && (I->getParent() == Phi->getParent()) && I->hasOneUse() &&
+        DT->dominates(Previous, I->user_back())) {
+      SinkAfter[I] = Previous;
+      return true;
+    }
+  }
+
    for (User *U : Phi->users())
      if (auto *I = dyn_cast<Instruction>(U)) {
        if (!DT->dominates(Previous, I))
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp

index 0cf2b9f15a844a5ae314ac71cdd44f1cc05a3600..193cc4d137870a5cd4ad84f1f24b60da554b9ebf 100644 (file)
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1604,6 +1604,9 @@ public:
    /// Return the first-order recurrences found in the loop.
    RecurrenceSet *getFirstOrderRecurrences() { return &FirstOrderRecurrences; }
  
+  /// Return the set of instructions to sink to handle first-order recurrences.
+  DenseMap<Instruction *, Instruction *> &getSinkAfter() { return SinkAfter; }
+
    /// Returns the widest induction type.
    Type *getWidestInductionType() { return WidestIndTy; }
  
@@ -1806,6 +1809,9 @@ private:
    InductionList Inductions;
    /// Holds the phi nodes that are first-order recurrences.
    RecurrenceSet FirstOrderRecurrences;
+  /// Holds instructions that need to sink past other instructions to handle
+  /// first-order recurrences.
+  DenseMap<Instruction *, Instruction *> SinkAfter;
    /// Holds the widest induction type encountered.
    Type *WidestIndTy;
  
@@ -5378,7 +5384,8 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
            continue;
          }
  
-        if (RecurrenceDescriptor::isFirstOrderRecurrence(Phi, TheLoop, DT)) {
+        if (RecurrenceDescriptor::isFirstOrderRecurrence(Phi, TheLoop,
+                                                         SinkAfter, DT)) {
            FirstOrderRecurrences.insert(Phi);
            continue;
          }
@@ -7651,6 +7658,15 @@ void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV) {
  
    // 2. Copy and widen instructions from the old loop into the new loop.
  
+  // Move instructions to handle first-order recurrences.
+  DenseMap<Instruction *, Instruction *> SinkAfter = Legal->getSinkAfter();
+  for (auto &Entry : SinkAfter) {
+    Entry.first->removeFromParent();
+    Entry.first->insertAfter(Entry.second);
+    DEBUG(dbgs() << "Sinking" << *Entry.first << " after" << *Entry.second
+                 << " to vectorize a 1st order recurrence.\n");
+  }
+
    // Collect instructions from the original loop that will become trivially dead
    // in the vectorized loop. We don't need to vectorize these instructions. For
    // example, original induction update instructions can become dead because we
diff --git a/test/Transforms/LoopVectorize/first-order-recurrence.ll b/test/Transforms/LoopVectorize/first-order-recurrence.ll

index ef65deee8ec95176977e236c3f30ce3ce69a7d50..0ff94c1450acfcfb0377dcfd6fcbf5fde24392b7 100644 (file)
--- a/test/Transforms/LoopVectorize/first-order-recurrence.ll
+++ b/test/Transforms/LoopVectorize/first-order-recurrence.ll
@@ -2,6 +2,8 @@
  ; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=2 -dce -instcombine -S | FileCheck %s --check-prefix=UNROLL
  ; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=2 -S | FileCheck %s --check-prefix=UNROLL-NO-IC
  ; RUN: opt < %s -loop-vectorize -force-vector-width=1 -force-vector-interleave=2 -S | FileCheck %s --check-prefix=UNROLL-NO-VF
+; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -S | FileCheck %s --check-prefix=SINK-AFTER
+; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -S | FileCheck %s --check-prefix=NO-SINK-AFTER
  
  target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
  
@@ -447,3 +449,81 @@ for.body:
    %exitcond = icmp eq i32 %inc1, 10240
    br i1 %exitcond, label %for.cond.cleanup, label %for.body
  }
+
+; void sink_after(short *a, int n, int *b) {
+;   for(int i = 0; i < n; i++)
+;     b[i] = (a[i] * a[i + 1]);
+; }
+;
+; SINK-AFTER-LABEL: sink_after
+; Check that the sext sank after the load in the vector loop.
+; SINK-AFTER: vector.body
+; SINK-AFTER:   %vector.recur = phi <4 x i16> [ %vector.recur.init, %vector.ph ], [ %wide.load, %vector.body ]
+; SINK-AFTER:   %wide.load = load <4 x i16>
+; SINK-AFTER:   %[[VSHUF:.+]] = shufflevector <4 x i16> %vector.recur, <4 x i16> %wide.load, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; SINK-AFTER:   %[[VCONV:.+]] = sext <4 x i16> %[[VSHUF]] to <4 x i32>
+; SINK-AFTER:   %[[VCONV3:.+]] = sext <4 x i16> %wide.load to <4 x i32>
+; SINK-AFTER:   mul nsw <4 x i32> %[[VCONV3]], %[[VCONV]]
+; Check also that the sext sank after the load in the scalar loop.
+; SINK-AFTER: for.body
+; SINK-AFTER:   %scalar.recur = phi i16 [ %scalar.recur.init, %scalar.ph ], [ %[[LOAD:.+]], %for.body ]
+; SINK-AFTER:   %[[LOAD]] = load i16, i16* %arrayidx2
+; SINK-AFTER:   %[[CONV:.+]] = sext i16 %scalar.recur to i32
+; SINK-AFTER:   %[[CONV3:.+]] = sext i16 %[[LOAD]] to i32
+; SINK-AFTER:   %mul = mul nsw i32 %[[CONV3]], %[[CONV]]
+;
+define void @sink_after(i16* %a, i32* %b, i64 %n) {
+entry:
+  %.pre = load i16, i16* %a
+  br label %for.body
+
+for.body:
+  %0 = phi i16 [ %.pre, %entry ], [ %1, %for.body ]
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %conv = sext i16 %0 to i32
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %arrayidx2 = getelementptr inbounds i16, i16* %a, i64 %indvars.iv.next
+  %1 = load i16, i16* %arrayidx2
+  %conv3 = sext i16 %1 to i32
+  %mul = mul nsw i32 %conv3, %conv
+  %arrayidx5 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+  store i32 %mul, i32* %arrayidx5
+  %exitcond = icmp eq i64 %indvars.iv.next, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+; void no_sink_after(short *a, int n, int *b) {
+;   for(int i = 0; i < n; i++)
+;     b[i] = ((a[i] + 2) * a[i + 1]);
+; }
+;
+; NO-SINK-AFTER-LABEL: no_sink_after
+; NO-SINK-AFTER-NOT:   vector.ph:
+; NO-SINK-AFTER:       }
+;
+define void @no_sink_after(i16* %a, i32* %b, i64 %n) {
+entry:
+  %.pre = load i16, i16* %a
+  br label %for.body
+
+for.body:
+  %0 = phi i16 [ %.pre, %entry ], [ %1, %for.body ]
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %conv = sext i16 %0 to i32
+  %add = add nsw i32 %conv, 2
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %arrayidx2 = getelementptr inbounds i16, i16* %a, i64 %indvars.iv.next
+  %1 = load i16, i16* %arrayidx2
+  %conv3 = sext i16 %1 to i32
+  %mul = mul nsw i32 %add, %conv3
+  %arrayidx5 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+  store i32 %mul, i32* %arrayidx5
+  %exitcond = icmp eq i64 %indvars.iv.next, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
author	Ayal Zaks <ayal.zaks@intel.com>
	Fri, 30 Jun 2017 21:05:06 +0000 (21:05 +0000)
committer	Ayal Zaks <ayal.zaks@intel.com>
	Fri, 30 Jun 2017 21:05:06 +0000 (21:05 +0000)
include/llvm/Transforms/Utils/LoopUtils.h		patch \| blob \| history
lib/Transforms/Utils/LoopUtils.cpp		patch \| blob \| history
lib/Transforms/Vectorize/LoopVectorize.cpp		patch \| blob \| history
test/Transforms/LoopVectorize/first-order-recurrence.ll		patch \| blob \| history