class InterleavedAccessInfo {
public:
InterleavedAccessInfo(PredicatedScalarEvolution &PSE, Loop *L,
- DominatorTree *DT)
- : PSE(PSE), TheLoop(L), DT(DT), RequiresScalarEpilogue(false) {}
+ DominatorTree *DT, LoopInfo *LI)
+ : PSE(PSE), TheLoop(L), DT(DT), LI(LI), LAI(nullptr),
+ RequiresScalarEpilogue(false) {}
~InterleavedAccessInfo() {
SmallSet<InterleaveGroup *, 4> DelSet;
/// out-of-bounds requires a scalar epilogue iteration for correctness.
bool requiresScalarEpilogue() const { return RequiresScalarEpilogue; }
+ /// \brief Initialize the LoopAccessInfo used for dependence checking.
+ void setLAI(const LoopAccessInfo *Info) { LAI = Info; }
+
private:
/// A wrapper around ScalarEvolution, used to add runtime SCEV checks.
/// Simplifies SCEV expressions in the context of existing SCEV assumptions.
PredicatedScalarEvolution &PSE;
Loop *TheLoop;
DominatorTree *DT;
+ LoopInfo *LI;
+ const LoopAccessInfo *LAI;
/// True if the loop may contain non-reversed interleaved groups with
/// out-of-bounds accesses. We ensure we don't speculatively access memory
/// Holds the relationships between the members and the interleave group.
DenseMap<Instruction *, InterleaveGroup *> InterleaveGroupMap;
+ /// Holds dependences among the memory accesses in the loop. It maps a source
+ /// access to a set of dependent sink accesses.
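+  /// For example, given the loop body "A[i] = x; y = A[i];", the map records
+  /// the store as a source whose sink set contains the load.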
+ DenseMap<Instruction *, SmallPtrSet<Instruction *, 2>> Dependences;
+
/// \brief The descriptor for a strided memory access.
struct StrideDescriptor {
StrideDescriptor(int Stride, const SCEV *Scev, unsigned Size,
unsigned Align; // The alignment of this access.
};
+ /// \brief A type for holding instructions and their stride descriptors.
+ typedef std::pair<Instruction *, StrideDescriptor> StrideEntry;
+
/// \brief Create a new interleave group with the given instruction \p Instr,
/// stride \p Stride and alignment \p Align.
///
void collectConstStridedAccesses(
MapVector<Instruction *, StrideDescriptor> &StrideAccesses,
const ValueToValueMap &Strides);
+
+ /// \brief Returns true if \p Stride is allowed in an interleaved group.
+ static bool isStrided(int Stride) {
+ unsigned Factor = std::abs(Stride);
+ return Factor >= 2 && Factor <= MaxInterleaveGroupFactor;
+ }
+
+ /// \brief Returns true if LoopAccessInfo can be used for dependence queries.
+ bool areDependencesValid() const {
+ return LAI && LAI->getDepChecker().getDependences();
+ }
+
+ /// \brief Returns true if memory accesses \p B and \p A can be reordered, if
+ /// necessary, when constructing interleaved groups.
+ ///
+  /// \p B must precede \p A in program order. We return false if reordering
+  /// might be required and we cannot prove that \p B and \p A are
+  /// independent.
+ bool canReorderMemAccessesForInterleavedGroups(StrideEntry *B,
+ StrideEntry *A) const {
+
+ // Code motion for interleaved accesses can potentially hoist strided loads
+ // and sink strided stores. The code below checks the legality of the
+ // following two conditions:
+ //
+ // 1. Potentially moving a strided load (A) before any store (B) that
+ // precedes A, or
+ //
+ // 2. Potentially moving a strided store (B) after any load or store (A)
+ // that B precedes.
+ //
+ // It's legal to reorder B and A if we know there isn't a dependence from B
+ // to A. Note that this determination is conservative since some
+ // dependences could potentially be reordered safely.
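+    //
+    // For example, if B is a store and A is a strided load that follows it,
+    // the load may only be hoisted above the store if we can show that the
+    // two accesses are independent.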
+
+ // B is potentially the source of a dependence.
+ auto *Src = B->first;
+ auto SrcDes = B->second;
+
+ // A is potentially the sink of a dependence.
+ auto *Sink = A->first;
+ auto SinkDes = A->second;
+
+ // Code motion for interleaved accesses can't violate WAR dependences.
+ // Thus, reordering is legal if the source isn't a write.
+ if (!Src->mayWriteToMemory())
+ return true;
+
+ // At least one of the accesses must be strided.
+ if (!isStrided(SrcDes.Stride) && !isStrided(SinkDes.Stride))
+ return true;
+
+ // If dependence information is not available from LoopAccessInfo,
+ // conservatively assume the instructions can't be reordered.
+ if (!areDependencesValid())
+ return false;
+
+ // If we know there is a dependence from source to sink, assume the
+ // instructions can't be reordered. Otherwise, reordering is legal.
+ return !Dependences.count(Src) || !Dependences.lookup(Src).count(Sink);
+ }
+
+ /// \brief Collect the dependences from LoopAccessInfo.
+ ///
+ /// We process the dependences once during the interleaved access analysis to
+ /// enable constant-time dependence queries.
+ void collectDependences() {
+ if (!areDependencesValid())
+ return;
+ auto *Deps = LAI->getDepChecker().getDependences();
+ for (auto Dep : *Deps)
+ Dependences[Dep.getSource(*LAI)].insert(Dep.getDestination(*LAI));
+ }
};
/// Utility class for getting and setting loop vectorizer hints in the form
DominatorTree *DT, TargetLibraryInfo *TLI,
AliasAnalysis *AA, Function *F,
const TargetTransformInfo *TTI,
- LoopAccessAnalysis *LAA,
+ LoopAccessAnalysis *LAA, LoopInfo *LI,
LoopVectorizationRequirements *R,
LoopVectorizeHints *H)
: NumPredStores(0), TheLoop(L), PSE(PSE), TLI(TLI), TheFunction(F),
- TTI(TTI), DT(DT), LAA(LAA), LAI(nullptr), InterleaveInfo(PSE, L, DT),
- Induction(nullptr), WidestIndTy(nullptr), HasFunNoNaNAttr(false),
- Requirements(R), Hints(H) {}
+ TTI(TTI), DT(DT), LAA(LAA), LAI(nullptr),
+ InterleaveInfo(PSE, L, DT, LI), Induction(nullptr),
+ WidestIndTy(nullptr), HasFunNoNaNAttr(false), Requirements(R),
+ Hints(H) {}
/// ReductionList contains the reduction descriptors for all
/// of the reductions that were found in the loop.
// Check if it is legal to vectorize the loop.
LoopVectorizationRequirements Requirements;
- LoopVectorizationLegality LVL(L, PSE, DT, TLI, AA, F, TTI, LAA,
+ LoopVectorizationLegality LVL(L, PSE, DT, TLI, AA, F, TTI, LAA, LI,
&Requirements, &Hints);
if (!LVL.canVectorize()) {
DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
bool LoopVectorizationLegality::canVectorizeMemory() {
LAI = &LAA->getInfo(TheLoop);
+ InterleaveInfo.setLAI(LAI);
auto &OptionalReport = LAI->getReport();
if (OptionalReport)
emitAnalysis(VectorizationReport(*OptionalReport));
// Holds load/store instructions in program order.
SmallVector<Instruction *, 16> AccessList;
- for (auto *BB : TheLoop->getBlocks()) {
+ // Since it's desired that the load/store instructions be maintained in
+ // "program order" for the interleaved access analysis, we have to visit the
+ // blocks in the loop in reverse postorder (i.e., in a topological order).
+ // Such an ordering will ensure that any load/store that may be executed
+ // before a second load/store will precede the second load/store in the
+ // AccessList.
+ LoopBlocksDFS DFS(TheLoop);
+ DFS.perform(LI);
+ for (LoopBlocksDFS::RPOIterator I = DFS.beginRPO(), E = DFS.endRPO(); I != E;
+ ++I) {
+ BasicBlock *BB = *I;
bool IsPred = LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
for (auto &I : *BB) {
Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand();
int Stride = getPtrStride(PSE, Ptr, TheLoop, Strides);
- // The factor of the corresponding interleave group.
- unsigned Factor = std::abs(Stride);
-
- // Ignore the access if the factor is too small or too large.
- if (Factor < 2 || Factor > MaxInterleaveGroupFactor)
- continue;
-
const SCEV *Scev = replaceSymbolicStrideSCEV(PSE, Strides, Ptr);
PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
unsigned Size = DL.getTypeAllocSize(PtrTy->getElementType());
}
}
-// Analyze interleaved accesses and collect them into interleave groups.
+// Analyze interleaved accesses and collect them into interleaved load and
+// store groups.
+//
+// When generating code for an interleaved load group, we effectively hoist all
+// loads in the group to the location of the first load in program order. When
+// generating code for an interleaved store group, we sink all stores to the
+// location of the last store. This code motion can change the order of load
+// and store instructions and may break dependences.
+//
+// The code generation strategy mentioned above ensures that we won't violate
+// any write-after-read (WAR) dependences.
+//
+// E.g., for the WAR dependence: a = A[i]; // (1)
+// A[i] = b; // (2)
//
-// Notice that the vectorization on interleaved groups will change instruction
-// orders and may break dependences. But the memory dependence check guarantees
-// that there is no overlap between two pointers of different strides, element
-// sizes or underlying bases.
+// The store group of (2) is always inserted at or below (2), and the load
+// group of (1) is always inserted at or above (1). Thus, the instructions will
+// never be reordered. All other dependences are checked to ensure the
+// correctness of the instruction reordering.
//
-// For pointers sharing the same stride, element size and underlying base, no
-// need to worry about Read-After-Write dependences and Write-After-Read
+// The algorithm visits all memory accesses in the loop in bottom-up program
+// order. Program order is established by traversing the blocks in the loop in
+// reverse postorder when collecting the accesses.
+//
+// We visit the memory accesses in bottom-up order because it can simplify the
+// construction of store groups in the presence of write-after-write (WAW)
// dependences.
//
-// E.g. The RAW dependence: A[i] = a;
-// b = A[i];
-// This won't exist as it is a store-load forwarding conflict, which has
-// already been checked and forbidden in the dependence check.
+// E.g., for the WAW dependence: A[i] = a; // (1)
+// A[i] = b; // (2)
+// A[i + 1] = c; // (3)
//
-// E.g. The WAR dependence: a = A[i]; // (1)
-// A[i] = b; // (2)
-// The store group of (2) is always inserted at or below (2), and the load group
-// of (1) is always inserted at or above (1). The dependence is safe.
+// We will first create a store group with (3) and (2). (1) can't be added to
+// this group because it and (2) are dependent. However, (1) can be grouped
+// with other accesses that may precede it in program order. Note that a
+// bottom-up order does not imply that WAW dependences should not be checked.
void InterleavedAccessInfo::analyzeInterleaving(
const ValueToValueMap &Strides) {
DEBUG(dbgs() << "LV: Analyzing interleaved accesses...\n");
if (StrideAccesses.empty())
return;
+ // Collect the dependences in the loop.
+ collectDependences();
+
// Holds all interleaved store groups temporarily.
SmallSetVector<InterleaveGroup *, 4> StoreGroups;
// Holds all interleaved load groups temporarily.
// 1. A and B have the same stride.
// 2. A and B have the same memory object size.
// 3. B belongs to the group according to the distance.
- //
- // The bottom-up order can avoid breaking the Write-After-Write dependences
- // between two pointers of the same base.
- // E.g. A[i] = a; (1)
- // A[i] = b; (2)
- // A[i+1] = c (3)
- // We form the group (2)+(3) in front, so (1) has to form groups with accesses
- // above (1), which guarantees that (1) is always above (2).
- for (auto I = StrideAccesses.rbegin(), E = StrideAccesses.rend(); I != E;
- ++I) {
- Instruction *A = I->first;
- StrideDescriptor DesA = I->second;
-
- InterleaveGroup *Group = getInterleaveGroup(A);
- if (!Group) {
- DEBUG(dbgs() << "LV: Creating an interleave group with:" << *A << '\n');
- Group = createInterleaveGroup(A, DesA.Stride, DesA.Align);
+ for (auto AI = StrideAccesses.rbegin(), E = StrideAccesses.rend(); AI != E;
+ ++AI) {
+ Instruction *A = AI->first;
+ StrideDescriptor DesA = AI->second;
+
+ // Initialize a group for A if it has an allowable stride. Even if we don't
+ // create a group for A, we continue with the bottom-up algorithm to ensure
+ // we don't break any of A's dependences.
+ InterleaveGroup *Group = nullptr;
+ if (isStrided(DesA.Stride)) {
+ Group = getInterleaveGroup(A);
+ if (!Group) {
+ DEBUG(dbgs() << "LV: Creating an interleave group with:" << *A << '\n');
+ Group = createInterleaveGroup(A, DesA.Stride, DesA.Align);
+ }
+ if (A->mayWriteToMemory())
+ StoreGroups.insert(Group);
+ else
+ LoadGroups.insert(Group);
}
- if (A->mayWriteToMemory())
- StoreGroups.insert(Group);
- else
- LoadGroups.insert(Group);
+ for (auto BI = std::next(AI); BI != E; ++BI) {
+ Instruction *B = BI->first;
+ StrideDescriptor DesB = BI->second;
+
+ // Our code motion strategy implies that we can't have dependences
+ // between accesses in an interleaved group and other accesses located
+ // between the first and last member of the group. Note that this also
+ // means that a group can't have more than one member at a given offset.
+ // The accesses in a group can have dependences with other accesses, but
+ // we must ensure we don't extend the boundaries of the group such that
+ // we encompass those dependent accesses.
+ //
+ // For example, assume we have the sequence of accesses shown below in a
+ // stride-2 loop:
+ //
+ // (1, 2) is a group | A[i] = a; // (1)
+ // | A[i-1] = b; // (2) |
+ // A[i-3] = c; // (3)
+ // A[i] = d; // (4) | (2, 4) is not a group
+ //
+ // Because accesses (2) and (3) are dependent, we can group (2) with (1)
+ // but not with (4). If we did, the dependent access (3) would be within
+ // the boundaries of the (2, 4) group.
+ if (!canReorderMemAccessesForInterleavedGroups(&*BI, &*AI)) {
+
+ // If a dependence exists and B is already in a group, we know that B
+ // must be a store since B precedes A and WAR dependences are allowed.
+ // Thus, B would be sunk below A. We release B's group to prevent this
+ // illegal code motion. B will then be free to form another group with
+ // instructions that precede it.
+ if (isInterleaved(B)) {
+ InterleaveGroup *StoreGroup = getInterleaveGroup(B);
+ StoreGroups.remove(StoreGroup);
+ releaseGroup(StoreGroup);
+ }
+
+ // If a dependence exists and B is not already in a group (or it was
+ // and we just released it), A might be hoisted above B (if A is a
+ // load) or another store might be sunk below B (if A is a store). In
+ // either case, we can't add additional instructions to A's group. A
+ // will only form a group with instructions that it precedes.
+ break;
+ }
- for (auto II = std::next(I); II != E; ++II) {
- Instruction *B = II->first;
- StrideDescriptor DesB = II->second;
+ // At this point, we've checked for illegal code motion. If either A or B
+ // isn't strided, there's nothing left to do.
+ if (!isStrided(DesA.Stride) || !isStrided(DesB.Stride))
+ continue;
// Ignore if B is already in a group or B is a different memory operation.
if (isInterleaved(B) || A->mayReadFromMemory() != B->mayReadFromMemory())
br i1 %exitcond, label %for.cond.cleanup, label %for.body
}
+; Check vectorization of interleaved access groups in the presence of
+; dependences (PR27626). The following tests check that we don't reorder
+; dependent loads and stores when generating code for interleaved access
+; groups. With the exception of PR27626_4, stores should be scalarized because
+; the required code motion would break dependences, and the remaining
+; interleaved load groups should have gaps.
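+;
+; For example, a stride-2 load group with a gap appears below as a wide
+; <8 x i32> load whose even lanes are then extracted or shuffled out, while
+; scalarized stores appear as individual "store i32" instructions.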
+
+; PR27626_0: Ensure a strided store is not moved after a dependent (zero
+; distance) strided load.
+
+; void PR27626_0(struct pair *p, int z, int n) {
+; for (int i = 0; i < n; i++) {
+; p[i].x = z;
+; p[i].y = p[i].x;
+; }
+; }
+
+; CHECK-LABEL: @PR27626_0(
+; CHECK: min.iters.checked:
+; CHECK: %n.mod.vf = and i64 %[[N:.+]], 3
+; CHECK: %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
+; CHECK: %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf
+; CHECK: %n.vec = sub i64 %[[N]], %[[R]]
+; CHECK: vector.body:
+; CHECK: %[[L1:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
+; CHECK: %[[X1:.+]] = extractelement <8 x i32> %[[L1]], i32 0
+; CHECK: store i32 %[[X1]], {{.*}}
+; CHECK: %[[X2:.+]] = extractelement <8 x i32> %[[L1]], i32 2
+; CHECK: store i32 %[[X2]], {{.*}}
+; CHECK: %[[X3:.+]] = extractelement <8 x i32> %[[L1]], i32 4
+; CHECK: store i32 %[[X3]], {{.*}}
+; CHECK: %[[X4:.+]] = extractelement <8 x i32> %[[L1]], i32 6
+; CHECK: store i32 %[[X4]], {{.*}}
+
+%pair.i32 = type { i32, i32 }
+define void @PR27626_0(%pair.i32 *%p, i32 %z, i64 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+ %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0
+ %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1
+ store i32 %z, i32* %p_i.x, align 4
+ %0 = load i32, i32* %p_i.x, align 4
+ store i32 %0, i32 *%p_i.y, align 4
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ ret void
+}
+
+; PR27626_1: Ensure a strided load is not moved before a dependent (zero
+; distance) strided store.
+
+; void PR27626_1(struct pair *p, int n) {
+; int s = 0;
+; for (int i = 0; i < n; i++) {
+; p[i].y = p[i].x;
+;     s += p[i].y;
+; }
+; }
+
+; CHECK-LABEL: @PR27626_1(
+; CHECK: min.iters.checked:
+; CHECK: %n.mod.vf = and i64 %[[N:.+]], 3
+; CHECK: %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
+; CHECK: %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf
+; CHECK: %n.vec = sub i64 %[[N]], %[[R]]
+; CHECK: vector.body:
+; CHECK: %[[Phi:.+]] = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ {{.*}}, %vector.body ]
+; CHECK: %[[L1:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
+; CHECK: %[[X1:.+]] = extractelement <8 x i32> %[[L1]], i32 0
+; CHECK: store i32 %[[X1]], {{.*}}
+; CHECK: %[[X2:.+]] = extractelement <8 x i32> %[[L1]], i32 2
+; CHECK: store i32 %[[X2]], {{.*}}
+; CHECK: %[[X3:.+]] = extractelement <8 x i32> %[[L1]], i32 4
+; CHECK: store i32 %[[X3]], {{.*}}
+; CHECK: %[[X4:.+]] = extractelement <8 x i32> %[[L1]], i32 6
+; CHECK: store i32 %[[X4]], {{.*}}
+; CHECK: %[[L2:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
+; CHECK: %[[S1:.+]] = shufflevector <8 x i32> %[[L2]], <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK: add nsw <4 x i32> %[[S1]], %[[Phi]]
+
+define i32 @PR27626_1(%pair.i32 *%p, i64 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+ %s = phi i32 [ %2, %for.body ], [ 0, %entry ]
+ %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0
+ %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1
+ %0 = load i32, i32* %p_i.x, align 4
+ store i32 %0, i32* %p_i.y, align 4
+ %1 = load i32, i32* %p_i.y, align 4
+ %2 = add nsw i32 %1, %s
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ %3 = phi i32 [ %2, %for.body ]
+ ret i32 %3
+}
+
+; PR27626_2: Ensure a strided store is not moved after a dependent (negative
+; distance) strided load.
+
+; void PR27626_2(struct pair *p, int z, int n) {
+; for (int i = 0; i < n; i++) {
+; p[i].x = z;
+; p[i].y = p[i - 1].x;
+; }
+; }
+
+; CHECK-LABEL: @PR27626_2(
+; CHECK: min.iters.checked:
+; CHECK: %n.mod.vf = and i64 %[[N:.+]], 3
+; CHECK: %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
+; CHECK: %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf
+; CHECK: %n.vec = sub i64 %[[N]], %[[R]]
+; CHECK: vector.body:
+; CHECK: %[[L1:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
+; CHECK: %[[X1:.+]] = extractelement <8 x i32> %[[L1]], i32 0
+; CHECK: store i32 %[[X1]], {{.*}}
+; CHECK: %[[X2:.+]] = extractelement <8 x i32> %[[L1]], i32 2
+; CHECK: store i32 %[[X2]], {{.*}}
+; CHECK: %[[X3:.+]] = extractelement <8 x i32> %[[L1]], i32 4
+; CHECK: store i32 %[[X3]], {{.*}}
+; CHECK: %[[X4:.+]] = extractelement <8 x i32> %[[L1]], i32 6
+; CHECK: store i32 %[[X4]], {{.*}}
+
+define void @PR27626_2(%pair.i32 *%p, i64 %n, i32 %z) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+  %i_minus_1 = add nsw i64 %i, -1
+ %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0
+ %p_i_minus_1.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i_minus_1, i32 0
+ %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1
+ store i32 %z, i32* %p_i.x, align 4
+ %0 = load i32, i32* %p_i_minus_1.x, align 4
+ store i32 %0, i32 *%p_i.y, align 4
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ ret void
+}
+
+; PR27626_3: Ensure a strided load is not moved before a dependent (negative
+; distance) strided store.
+
+; void PR27626_3(struct pair *p, int z, int n) {
+;   int s = 0;
+;   for (int i = 0; i < n; i++) {
+; p[i + 1].y = p[i].x;
+; s += p[i].y;
+; }
+; }
+
+; CHECK-LABEL: @PR27626_3(
+; CHECK: min.iters.checked:
+; CHECK: %n.mod.vf = and i64 %[[N:.+]], 3
+; CHECK: %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
+; CHECK: %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf
+; CHECK: %n.vec = sub i64 %[[N]], %[[R]]
+; CHECK: vector.body:
+; CHECK: %[[Phi:.+]] = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ {{.*}}, %vector.body ]
+; CHECK: %[[L1:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
+; CHECK: %[[X1:.+]] = extractelement <8 x i32> %[[L1]], i32 0
+; CHECK: store i32 %[[X1]], {{.*}}
+; CHECK: %[[X2:.+]] = extractelement <8 x i32> %[[L1]], i32 2
+; CHECK: store i32 %[[X2]], {{.*}}
+; CHECK: %[[X3:.+]] = extractelement <8 x i32> %[[L1]], i32 4
+; CHECK: store i32 %[[X3]], {{.*}}
+; CHECK: %[[X4:.+]] = extractelement <8 x i32> %[[L1]], i32 6
+; CHECK: store i32 %[[X4]], {{.*}}
+; CHECK: %[[L2:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
+; CHECK: %[[S1:.+]] = shufflevector <8 x i32> %[[L2]], <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK: add nsw <4 x i32> %[[S1]], %[[Phi]]
+
+define i32 @PR27626_3(%pair.i32 *%p, i64 %n, i32 %z) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+ %s = phi i32 [ %2, %for.body ], [ 0, %entry ]
+ %i_plus_1 = add nuw nsw i64 %i, 1
+ %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0
+ %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1
+ %p_i_plus_1.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i_plus_1, i32 1
+ %0 = load i32, i32* %p_i.x, align 4
+ store i32 %0, i32* %p_i_plus_1.y, align 4
+ %1 = load i32, i32* %p_i.y, align 4
+ %2 = add nsw i32 %1, %s
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ %3 = phi i32 [ %2, %for.body ]
+ ret i32 %3
+}
+
+; PR27626_4: Ensure we form an interleaved group for strided stores in the
+; presence of a write-after-write dependence. We create a group for
+; (2) and (3) while excluding (1).
+
+; void PR27626_4(int *a, int x, int y, int z, int n) {
+; for (int i = 0; i < n; i += 2) {
+; a[i] = x; // (1)
+; a[i] = y; // (2)
+; a[i + 1] = z; // (3)
+; }
+; }
+
+; CHECK-LABEL: @PR27626_4(
+; CHECK: vector.ph:
+; CHECK: %[[INS_Y:.+]] = insertelement <4 x i32> undef, i32 %y, i32 0
+; CHECK: %[[SPLAT_Y:.+]] = shufflevector <4 x i32> %[[INS_Y]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK: %[[INS_Z:.+]] = insertelement <4 x i32> undef, i32 %z, i32 0
+; CHECK: %[[SPLAT_Z:.+]] = shufflevector <4 x i32> %[[INS_Z]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK: vector.body:
+; CHECK: store i32 %x, {{.*}}
+; CHECK: store i32 %x, {{.*}}
+; CHECK: store i32 %x, {{.*}}
+; CHECK: store i32 %x, {{.*}}
+; CHECK: %[[VEC:.+]] = shufflevector <4 x i32> %[[SPLAT_Y]], <4 x i32> %[[SPLAT_Z]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; CHECK: store <8 x i32> %[[VEC]], {{.*}}
+
+define void @PR27626_4(i32 *%a, i32 %x, i32 %y, i32 %z, i64 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+ %i_plus_1 = add i64 %i, 1
+ %a_i = getelementptr inbounds i32, i32* %a, i64 %i
+ %a_i_plus_1 = getelementptr inbounds i32, i32* %a, i64 %i_plus_1
+ store i32 %x, i32* %a_i, align 4
+ store i32 %y, i32* %a_i, align 4
+ store i32 %z, i32* %a_i_plus_1, align 4
+ %i.next = add nuw nsw i64 %i, 2
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ ret void
+}
+
+; PR27626_5: Ensure we do not form an interleaved group for strided stores
+; when a write-after-write dependence exists between a would-be group member
+; and an access located between the group's first and last members.
+
+; void PR27626_5(int *a, int x, int y, int z, int n) {
+; for (int i = 3; i < n; i += 2) {
+; a[i - 1] = x;
+; a[i - 3] = y;
+; a[i] = z;
+; }
+; }
+
+; CHECK-LABEL: @PR27626_5(
+; CHECK: vector.body:
+; CHECK: store i32 %x, {{.*}}
+; CHECK: store i32 %x, {{.*}}
+; CHECK: store i32 %x, {{.*}}
+; CHECK: store i32 %x, {{.*}}
+; CHECK: store i32 %y, {{.*}}
+; CHECK: store i32 %y, {{.*}}
+; CHECK: store i32 %y, {{.*}}
+; CHECK: store i32 %y, {{.*}}
+; CHECK: store i32 %z, {{.*}}
+; CHECK: store i32 %z, {{.*}}
+; CHECK: store i32 %z, {{.*}}
+; CHECK: store i32 %z, {{.*}}
+
+define void @PR27626_5(i32 *%a, i32 %x, i32 %y, i32 %z, i64 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ %i.next, %for.body ], [ 3, %entry ]
+ %i_minus_1 = sub i64 %i, 1
+ %i_minus_3 = sub i64 %i_minus_1, 2
+ %a_i = getelementptr inbounds i32, i32* %a, i64 %i
+ %a_i_minus_1 = getelementptr inbounds i32, i32* %a, i64 %i_minus_1
+ %a_i_minus_3 = getelementptr inbounds i32, i32* %a, i64 %i_minus_3
+ store i32 %x, i32* %a_i_minus_1, align 4
+ store i32 %y, i32* %a_i_minus_3, align 4
+ store i32 %z, i32* %a_i, align 4
+ %i.next = add nuw nsw i64 %i, 2
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ ret void
+}
+
attributes #0 = { "unsafe-fp-math"="true" }