class InterleavedAccessInfo {
public:
InterleavedAccessInfo(PredicatedScalarEvolution &PSE, Loop *L,
- DominatorTree *DT)
- : PSE(PSE), TheLoop(L), DT(DT), RequiresScalarEpilogue(false) {}
+ DominatorTree *DT, LoopInfo *LI)
+ : PSE(PSE), TheLoop(L), DT(DT), LI(LI), LAI(nullptr),
+ RequiresScalarEpilogue(false) {}
~InterleavedAccessInfo() {
SmallSet<InterleaveGroup *, 4> DelSet;
/// out-of-bounds requires a scalar epilogue iteration for correctness.
bool requiresScalarEpilogue() const { return RequiresScalarEpilogue; }
+ /// \brief Initialize the LoopAccessInfo used for dependence checking.
+ void setLAI(const LoopAccessInfo *Info) { LAI = Info; }
+
private:
/// A wrapper around ScalarEvolution, used to add runtime SCEV checks.
/// Simplifies SCEV expressions in the context of existing SCEV assumptions.
PredicatedScalarEvolution &PSE;
Loop *TheLoop;
DominatorTree *DT;
+ LoopInfo *LI;
+ const LoopAccessInfo *LAI;
/// True if the loop may contain non-reversed interleaved groups with
/// out-of-bounds accesses. We ensure we don't speculatively access memory
/// Holds the relationships between the members and the interleave group.
DenseMap<Instruction *, InterleaveGroup *> InterleaveGroupMap;
+ /// Holds dependences among the memory accesses in the loop. It maps a source
+ /// access to a set of dependent sink accesses.
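+  /// For example, given the loop body "A[i] = x; y = A[i];", the map records
+  /// the store as a source whose sink set contains the load.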
+ DenseMap<Instruction *, SmallPtrSet<Instruction *, 2>> Dependences;
+
/// \brief The descriptor for a strided memory access.
struct StrideDescriptor {
StrideDescriptor(int Stride, const SCEV *Scev, unsigned Size,
unsigned Align; // The alignment of this access.
};
+ /// \brief A type for holding instructions and their stride descriptors.
+ typedef std::pair<Instruction *, StrideDescriptor> StrideEntry;
+
/// \brief Create a new interleave group with the given instruction \p Instr,
/// stride \p Stride and alignment \p Align.
///
void collectConstStridedAccesses(
MapVector<Instruction *, StrideDescriptor> &StrideAccesses,
const ValueToValueMap &Strides);
+
+ /// \brief Returns true if \p Stride is allowed in an interleaved group.
+ static bool isStrided(int Stride) {
+ unsigned Factor = std::abs(Stride);
+ return Factor >= 2 && Factor <= MaxInterleaveGroupFactor;
+ }
+
+ /// \brief Returns true if LoopAccessInfo can be used for dependence queries.
+ bool areDependencesValid() const {
+ return LAI && LAI->getDepChecker().getDependences();
+ }
+
+ /// \brief Returns true if memory accesses \p B and \p A can be reordered, if
+ /// necessary, when constructing interleaved groups.
+ ///
+  /// \p B must precede \p A in program order. We return false if reordering
+  /// might be required and we cannot prove that \p B and \p A are
+  /// independent.
+ bool canReorderMemAccessesForInterleavedGroups(StrideEntry *B,
+ StrideEntry *A) const {
+
+ // Code motion for interleaved accesses can potentially hoist strided loads
+ // and sink strided stores. The code below checks the legality of the
+ // following two conditions:
+ //
+ // 1. Potentially moving a strided load (A) before any store (B) that
+ // precedes A, or
+ //
+ // 2. Potentially moving a strided store (B) after any load or store (A)
+ // that B precedes.
+ //
+ // It's legal to reorder B and A if we know there isn't a dependence from B
+ // to A. Note that this determination is conservative since some
+ // dependences could potentially be reordered safely.
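+    //
+    // For example, if B is a store and A is a strided load that follows it,
+    // the load may only be hoisted above the store if we can show that the
+    // two accesses are independent.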
+
+ // B is potentially the source of a dependence.
+ auto *Src = B->first;
+ auto SrcDes = B->second;
+
+ // A is potentially the sink of a dependence.
+ auto *Sink = A->first;
+ auto SinkDes = A->second;
+
+ // Code motion for interleaved accesses can't violate WAR dependences.
+ // Thus, reordering is legal if the source isn't a write.
+ if (!Src->mayWriteToMemory())
+ return true;
+
+ // At least one of the accesses must be strided.
+ if (!isStrided(SrcDes.Stride) && !isStrided(SinkDes.Stride))
+ return true;
+
+ // If dependence information is not available from LoopAccessInfo,
+ // conservatively assume the instructions can't be reordered.
+ if (!areDependencesValid())
+ return false;
+
+ // If we know there is a dependence from source to sink, assume the
+ // instructions can't be reordered. Otherwise, reordering is legal.
+ return !Dependences.count(Src) || !Dependences.lookup(Src).count(Sink);
+ }
+
+ /// \brief Collect the dependences from LoopAccessInfo.
+ ///
+ /// We process the dependences once during the interleaved access analysis to
+ /// enable constant-time dependence queries.
+ void collectDependences() {
+ if (!areDependencesValid())
+ return;
+ auto *Deps = LAI->getDepChecker().getDependences();
+ for (auto Dep : *Deps)
+ Dependences[Dep.getSource(*LAI)].insert(Dep.getDestination(*LAI));
+ }
};
/// Utility class for getting and setting loop vectorizer hints in the form
DominatorTree *DT, TargetLibraryInfo *TLI,
AliasAnalysis *AA, Function *F,
const TargetTransformInfo *TTI,
- LoopAccessAnalysis *LAA,
+ LoopAccessAnalysis *LAA, LoopInfo *LI,
LoopVectorizationRequirements *R,
LoopVectorizeHints *H)
: NumPredStores(0), TheLoop(L), PSE(PSE), TLI(TLI), TheFunction(F),
- TTI(TTI), DT(DT), LAA(LAA), LAI(nullptr), InterleaveInfo(PSE, L, DT),
- Induction(nullptr), WidestIndTy(nullptr), HasFunNoNaNAttr(false),
- Requirements(R), Hints(H) {}
+ TTI(TTI), DT(DT), LAA(LAA), LAI(nullptr),
+ InterleaveInfo(PSE, L, DT, LI), Induction(nullptr),
+ WidestIndTy(nullptr), HasFunNoNaNAttr(false), Requirements(R),
+ Hints(H) {}
/// ReductionList contains the reduction descriptors for all
/// of the reductions that were found in the loop.
// Check if it is legal to vectorize the loop.
LoopVectorizationRequirements Requirements;
- LoopVectorizationLegality LVL(L, PSE, DT, TLI, AA, F, TTI, LAA,
+ LoopVectorizationLegality LVL(L, PSE, DT, TLI, AA, F, TTI, LAA, LI,
&Requirements, &Hints);
if (!LVL.canVectorize()) {
DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
bool LoopVectorizationLegality::canVectorizeMemory() {
LAI = &LAA->getInfo(TheLoop);
+ InterleaveInfo.setLAI(LAI);
auto &OptionalReport = LAI->getReport();
if (OptionalReport)
emitAnalysis(VectorizationReport(*OptionalReport));
// Holds load/store instructions in program order.
SmallVector<Instruction *, 16> AccessList;
- for (auto *BB : TheLoop->getBlocks()) {
+ // Since it's desired that the load/store instructions be maintained in
+ // "program order" for the interleaved access analysis, we have to visit the
+ // blocks in the loop in reverse postorder (i.e., in a topological order).
+ // Such an ordering will ensure that any load/store that may be executed
+ // before a second load/store will precede the second load/store in the
+ // AccessList.
+ LoopBlocksDFS DFS(TheLoop);
+ DFS.perform(LI);
+ for (LoopBlocksDFS::RPOIterator I = DFS.beginRPO(), E = DFS.endRPO(); I != E;
+ ++I) {
+ BasicBlock *BB = *I;
bool IsPred = LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
for (auto &I : *BB) {
Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand();
int Stride = getPtrStride(PSE, Ptr, TheLoop, Strides);
- // The factor of the corresponding interleave group.
- unsigned Factor = std::abs(Stride);
-
- // Ignore the access if the factor is too small or too large.
- if (Factor < 2 || Factor > MaxInterleaveGroupFactor)
- continue;
-
const SCEV *Scev = replaceSymbolicStrideSCEV(PSE, Strides, Ptr);
PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
unsigned Size = DL.getTypeAllocSize(PtrTy->getElementType());
}
}
-// Analyze interleaved accesses and collect them into interleave groups.
+// Analyze interleaved accesses and collect them into interleaved load and
+// store groups.
+//
+// When generating code for an interleaved load group, we effectively hoist all
+// loads in the group to the location of the first load in program order. When
+// generating code for an interleaved store group, we sink all stores to the
+// location of the last store. This code motion can change the order of load
+// and store instructions and may break dependences.
+//
+// The code generation strategy mentioned above ensures that we won't violate
+// any write-after-read (WAR) dependences.
+//
+// E.g., for the WAR dependence: a = A[i]; // (1)
+// A[i] = b; // (2)
//
-// Notice that the vectorization on interleaved groups will change instruction
-// orders and may break dependences. But the memory dependence check guarantees
-// that there is no overlap between two pointers of different strides, element
-// sizes or underlying bases.
+// The store group of (2) is always inserted at or below (2), and the load
+// group of (1) is always inserted at or above (1). Thus, the instructions will
+// never be reordered. All other dependences are checked to ensure the
+// correctness of the instruction reordering.
//
-// For pointers sharing the same stride, element size and underlying base, no
-// need to worry about Read-After-Write dependences and Write-After-Read
+// The algorithm visits all memory accesses in the loop in bottom-up program
+// order. Program order is established by traversing the blocks in the loop in
+// reverse postorder when collecting the accesses.
+//
+// We visit the memory accesses in bottom-up order because it can simplify the
+// construction of store groups in the presence of write-after-write (WAW)
// dependences.
//
-// E.g. The RAW dependence: A[i] = a;
-// b = A[i];
-// This won't exist as it is a store-load forwarding conflict, which has
-// already been checked and forbidden in the dependence check.
+// E.g., for the WAW dependence: A[i] = a; // (1)
+// A[i] = b; // (2)
+// A[i + 1] = c; // (3)
//
-// E.g. The WAR dependence: a = A[i]; // (1)
-// A[i] = b; // (2)
-// The store group of (2) is always inserted at or below (2), and the load group
-// of (1) is always inserted at or above (1). The dependence is safe.
+// We will first create a store group with (3) and (2). (1) can't be added to
+// this group because it and (2) are dependent. However, (1) can be grouped
+// with other accesses that may precede it in program order. Note that a
+// bottom-up order does not imply that WAW dependences should not be checked.
void InterleavedAccessInfo::analyzeInterleaving(
const ValueToValueMap &Strides) {
DEBUG(dbgs() << "LV: Analyzing interleaved accesses...\n");
if (StrideAccesses.empty())
return;
+ // Collect the dependences in the loop.
+ collectDependences();
+
// Holds all interleaved store groups temporarily.
SmallSetVector<InterleaveGroup *, 4> StoreGroups;
// Holds all interleaved load groups temporarily.
// 1. A and B have the same stride.
// 2. A and B have the same memory object size.
// 3. B belongs to the group according to the distance.
- //
- // The bottom-up order can avoid breaking the Write-After-Write dependences
- // between two pointers of the same base.
- // E.g. A[i] = a; (1)
- // A[i] = b; (2)
- // A[i+1] = c (3)
- // We form the group (2)+(3) in front, so (1) has to form groups with accesses
- // above (1), which guarantees that (1) is always above (2).
- for (auto I = StrideAccesses.rbegin(), E = StrideAccesses.rend(); I != E;
- ++I) {
- Instruction *A = I->first;
- StrideDescriptor DesA = I->second;
-
- InterleaveGroup *Group = getInterleaveGroup(A);
- if (!Group) {
- DEBUG(dbgs() << "LV: Creating an interleave group with:" << *A << '\n');
- Group = createInterleaveGroup(A, DesA.Stride, DesA.Align);
+ for (auto AI = StrideAccesses.rbegin(), E = StrideAccesses.rend(); AI != E;
+ ++AI) {
+ Instruction *A = AI->first;
+ StrideDescriptor DesA = AI->second;
+
+ // Initialize a group for A if it has an allowable stride. Even if we don't
+ // create a group for A, we continue with the bottom-up algorithm to ensure
+ // we don't break any of A's dependences.
+ InterleaveGroup *Group = nullptr;
+ if (isStrided(DesA.Stride)) {
+ Group = getInterleaveGroup(A);
+ if (!Group) {
+ DEBUG(dbgs() << "LV: Creating an interleave group with:" << *A << '\n');
+ Group = createInterleaveGroup(A, DesA.Stride, DesA.Align);
+ }
+ if (A->mayWriteToMemory())
+ StoreGroups.insert(Group);
+ else
+ LoadGroups.insert(Group);
}
- if (A->mayWriteToMemory())
- StoreGroups.insert(Group);
- else
- LoadGroups.insert(Group);
+ for (auto BI = std::next(AI); BI != E; ++BI) {
+ Instruction *B = BI->first;
+ StrideDescriptor DesB = BI->second;
+
+ // Our code motion strategy implies that we can't have dependences
+ // between accesses in an interleaved group and other accesses located
+ // between the first and last member of the group. Note that this also
+ // means that a group can't have more than one member at a given offset.
+ // The accesses in a group can have dependences with other accesses, but
+ // we must ensure we don't extend the boundaries of the group such that
+ // we encompass those dependent accesses.
+ //
+ // For example, assume we have the sequence of accesses shown below in a
+ // stride-2 loop:
+ //
+ // (1, 2) is a group | A[i] = a; // (1)
+ // | A[i-1] = b; // (2) |
+ // A[i-3] = c; // (3)
+ // A[i] = d; // (4) | (2, 4) is not a group
+ //
+ // Because accesses (2) and (3) are dependent, we can group (2) with (1)
+ // but not with (4). If we did, the dependent access (3) would be within
+ // the boundaries of the (2, 4) group.
+ if (!canReorderMemAccessesForInterleavedGroups(&*BI, &*AI)) {
+
+ // If a dependence exists and B is already in a group, we know that B
+ // must be a store since B precedes A and WAR dependences are allowed.
+ // Thus, B would be sunk below A. We release B's group to prevent this
+ // illegal code motion. B will then be free to form another group with
+ // instructions that precede it.
+ if (isInterleaved(B)) {
+ InterleaveGroup *StoreGroup = getInterleaveGroup(B);
+ StoreGroups.remove(StoreGroup);
+ releaseGroup(StoreGroup);
+ }
+
+ // If a dependence exists and B is not already in a group (or it was
+ // and we just released it), A might be hoisted above B (if A is a
+ // load) or another store might be sunk below B (if A is a store). In
+ // either case, we can't add additional instructions to A's group. A
+ // will only form a group with instructions that it precedes.
+ break;
+ }
- for (auto II = std::next(I); II != E; ++II) {
- Instruction *B = II->first;
- StrideDescriptor DesB = II->second;
+ // At this point, we've checked for illegal code motion. If either A or B
+ // isn't strided, there's nothing left to do.
+ if (!isStrided(DesA.Stride) || !isStrided(DesB.Stride))
+ continue;
// Ignore if B is already in a group or B is a different memory operation.
if (isInterleaved(B) || A->mayReadFromMemory() != B->mayReadFromMemory())
br i1 %exitcond, label %for.cond.cleanup, label %for.body
}
+; Check vectorization of interleaved access groups in the presence of
+; dependences (PR27626). The following tests check that we don't reorder
+; dependent loads and stores when generating code for interleaved access
+; groups. With the exception of PR27626_4, stores should be scalarized because
+; the required code motion would break dependences, and the remaining
+; interleaved load groups should have gaps.
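+;
+; For example, a stride-2 load group with a gap appears below as a wide
+; <8 x i32> load whose even lanes are then extracted or shuffled out, while
+; scalarized stores appear as individual "store i32" instructions.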
+
+; PR27626_0: Ensure a strided store is not moved after a dependent (zero
+; distance) strided load.
+
+; void PR27626_0(struct pair *p, int z, int n) {
+; for (int i = 0; i < n; i++) {
+; p[i].x = z;
+; p[i].y = p[i].x;
+; }
+; }
+
+; CHECK-LABEL: @PR27626_0(
+; CHECK: min.iters.checked:
+; CHECK: %n.mod.vf = and i64 %[[N:.+]], 3
+; CHECK: %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
+; CHECK: %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf
+; CHECK: %n.vec = sub i64 %[[N]], %[[R]]
+; CHECK: vector.body:
+; CHECK: %[[L1:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
+; CHECK: %[[X1:.+]] = extractelement <8 x i32> %[[L1]], i32 0
+; CHECK: store i32 %[[X1]], {{.*}}
+; CHECK: %[[X2:.+]] = extractelement <8 x i32> %[[L1]], i32 2
+; CHECK: store i32 %[[X2]], {{.*}}
+; CHECK: %[[X3:.+]] = extractelement <8 x i32> %[[L1]], i32 4
+; CHECK: store i32 %[[X3]], {{.*}}
+; CHECK: %[[X4:.+]] = extractelement <8 x i32> %[[L1]], i32 6
+; CHECK: store i32 %[[X4]], {{.*}}
+
+%pair.i32 = type { i32, i32 }
+define void @PR27626_0(%pair.i32 *%p, i32 %z, i64 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+ %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0
+ %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1
+ store i32 %z, i32* %p_i.x, align 4
+ %0 = load i32, i32* %p_i.x, align 4
+ store i32 %0, i32 *%p_i.y, align 4
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ ret void
+}
+
+; PR27626_1: Ensure a strided load is not moved before a dependent (zero
+; distance) strided store.
+
+; void PR27626_1(struct pair *p, int n) {
+; int s = 0;
+; for (int i = 0; i < n; i++) {
+; p[i].y = p[i].x;
+;     s += p[i].y;
+; }
+; }
+
+; CHECK-LABEL: @PR27626_1(
+; CHECK: min.iters.checked:
+; CHECK: %n.mod.vf = and i64 %[[N:.+]], 3
+; CHECK: %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
+; CHECK: %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf
+; CHECK: %n.vec = sub i64 %[[N]], %[[R]]
+; CHECK: vector.body:
+; CHECK: %[[Phi:.+]] = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ {{.*}}, %vector.body ]
+; CHECK: %[[L1:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
+; CHECK: %[[X1:.+]] = extractelement <8 x i32> %[[L1]], i32 0
+; CHECK: store i32 %[[X1]], {{.*}}
+; CHECK: %[[X2:.+]] = extractelement <8 x i32> %[[L1]], i32 2
+; CHECK: store i32 %[[X2]], {{.*}}
+; CHECK: %[[X3:.+]] = extractelement <8 x i32> %[[L1]], i32 4
+; CHECK: store i32 %[[X3]], {{.*}}
+; CHECK: %[[X4:.+]] = extractelement <8 x i32> %[[L1]], i32 6
+; CHECK: store i32 %[[X4]], {{.*}}
+; CHECK: %[[L2:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
+; CHECK: %[[S1:.+]] = shufflevector <8 x i32> %[[L2]], <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK: add nsw <4 x i32> %[[S1]], %[[Phi]]
+
+define i32 @PR27626_1(%pair.i32 *%p, i64 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+ %s = phi i32 [ %2, %for.body ], [ 0, %entry ]
+ %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0
+ %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1
+ %0 = load i32, i32* %p_i.x, align 4
+ store i32 %0, i32* %p_i.y, align 4
+ %1 = load i32, i32* %p_i.y, align 4
+ %2 = add nsw i32 %1, %s
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ %3 = phi i32 [ %2, %for.body ]
+ ret i32 %3
+}
+
+; PR27626_2: Ensure a strided store is not moved after a dependent (negative
+; distance) strided load.
+
+; void PR27626_2(struct pair *p, int z, int n) {
+; for (int i = 0; i < n; i++) {
+; p[i].x = z;
+; p[i].y = p[i - 1].x;
+; }
+; }
+
+; CHECK-LABEL: @PR27626_2(
+; CHECK: min.iters.checked:
+; CHECK: %n.mod.vf = and i64 %[[N:.+]], 3
+; CHECK: %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
+; CHECK: %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf
+; CHECK: %n.vec = sub i64 %[[N]], %[[R]]
+; CHECK: vector.body:
+; CHECK: %[[L1:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
+; CHECK: %[[X1:.+]] = extractelement <8 x i32> %[[L1]], i32 0
+; CHECK: store i32 %[[X1]], {{.*}}
+; CHECK: %[[X2:.+]] = extractelement <8 x i32> %[[L1]], i32 2
+; CHECK: store i32 %[[X2]], {{.*}}
+; CHECK: %[[X3:.+]] = extractelement <8 x i32> %[[L1]], i32 4
+; CHECK: store i32 %[[X3]], {{.*}}
+; CHECK: %[[X4:.+]] = extractelement <8 x i32> %[[L1]], i32 6
+; CHECK: store i32 %[[X4]], {{.*}}
+
+define void @PR27626_2(%pair.i32 *%p, i64 %n, i32 %z) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+  %i_minus_1 = add nsw i64 %i, -1
+ %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0
+ %p_i_minus_1.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i_minus_1, i32 0
+ %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1
+ store i32 %z, i32* %p_i.x, align 4
+ %0 = load i32, i32* %p_i_minus_1.x, align 4
+ store i32 %0, i32 *%p_i.y, align 4
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ ret void
+}
+
+; PR27626_3: Ensure a strided load is not moved before a dependent (negative
+; distance) strided store.
+
+; void PR27626_3(struct pair *p, int z, int n) {
+;   int s = 0;
+;   for (int i = 0; i < n; i++) {
+; p[i + 1].y = p[i].x;
+; s += p[i].y;
+; }
+; }
+
+; CHECK-LABEL: @PR27626_3(
+; CHECK: min.iters.checked:
+; CHECK: %n.mod.vf = and i64 %[[N:.+]], 3
+; CHECK: %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
+; CHECK: %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf
+; CHECK: %n.vec = sub i64 %[[N]], %[[R]]
+; CHECK: vector.body:
+; CHECK: %[[Phi:.+]] = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ {{.*}}, %vector.body ]
+; CHECK: %[[L1:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
+; CHECK: %[[X1:.+]] = extractelement <8 x i32> %[[L1]], i32 0
+; CHECK: store i32 %[[X1]], {{.*}}
+; CHECK: %[[X2:.+]] = extractelement <8 x i32> %[[L1]], i32 2
+; CHECK: store i32 %[[X2]], {{.*}}
+; CHECK: %[[X3:.+]] = extractelement <8 x i32> %[[L1]], i32 4
+; CHECK: store i32 %[[X3]], {{.*}}
+; CHECK: %[[X4:.+]] = extractelement <8 x i32> %[[L1]], i32 6
+; CHECK: store i32 %[[X4]], {{.*}}
+; CHECK: %[[L2:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
+; CHECK: %[[S1:.+]] = shufflevector <8 x i32> %[[L2]], <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK: add nsw <4 x i32> %[[S1]], %[[Phi]]
+
+define i32 @PR27626_3(%pair.i32 *%p, i64 %n, i32 %z) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+ %s = phi i32 [ %2, %for.body ], [ 0, %entry ]
+ %i_plus_1 = add nuw nsw i64 %i, 1
+ %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0
+ %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1
+ %p_i_plus_1.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i_plus_1, i32 1
+ %0 = load i32, i32* %p_i.x, align 4
+ store i32 %0, i32* %p_i_plus_1.y, align 4
+ %1 = load i32, i32* %p_i.y, align 4
+ %2 = add nsw i32 %1, %s
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ %3 = phi i32 [ %2, %for.body ]
+ ret i32 %3
+}
+
+; PR27626_4: Ensure we form an interleaved group for strided stores in the
+; presence of a write-after-write dependence. We create a group for
+; (2) and (3) while excluding (1).
+
+; void PR27626_4(int *a, int x, int y, int z, int n) {
+; for (int i = 0; i < n; i += 2) {
+; a[i] = x; // (1)
+; a[i] = y; // (2)
+; a[i + 1] = z; // (3)
+; }
+; }
+
+; CHECK-LABEL: @PR27626_4(
+; CHECK: vector.ph:
+; CHECK: %[[INS_Y:.+]] = insertelement <4 x i32> undef, i32 %y, i32 0
+; CHECK: %[[SPLAT_Y:.+]] = shufflevector <4 x i32> %[[INS_Y]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK: %[[INS_Z:.+]] = insertelement <4 x i32> undef, i32 %z, i32 0
+; CHECK: %[[SPLAT_Z:.+]] = shufflevector <4 x i32> %[[INS_Z]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK: vector.body:
+; CHECK: store i32 %x, {{.*}}
+; CHECK: store i32 %x, {{.*}}
+; CHECK: store i32 %x, {{.*}}
+; CHECK: store i32 %x, {{.*}}
+; CHECK: %[[VEC:.+]] = shufflevector <4 x i32> %[[SPLAT_Y]], <4 x i32> %[[SPLAT_Z]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; CHECK: store <8 x i32> %[[VEC]], {{.*}}
+
+define void @PR27626_4(i32 *%a, i32 %x, i32 %y, i32 %z, i64 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+ %i_plus_1 = add i64 %i, 1
+ %a_i = getelementptr inbounds i32, i32* %a, i64 %i
+ %a_i_plus_1 = getelementptr inbounds i32, i32* %a, i64 %i_plus_1
+ store i32 %x, i32* %a_i, align 4
+ store i32 %y, i32* %a_i, align 4
+ store i32 %z, i32* %a_i_plus_1, align 4
+ %i.next = add nuw nsw i64 %i, 2
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ ret void
+}
+
+; PR27626_5: Ensure we do not form an interleaved group for strided stores
+; when a write-after-write dependence exists between a would-be group member
+; and an access located between the group's first and last members.
+
+; void PR27626_5(int *a, int x, int y, int z, int n) {
+; for (int i = 3; i < n; i += 2) {
+; a[i - 1] = x;
+; a[i - 3] = y;
+; a[i] = z;
+; }
+; }
+
+; CHECK-LABEL: @PR27626_5(
+; CHECK: vector.body:
+; CHECK: store i32 %x, {{.*}}
+; CHECK: store i32 %x, {{.*}}
+; CHECK: store i32 %x, {{.*}}
+; CHECK: store i32 %x, {{.*}}
+; CHECK: store i32 %y, {{.*}}
+; CHECK: store i32 %y, {{.*}}
+; CHECK: store i32 %y, {{.*}}
+; CHECK: store i32 %y, {{.*}}
+; CHECK: store i32 %z, {{.*}}
+; CHECK: store i32 %z, {{.*}}
+; CHECK: store i32 %z, {{.*}}
+; CHECK: store i32 %z, {{.*}}
+
+define void @PR27626_5(i32 *%a, i32 %x, i32 %y, i32 %z, i64 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ %i.next, %for.body ], [ 3, %entry ]
+ %i_minus_1 = sub i64 %i, 1
+ %i_minus_3 = sub i64 %i_minus_1, 2
+ %a_i = getelementptr inbounds i32, i32* %a, i64 %i
+ %a_i_minus_1 = getelementptr inbounds i32, i32* %a, i64 %i_minus_1
+ %a_i_minus_3 = getelementptr inbounds i32, i32* %a, i64 %i_minus_3
+ store i32 %x, i32* %a_i_minus_1, align 4
+ store i32 %y, i32* %a_i_minus_3, align 4
+ store i32 %z, i32* %a_i, align 4
+ %i.next = add nuw nsw i64 %i, 2
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ ret void
+}
+
attributes #0 = { "unsafe-fp-math"="true" }