From: Michael Kuperstein
Date: Fri, 3 Feb 2017 19:09:45 +0000 (+0000)
Subject: [SLP] Use SCEV to sort memory accesses.
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=dce4987e9b54c8d4bc74722963668b55b7383e4c;p=llvm

[SLP] Use SCEV to sort memory accesses.

This generalizes memory access sorting to use differences between SCEVs,
instead of relying on constant offsets. That allows us to properly do SLP
vectorization of non-sequentially ordered loads within loop bodies.

Differential Revision: https://reviews.llvm.org/D29425

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@294027 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/include/llvm/Analysis/LoopAccessAnalysis.h b/include/llvm/Analysis/LoopAccessAnalysis.h
index 9eee08c101c..1d9ade4468c 100644
--- a/include/llvm/Analysis/LoopAccessAnalysis.h
+++ b/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -690,8 +690,14 @@ int64_t getPtrStride(PredicatedScalarEvolution &PSE, Value *Ptr, const Loop *Lp,
                      const ValueToValueMap &StridesMap = ValueToValueMap(),
                      bool Assume = false, bool ShouldCheckWrap = true);
 
-/// \brief Saves the sorted memory accesses in vector argument 'Sorted' after
-/// sorting the jumbled memory accesses.
+/// \brief Try to sort an array of loads / stores.
+///
+/// If all pointers refer to the same object, and the differences between all
+/// pointer operands are known to be constant, the array is sorted by offset,
+/// and returned in \p Sorted.
+///
+/// If those conditions do not hold, the output array is an arbitrary
+/// permutation of the input.
 void sortMemAccesses(ArrayRef<Value *> VL, const DataLayout &DL,
                      ScalarEvolution &SE, SmallVectorImpl<Value *> &Sorted);
 
diff --git a/lib/Analysis/LoopAccessAnalysis.cpp b/lib/Analysis/LoopAccessAnalysis.cpp
index e8f52eda3c4..4e076f5f004 100644
--- a/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/lib/Analysis/LoopAccessAnalysis.cpp
@@ -1058,30 +1058,47 @@ static unsigned getAddressSpaceOperand(Value *I) {
   return -1;
 }
 
-/// Saves the memory accesses after sorting it into vector argument 'Sorted'.
 void llvm::sortMemAccesses(ArrayRef<Value *> VL, const DataLayout &DL,
                            ScalarEvolution &SE,
                            SmallVectorImpl<Value *> &Sorted) {
-  SmallVector<std::pair<int, Value *>, 4> OffValPairs;
+  SmallVector<std::pair<int64_t, Value *>, 4> OffValPairs;
+  OffValPairs.reserve(VL.size());
+  Sorted.reserve(VL.size());
+
+  // Walk over the pointers, and map each of them to an offset relative to
+  // first pointer in the array.
+  Value *Ptr0 = getPointerOperand(VL[0]);
+  const SCEV *Scev0 = SE.getSCEV(Ptr0);
+  Value *Obj0 = GetUnderlyingObject(Ptr0, DL);
+
   for (auto *Val : VL) {
-    // Compute the constant offset from the base pointer of each memory accesses
-    // and insert into the vector of key,value pair which needs to be sorted.
     Value *Ptr = getPointerOperand(Val);
-    unsigned AS = getAddressSpaceOperand(Val);
-    unsigned PtrBitWidth = DL.getPointerSizeInBits(AS);
-    Type *Ty = cast<PointerType>(Ptr->getType())->getElementType();
-    APInt Size(PtrBitWidth, DL.getTypeStoreSize(Ty));
-
-    // FIXME: Currently the offsets are assumed to be constant.However this not
-    // always true as offsets can be variables also and we would need to
-    // consider the difference of the variable offsets.
-    APInt Offset(PtrBitWidth, 0);
-    Ptr->stripAndAccumulateInBoundsConstantOffsets(DL, Offset);
-    OffValPairs.push_back(std::make_pair(Offset.getSExtValue(), Val));
+
+    // If a pointer refers to a different underlying object, bail - the
+    // pointers are by definition incomparable.
+    Value *CurrObj = GetUnderlyingObject(Ptr, DL);
+    if (CurrObj != Obj0) {
+      Sorted.append(VL.begin(), VL.end());
+      return;
+    }
+
+    const SCEVConstant *Diff =
+        dyn_cast<SCEVConstant>(SE.getMinusSCEV(SE.getSCEV(Ptr), Scev0));
+
+    // The pointers may not have a constant offset from each other, or SCEV
+    // may just not be smart enough to figure out they do. Regardless,
+    // there's nothing we can do.
+    if (!Diff) {
+      Sorted.append(VL.begin(), VL.end());
+      return;
+    }
+
+    OffValPairs.emplace_back(Diff->getAPInt().getSExtValue(), Val);
   }
+
   std::sort(OffValPairs.begin(), OffValPairs.end(),
-            [](const std::pair<int, Value *> &Left,
-               const std::pair<int, Value *> &Right) {
+            [](const std::pair<int64_t, Value *> &Left,
+               const std::pair<int64_t, Value *> &Right) {
               return Left.first < Right.first;
             });
 
diff --git a/test/Transforms/SLPVectorizer/X86/jumbled-load.ll b/test/Transforms/SLPVectorizer/X86/jumbled-load.ll
index 1c70a291a86..571cb7302f9 100644
--- a/test/Transforms/SLPVectorizer/X86/jumbled-load.ll
+++ b/test/Transforms/SLPVectorizer/X86/jumbled-load.ll
@@ -1,18 +1,18 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -S -mtriple=x86_64-unknown -mattr=+avx -slp-threshold=-10 -slp-vectorizer | FileCheck %s
 
-
+@total = common global i32 0, align 4
 
 define i32 @jumbled-load(i32* noalias nocapture %in, i32* noalias nocapture %inn, i32* noalias nocapture %out) {
 ; CHECK-LABEL: @jumbled-load(
-; CHECK-NEXT: [[IN_ADDR:%.*]] = getelementptr inbounds i32, i32* %in, i64 0
+; CHECK-NEXT: [[IN_ADDR:%.*]] = getelementptr inbounds i32, i32* [[IN:%.*]], i64 0
 ; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 3
 ; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 1
 ; CHECK-NEXT: [[GEP_3:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 2
 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[IN_ADDR]] to <4 x i32>*
 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
 ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32>
-; CHECK-NEXT: [[INN_ADDR:%.*]] = getelementptr inbounds i32, i32* %inn, i64 0
+; CHECK-NEXT: [[INN_ADDR:%.*]] = getelementptr inbounds i32, i32* [[INN:%.*]], i64 0
 ; CHECK-NEXT: [[GEP_4:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 2
 ; CHECK-NEXT: [[GEP_5:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 3
 ; CHECK-NEXT: [[GEP_6:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 1
@@ -20,10 +20,10 @@ define i32 @jumbled-load(i32* noalias nocapture %in, i32* noalias nocapture %inn
 ; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
 ; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> undef, <4 x i32>
 ; CHECK-NEXT: [[TMP7:%.*]] = mul <4 x i32> [[TMP3]], [[TMP6]]
-; CHECK-NEXT: [[GEP_7:%.*]] = getelementptr inbounds i32, i32* %out, i64 0
-; CHECK-NEXT: [[GEP_8:%.*]] = getelementptr inbounds i32, i32* %out, i64 1
-; CHECK-NEXT: [[GEP_9:%.*]] = getelementptr inbounds i32, i32* %out, i64 2
-; CHECK-NEXT: [[GEP_10:%.*]] = getelementptr inbounds i32, i32* %out, i64 3
+; CHECK-NEXT: [[GEP_7:%.*]] = getelementptr inbounds i32, i32* [[OUT:%.*]], i64 0
+; CHECK-NEXT: [[GEP_8:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 1
+; CHECK-NEXT: [[GEP_9:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 2
+; CHECK-NEXT: [[GEP_10:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 3
; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[GEP_7]] to <4 x i32>*
; CHECK-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4
; CHECK-NEXT: ret i32 undef
@@ -59,3 
+59,116 @@ define i32 @jumbled-load(i32* noalias nocapture %in, i32* noalias nocapture %inn ret i32 undef } + +; Make sure we can sort loads even if they have non-constant offsets, as long as +; the offset *differences* are constant and computable by SCEV. +define void @scev(i64 %N, i32* nocapture readonly %b, i32* nocapture readonly %c) { +; CHECK-LABEL: @scev( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP_OUTER:%.*]] = icmp sgt i64 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP_OUTER]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[I_P:%.*]] = phi i64 [ [[ADD21:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[TMP0:%.*]] = phi <4 x i32> [ [[TMP14:%.*]], [[FOR_BODY]] ], [ zeroinitializer, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[I_P]] +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 [[I_P]] +; CHECK-NEXT: [[ADD3:%.*]] = or i64 [[I_P]], 1 +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[ADD3]] +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[ADD3]] +; CHECK-NEXT: [[ADD9:%.*]] = or i64 [[I_P]], 2 +; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[ADD9]] +; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[ADD9]] +; CHECK-NEXT: [[ADD15:%.*]] = or i64 [[I_P]], 3 +; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[ADD15]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[ARRAYIDX]] to <4 x i32>* +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[ARRAYIDX]] to <4 x i32>* +; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[ADD15]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[ARRAYIDX1]] to <4 x i32>* +; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP7]], align 4 +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32* [[ARRAYIDX1]] to <4 x i32>* +; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP10]], align 4 +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = add <4 x i32> [[TMP3]], [[TMP0]] +; CHECK-NEXT: [[TMP14]] = add <4 x i32> [[TMP13]], [[TMP12]] +; CHECK-NEXT: [[ADD21]] = add nuw nsw i64 [[I_P]], 4 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i64 [[ADD21]], [[N]] +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT:%.*]] +; CHECK: for.end.loopexit: +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end: +; CHECK-NEXT: [[TMP15:%.*]] = phi <4 x i32> [ zeroinitializer, [[ENTRY:%.*]] ], [ [[TMP14]], [[FOR_END_LOOPEXIT]] ] +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i32> [[TMP15]], i32 0 +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x i32> [[TMP15]], i32 1 +; CHECK-NEXT: [[ADD22:%.*]] = add nsw i32 [[TMP17]], [[TMP16]] +; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x i32> [[TMP15]], i32 2 +; CHECK-NEXT: [[ADD23:%.*]] = add nsw i32 [[ADD22]], [[TMP18]] +; CHECK-NEXT: 
[[TMP19:%.*]] = extractelement <4 x i32> [[TMP15]], i32 3 +; CHECK-NEXT: [[ADD24:%.*]] = add nsw i32 [[ADD23]], [[TMP19]] +; CHECK-NEXT: store i32 [[ADD24]], i32* @total, align 4 +; CHECK-NEXT: ret void +; +entry: + %cmp.outer = icmp sgt i64 %N, 0 + br i1 %cmp.outer, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %a4.p = phi i32 [ %add20, %for.body ], [ 0, %for.body.preheader ] + %a3.p = phi i32 [ %add2, %for.body ], [ 0, %for.body.preheader ] + %a2.p = phi i32 [ %add8, %for.body ], [ 0, %for.body.preheader ] + %a1.p = phi i32 [ %add14, %for.body ], [ 0, %for.body.preheader ] + %i.p = phi i64 [ %add21, %for.body ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds i32, i32* %b, i64 %i.p + %0 = load i32, i32* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds i32, i32* %c, i64 %i.p + %1 = load i32, i32* %arrayidx1, align 4 + %add = add i32 %0, %a3.p + %add2 = add i32 %add, %1 + %add3 = or i64 %i.p, 1 + %arrayidx4 = getelementptr inbounds i32, i32* %b, i64 %add3 + %2 = load i32, i32* %arrayidx4, align 4 + %arrayidx6 = getelementptr inbounds i32, i32* %c, i64 %add3 + %3 = load i32, i32* %arrayidx6, align 4 + %add7 = add i32 %2, %a2.p + %add8 = add i32 %add7, %3 + %add9 = or i64 %i.p, 2 + %arrayidx10 = getelementptr inbounds i32, i32* %b, i64 %add9 + %4 = load i32, i32* %arrayidx10, align 4 + %arrayidx12 = getelementptr inbounds i32, i32* %c, i64 %add9 + %5 = load i32, i32* %arrayidx12, align 4 + %add13 = add i32 %4, %a1.p + %add14 = add i32 %add13, %5 + %add15 = or i64 %i.p, 3 + %arrayidx16 = getelementptr inbounds i32, i32* %b, i64 %add15 + %6 = load i32, i32* %arrayidx16, align 4 + %arrayidx18 = getelementptr inbounds i32, i32* %c, i64 %add15 + %7 = load i32, i32* %arrayidx18, align 4 + %add19 = add i32 %6, %a4.p + %add20 = add i32 %add19, %7 + %add21 = add nuw nsw i64 %i.p, 4 + %cmp = icmp slt i64 %add21, %N + br i1 %cmp, label %for.body, label %for.end.loopexit + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + %a1.0.lcssa = phi i32 [ 0, %entry ], [ %add14, %for.end.loopexit ] + %a2.0.lcssa = phi i32 [ 0, %entry ], [ %add8, %for.end.loopexit ] + %a3.0.lcssa = phi i32 [ 0, %entry ], [ %add2, %for.end.loopexit ] + %a4.0.lcssa = phi i32 [ 0, %entry ], [ %add20, %for.end.loopexit ] + %add22 = add nsw i32 %a2.0.lcssa, %a1.0.lcssa + %add23 = add nsw i32 %add22, %a3.0.lcssa + %add24 = add nsw i32 %add23, %a4.0.lcssa + store i32 %add24, i32* @total, align 4 + ret void +}
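
The sorting strategy this patch implements can be summarized outside of LLVM as follows. Below is a minimal standalone C++ sketch, assuming a simplified model in which each access is a symbolic base plus a constant byte offset; the Access, constantDifference, and sortAccesses names are hypothetical stand-ins for the patch's GetUnderlyingObject check and its dyn_cast<SCEVConstant>(SE.getMinusSCEV(...)) query, not LLVM APIs. As in sortMemAccesses, every access is keyed by its constant offset from the first one, and the original order is returned whenever a difference is not a known constant.

// sort_accesses_sketch.cpp - standalone illustration, not part of the patch.
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <optional>
#include <string>
#include <utility>
#include <vector>

struct Access {
  std::string Base; // stand-in for GetUnderlyingObject(Ptr, DL)
  int64_t Offset;   // stand-in for what SCEV can prove about the pointer
};

// Stand-in for dyn_cast<SCEVConstant>(SE.getMinusSCEV(SE.getSCEV(Ptr), Scev0)):
// the difference A - B is a known constant only when both accesses share a base.
static std::optional<int64_t> constantDifference(const Access &A, const Access &B) {
  if (A.Base != B.Base)
    return std::nullopt;
  return A.Offset - B.Offset;
}

// Mirrors the shape of sortMemAccesses: key every access by its constant
// offset from the first one and sort by that key; if any difference is not
// a known constant, give up and return the input order unchanged.
static std::vector<Access> sortAccesses(const std::vector<Access> &VL) {
  std::vector<std::pair<int64_t, Access>> OffValPairs;
  OffValPairs.reserve(VL.size());
  for (const Access &A : VL) {
    std::optional<int64_t> Diff = constantDifference(A, VL.front());
    if (!Diff)
      return VL; // arbitrary (here: original) permutation of the input
    OffValPairs.emplace_back(*Diff, A);
  }
  std::sort(OffValPairs.begin(), OffValPairs.end(),
            [](const std::pair<int64_t, Access> &L,
               const std::pair<int64_t, Access> &R) { return L.first < R.first; });
  std::vector<Access> Sorted;
  Sorted.reserve(VL.size());
  for (const auto &P : OffValPairs)
    Sorted.push_back(P.second);
  return Sorted;
}

int main() {
  // Same jumbled shape as the first test: accesses at in+3, in+1, in+2, in+0.
  std::vector<Access> Jumbled = {{"in", 3}, {"in", 1}, {"in", 2}, {"in", 0}};
  for (const Access &A : sortAccesses(Jumbled))
    std::cout << A.Base << "+" << A.Offset << "\n"; // prints in+0 through in+3
  return 0;
}

For the jumbled pattern in the first test (offsets 3, 1, 2, 0 from a single base), this returns the accesses in 0, 1, 2, 3 order; accesses with different bases fall back to the input order, matching the "arbitrary permutation of the input" contract documented in LoopAccessAnalysis.h.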