From: Mohammad Shahid
Date: Sat, 28 Jan 2017 17:59:44 +0000 (+0000)
Subject: [SLP] Vectorize loads of consecutive memory accesses, accessed in non-consecutive (jumbled) way.
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=13e3a7f7a3f4b81f240f8117c2b7c6e418827193;p=llvm

[SLP] Vectorize loads of consecutive memory accesses, accessed in non-consecutive (jumbled) way.

The jumbled scalar loads are sorted by their constant offsets while the tree is
built, and the resulting tree entry is marked so that the vectorized load is
followed by a shufflevector with the mask required to restore the original
program order.

Reviewers: hfinkel, mssimpso, mkuper

Differential Revision: https://reviews.llvm.org/D26905

Change-Id: I9c0c8e6f91a00076a7ee1465440a3f6ae092f7ad
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@293386 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/include/llvm/Analysis/LoopAccessAnalysis.h b/include/llvm/Analysis/LoopAccessAnalysis.h
index 901b193c7e2..9eee08c101c 100644
--- a/include/llvm/Analysis/LoopAccessAnalysis.h
+++ b/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -690,6 +690,11 @@ int64_t getPtrStride(PredicatedScalarEvolution &PSE, Value *Ptr, const Loop *Lp,
                      const ValueToValueMap &StridesMap = ValueToValueMap(),
                      bool Assume = false, bool ShouldCheckWrap = true);
 
+/// \brief Saves the sorted memory accesses in vector argument 'Sorted' after
+/// sorting the jumbled memory accesses.
+void sortMemAccesses(ArrayRef<Value *> VL, const DataLayout &DL,
+                     ScalarEvolution &SE, SmallVectorImpl<Value *> &Sorted);
+
 /// \brief Returns true if the memory operations \p A and \p B are consecutive.
 /// This is a simple API that does not depend on the analysis pass.
 bool isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL,
diff --git a/lib/Analysis/LoopAccessAnalysis.cpp b/lib/Analysis/LoopAccessAnalysis.cpp
index bf800721309..e8f52eda3c4 100644
--- a/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/lib/Analysis/LoopAccessAnalysis.cpp
@@ -1058,6 +1058,37 @@ static unsigned getAddressSpaceOperand(Value *I) {
   return -1;
 }
 
+/// Saves the memory accesses after sorting them into the argument 'Sorted'.
+void llvm::sortMemAccesses(ArrayRef<Value *> VL, const DataLayout &DL,
+                           ScalarEvolution &SE,
+                           SmallVectorImpl<Value *> &Sorted) {
+  SmallVector<std::pair<int64_t, Value *>, 4> OffValPairs;
+  for (auto *Val : VL) {
+    // Compute the constant offset from the base pointer of each memory access
+    // and insert it into the vector of (offset, value) pairs to be sorted.
+    Value *Ptr = getPointerOperand(Val);
+    unsigned AS = getAddressSpaceOperand(Val);
+    unsigned PtrBitWidth = DL.getPointerSizeInBits(AS);
+    Type *Ty = cast<PointerType>(Ptr->getType())->getElementType();
+    APInt Size(PtrBitWidth, DL.getTypeStoreSize(Ty));
+
+    // FIXME: Currently the offsets are assumed to be constant. However, this
+    // is not always true, as offsets can also be variables, in which case we
+    // would need to consider the difference of the variable offsets.
+    APInt Offset(PtrBitWidth, 0);
+    Ptr->stripAndAccumulateInBoundsConstantOffsets(DL, Offset);
+    OffValPairs.push_back(std::make_pair(Offset.getSExtValue(), Val));
+  }
+  std::sort(OffValPairs.begin(), OffValPairs.end(),
+            [](const std::pair<int64_t, Value *> &Left,
+               const std::pair<int64_t, Value *> &Right) {
+              return Left.first < Right.first;
+            });
+
+  for (auto &it : OffValPairs)
+    Sorted.push_back(it.second);
+}
+
 /// Returns true if the memory operations \p A and \p B are consecutive.
 bool llvm::isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL,
                                ScalarEvolution &SE, bool CheckType) {
diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
index e27814526ce..eeaf29dceb5 100644
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -410,8 +410,10 @@ private:
   /// be vectorized to use the original vector (or aggregate "bitcast" to a vector).
   bool canReuseExtract(ArrayRef<Value *> VL, unsigned Opcode) const;
 
-  /// Vectorize a single entry in the tree.
-  Value *vectorizeTree(TreeEntry *E);
+  /// Vectorize a single entry in the tree. VL contains all isomorphic scalars
+  /// in the order of their use in the user program, for example ADD1, ADD2
+  /// and so on, or LOAD1, LOAD2 etc.
+  Value *vectorizeTree(ArrayRef<Value *> VL, TreeEntry *E);
 
   /// Vectorize a single entry in the tree, starting in \p VL.
   Value *vectorizeTree(ArrayRef<Value *> VL);
@@ -452,7 +454,7 @@ private:
                                       SmallVectorImpl<Value *> &Right);
   struct TreeEntry {
     TreeEntry() : Scalars(), VectorizedValue(nullptr),
-                  NeedToGather(0) {}
+                  NeedToGather(0), NeedToShuffle(0) {}
 
     /// \returns true if the scalars in VL are equal to this entry.
     bool isSame(ArrayRef<Value *> VL) const {
@@ -460,6 +462,15 @@ private:
       return std::equal(VL.begin(), VL.end(), Scalars.begin());
     }
 
+    /// \returns true if the scalars in VL are found in this tree entry.
+    bool isFoundJumbled(ArrayRef<Value *> VL, const DataLayout &DL,
+                        ScalarEvolution &SE) const {
+      assert(VL.size() == Scalars.size() && "Invalid size");
+      SmallVector List;
+      sortMemAccesses(VL, DL, SE, List);
+      return std::equal(List.begin(), List.end(), Scalars.begin());
+    }
+
     /// A vector of scalars.
     ValueList Scalars;
 
@@ -468,15 +479,20 @@ private:
     /// Do we need to gather this sequence ?
     bool NeedToGather;
+
+    /// Do we need to shuffle the load ?
+    bool NeedToShuffle;
   };
 
   /// Create a new VectorizableTree entry.
-  TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized) {
+  TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized,
+                          bool NeedToShuffle) {
     VectorizableTree.emplace_back();
     int idx = VectorizableTree.size() - 1;
     TreeEntry *Last = &VectorizableTree[idx];
     Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
     Last->NeedToGather = !Vectorized;
+    Last->NeedToShuffle = NeedToShuffle;
     if (Vectorized) {
       for (int i = 0, e = VL.size(); i != e; ++i) {
         assert(!ScalarToTreeEntry.count(VL[i]) && "Scalar already in tree!");
@@ -983,21 +999,21 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
 
   if (Depth == RecursionMaxDepth) {
     DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
-    newTreeEntry(VL, false);
+    newTreeEntry(VL, false, false);
     return;
   }
 
   // Don't handle vectors.
   if (VL[0]->getType()->isVectorTy()) {
     DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
-    newTreeEntry(VL, false);
+    newTreeEntry(VL, false, false);
     return;
   }
 
   if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
     if (SI->getValueOperand()->getType()->isVectorTy()) {
       DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
-      newTreeEntry(VL, false);
+      newTreeEntry(VL, false, false);
       return;
     }
   unsigned Opcode = getSameOpcode(VL);
@@ -1014,7 +1030,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
 
   // If all of the operands are identical or constant we have a simple solution.
   if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !Opcode) {
    DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O.
\n"); - newTreeEntry(VL, false); + newTreeEntry(VL, false, false); return; } @@ -1026,7 +1042,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { if (EphValues.count(VL[i])) { DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] << ") is ephemeral.\n"); - newTreeEntry(VL, false); + newTreeEntry(VL, false, false); return; } } @@ -1039,7 +1055,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { DEBUG(dbgs() << "SLP: \tChecking bundle: " << *VL[i] << ".\n"); if (E->Scalars[i] != VL[i]) { DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n"); - newTreeEntry(VL, false); + newTreeEntry(VL, false, false); return; } } @@ -1052,7 +1068,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { if (ScalarToTreeEntry.count(VL[i])) { DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] << ") is already in tree.\n"); - newTreeEntry(VL, false); + newTreeEntry(VL, false, false); return; } } @@ -1062,7 +1078,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { for (unsigned i = 0, e = VL.size(); i != e; ++i) { if (MustGather.count(VL[i])) { DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n"); - newTreeEntry(VL, false); + newTreeEntry(VL, false, false); return; } } @@ -1076,7 +1092,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { // Don't go into unreachable blocks. They may contain instructions with // dependency cycles which confuse the final scheduling. DEBUG(dbgs() << "SLP: bundle in unreachable block.\n"); - newTreeEntry(VL, false); + newTreeEntry(VL, false, false); return; } @@ -1085,7 +1101,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { for (unsigned j = i+1; j < e; ++j) if (VL[i] == VL[j]) { DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); - newTreeEntry(VL, false); + newTreeEntry(VL, false, false); return; } @@ -1100,7 +1116,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { assert((!BS.getScheduleData(VL[0]) || !BS.getScheduleData(VL[0])->isPartOfBundle()) && "tryScheduleBundle should cancelScheduling on failure"); - newTreeEntry(VL, false); + newTreeEntry(VL, false, false); return; } DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n"); @@ -1117,12 +1133,12 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { if (Term) { DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n"); BS.cancelScheduling(VL); - newTreeEntry(VL, false); + newTreeEntry(VL, false, false); return; } } - newTreeEntry(VL, true); + newTreeEntry(VL, true, false); DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n"); for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { @@ -1144,7 +1160,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { } else { BS.cancelScheduling(VL); } - newTreeEntry(VL, Reuse); + newTreeEntry(VL, Reuse, false); return; } case Instruction::Load: { @@ -1160,7 +1176,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy)) { BS.cancelScheduling(VL); - newTreeEntry(VL, false); + newTreeEntry(VL, false, false); DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n"); return; } @@ -1171,15 +1187,13 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { LoadInst *L = cast(VL[i]); if (!L->isSimple()) { BS.cancelScheduling(VL); - newTreeEntry(VL, false); + newTreeEntry(VL, false, false); DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n"); return; } } // Check if the loads are consecutive, reversed, or neither. 
- // TODO: What we really want is to sort the loads, but for now, check - // the two likely directions. bool Consecutive = true; bool ReverseConsecutive = true; for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) { @@ -1193,7 +1207,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { if (Consecutive) { ++NumLoadsWantToKeepOrder; - newTreeEntry(VL, true); + newTreeEntry(VL, true, false); DEBUG(dbgs() << "SLP: added a vector of loads.\n"); return; } @@ -1207,8 +1221,25 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { break; } + if (VL.size() > 2 && !ReverseConsecutive) { + bool ShuffledLoads = true; + SmallVector List; + sortMemAccesses(VL, *DL, *SE, List); + auto NewVL = makeArrayRef(List.begin(), List.end()); + for (unsigned i = 0, e = NewVL.size() - 1; i < e; ++i) { + if (!isConsecutiveAccess(NewVL[i], NewVL[i + 1], *DL, *SE)) { + ShuffledLoads = false; + break; + } + } + if (ShuffledLoads) { + newTreeEntry(NewVL, true, true); + return; + } + } + BS.cancelScheduling(VL); - newTreeEntry(VL, false); + newTreeEntry(VL, false, false); if (ReverseConsecutive) { ++NumLoadsWantToChangeOrder; @@ -1231,16 +1262,16 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { case Instruction::FPTrunc: case Instruction::BitCast: { Type *SrcTy = VL0->getOperand(0)->getType(); - for (unsigned i = 0; i < VL.size(); ++i) { - Type *Ty = cast(VL[i])->getOperand(0)->getType(); + for (Value *Val : VL) { + Type *Ty = cast(Val)->getOperand(0)->getType(); if (Ty != SrcTy || !isValidElementType(Ty)) { BS.cancelScheduling(VL); - newTreeEntry(VL, false); + newTreeEntry(VL, false, false); DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n"); return; } } - newTreeEntry(VL, true); + newTreeEntry(VL, true, false); DEBUG(dbgs() << "SLP: added a vector of casts.\n"); for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { @@ -1263,13 +1294,13 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { if (Cmp->getPredicate() != P0 || Cmp->getOperand(0)->getType() != ComparedTy) { BS.cancelScheduling(VL); - newTreeEntry(VL, false); + newTreeEntry(VL, false, false); DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n"); return; } } - newTreeEntry(VL, true); + newTreeEntry(VL, true, false); DEBUG(dbgs() << "SLP: added a vector of compares.\n"); for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { @@ -1301,7 +1332,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { case Instruction::And: case Instruction::Or: case Instruction::Xor: { - newTreeEntry(VL, true); + newTreeEntry(VL, true, false); DEBUG(dbgs() << "SLP: added a vector of bin op.\n"); // Sort operands of the instructions so that each side is more likely to @@ -1326,11 +1357,11 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { } case Instruction::GetElementPtr: { // We don't combine GEPs with complicated (nested) indexing. - for (unsigned j = 0; j < VL.size(); ++j) { - if (cast(VL[j])->getNumOperands() != 2) { + for (Value *Val : VL) { + if (cast(Val)->getNumOperands() != 2) { DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n"); BS.cancelScheduling(VL); - newTreeEntry(VL, false); + newTreeEntry(VL, false, false); return; } } @@ -1338,29 +1369,29 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { // We can't combine several GEPs into one vector if they operate on // different types. 
Type *Ty0 = cast(VL0)->getOperand(0)->getType(); - for (unsigned j = 0; j < VL.size(); ++j) { - Type *CurTy = cast(VL[j])->getOperand(0)->getType(); + for (Value *Val : VL) { + Type *CurTy = cast(Val)->getOperand(0)->getType(); if (Ty0 != CurTy) { DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n"); BS.cancelScheduling(VL); - newTreeEntry(VL, false); + newTreeEntry(VL, false, false); return; } } // We don't combine GEPs with non-constant indexes. - for (unsigned j = 0; j < VL.size(); ++j) { - auto Op = cast(VL[j])->getOperand(1); + for (Value *Val : VL) { + auto Op = cast(Val)->getOperand(1); if (!isa(Op)) { DEBUG( dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n"); BS.cancelScheduling(VL); - newTreeEntry(VL, false); + newTreeEntry(VL, false, false); return; } } - newTreeEntry(VL, true); + newTreeEntry(VL, true, false); DEBUG(dbgs() << "SLP: added a vector of GEPs.\n"); for (unsigned i = 0, e = 2; i < e; ++i) { ValueList Operands; @@ -1377,12 +1408,12 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) { BS.cancelScheduling(VL); - newTreeEntry(VL, false); + newTreeEntry(VL, false, false); DEBUG(dbgs() << "SLP: Non-consecutive store.\n"); return; } - newTreeEntry(VL, true); + newTreeEntry(VL, true, false); DEBUG(dbgs() << "SLP: added a vector of stores.\n"); ValueList Operands; @@ -1400,7 +1431,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); if (!isTriviallyVectorizable(ID)) { BS.cancelScheduling(VL); - newTreeEntry(VL, false); + newTreeEntry(VL, false, false); DEBUG(dbgs() << "SLP: Non-vectorizable call.\n"); return; } @@ -1414,7 +1445,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { getVectorIntrinsicIDForCall(CI2, TLI) != ID || !CI->hasIdenticalOperandBundleSchema(*CI2)) { BS.cancelScheduling(VL); - newTreeEntry(VL, false); + newTreeEntry(VL, false, false); DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i] << "\n"); return; @@ -1425,7 +1456,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { Value *A1J = CI2->getArgOperand(1); if (A1I != A1J) { BS.cancelScheduling(VL); - newTreeEntry(VL, false); + newTreeEntry(VL, false, false); DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI << " argument "<< A1I<<"!=" << A1J << "\n"); @@ -1438,14 +1469,14 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { CI->op_begin() + CI->getBundleOperandsEndIndex(), CI2->op_begin() + CI2->getBundleOperandsStartIndex())) { BS.cancelScheduling(VL); - newTreeEntry(VL, false); + newTreeEntry(VL, false, false); DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI << "!=" << *VL[i] << '\n'); return; } } - newTreeEntry(VL, true); + newTreeEntry(VL, true, false); for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) { ValueList Operands; // Prepare the operand vector. @@ -1462,11 +1493,11 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { // then do not vectorize this instruction. if (!isAltShuffle) { BS.cancelScheduling(VL); - newTreeEntry(VL, false); + newTreeEntry(VL, false, false); DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n"); return; } - newTreeEntry(VL, true); + newTreeEntry(VL, true, false); DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n"); // Reorder operands if reordering would enable vectorization. 
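As a quick illustration of the Load handling in the hunks above: the new path sorts the scalar loads by their constant offset from the common base and only treats the bundle as one wide load if the sorted accesses turn out to be consecutive. The following is a standalone sketch, not part of the patch; ScalarLoad, sortByOffset, isConsecutiveAfterSort and the hard-coded 4-byte element size are illustrative stand-ins for the SCEV/DataLayout-based queries used by sortMemAccesses and isConsecutiveAccess.

// Standalone model of the jumbled-load check added to buildTree_rec above.
// Offset stands in for the constant byte offset that sortMemAccesses derives
// via stripAndAccumulateInBoundsConstantOffsets; ElemSize stands in for the
// element store size that isConsecutiveAccess compares against.
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

struct ScalarLoad {
  const char *Name; // e.g. "load_1" in the jumbled-load.ll test below
  int64_t Offset;   // constant byte offset from the common base pointer
};

// Models llvm::sortMemAccesses: order the accesses by constant offset.
static std::vector<ScalarLoad> sortByOffset(std::vector<ScalarLoad> VL) {
  std::sort(VL.begin(), VL.end(),
            [](const ScalarLoad &L, const ScalarLoad &R) {
              return L.Offset < R.Offset;
            });
  return VL;
}

// Models the "are the sorted loads consecutive?" loop in the Load case.
static bool isConsecutiveAfterSort(const std::vector<ScalarLoad> &Sorted,
                                   int64_t ElemSize) {
  for (std::size_t I = 0; I + 1 < Sorted.size(); ++I)
    if (Sorted[I + 1].Offset - Sorted[I].Offset != ElemSize)
      return false;
  return true;
}

int main() {
  // Program order taken from the jumbled-load.ll test below: in[0], in[3],
  // in[1], in[2], i.e. byte offsets 0, 12, 4, 8 for 4-byte elements.
  std::vector<ScalarLoad> VL = {
      {"load_1", 0}, {"load_2", 12}, {"load_3", 4}, {"load_4", 8}};
  std::vector<ScalarLoad> Sorted = sortByOffset(VL);
  std::cout << "sorted order:";
  for (const ScalarLoad &L : Sorted)
    std::cout << ' ' << L.Name; // prints: load_1 load_3 load_4 load_2
  std::cout << "\nvectorizable as one wide load: "
            << (isConsecutiveAfterSort(Sorted, 4) ? "yes" : "no") << '\n';
}

After sorting, the accesses are consecutive, which is exactly the situation the new NeedToShuffle tree entry records; bundles that are still non-consecutive after sorting keep falling back to gathering as before.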
@@ -1490,7 +1521,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { } default: BS.cancelScheduling(VL); - newTreeEntry(VL, false); + newTreeEntry(VL, false, false); DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n"); return; } @@ -1723,6 +1754,10 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0); int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, alignment, 0); + if (E->NeedToShuffle) { + VecLdCost += TTI->getShuffleCost( + TargetTransformInfo::SK_PermuteSingleSrc, VecTy, 0); + } return VecLdCost - ScalarLdCost; } case Instruction::Store: { @@ -2285,8 +2320,8 @@ Value *BoUpSLP::vectorizeTree(ArrayRef VL) { if (ScalarToTreeEntry.count(VL[0])) { int Idx = ScalarToTreeEntry[VL[0]]; TreeEntry *E = &VectorizableTree[Idx]; - if (E->isSame(VL)) - return vectorizeTree(E); + if (E->isSame(VL) || (E->NeedToShuffle && E->isFoundJumbled(VL, *DL, *SE))) + return vectorizeTree(VL, E); } Type *ScalarTy = VL[0]->getType(); @@ -2297,10 +2332,10 @@ Value *BoUpSLP::vectorizeTree(ArrayRef VL) { return Gather(VL, VecTy); } -Value *BoUpSLP::vectorizeTree(TreeEntry *E) { +Value *BoUpSLP::vectorizeTree(ArrayRef VL, TreeEntry *E) { IRBuilder<>::InsertPointGuard Guard(Builder); - if (E->VectorizedValue) { + if (E->VectorizedValue && !E->NeedToShuffle) { DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n"); return E->VectorizedValue; } @@ -2534,7 +2569,35 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { LI->setAlignment(Alignment); E->VectorizedValue = LI; ++NumVectorInstructions; - return propagateMetadata(LI, E->Scalars); + propagateMetadata(LI, E->Scalars); + + // As program order of scalar loads are jumbled, the vectorized 'load' + // must be followed by a 'shuffle' with the required jumbled mask. + if (!VL.empty() && (E->NeedToShuffle)) { + assert(VL.size() == E->Scalars.size() && + "Equal number of scalars expected"); + SmallVector Mask; + for (Value *Val : VL) { + if (ScalarToTreeEntry.count(Val)) { + int Idx = ScalarToTreeEntry[Val]; + TreeEntry *E = &VectorizableTree[Idx]; + for (unsigned Lane = 0, LE = VL.size(); Lane != LE; ++Lane) { + if (E->Scalars[Lane] == Val) { + Mask.push_back(Builder.getInt32(Lane)); + break; + } + } + } + } + + // Generate shuffle for jumbled memory access + Value *Undef = UndefValue::get(VecTy); + Value *Shuf = Builder.CreateShuffleVector((Value *)LI, Undef, + ConstantVector::get(Mask)); + return Shuf; + } + + return LI; } case Instruction::Store: { StoreInst *SI = cast(VL0); @@ -2709,7 +2772,7 @@ Value *BoUpSLP::vectorizeTree() { } Builder.SetInsertPoint(&F->getEntryBlock().front()); - auto *VectorRoot = vectorizeTree(&VectorizableTree[0]); + auto *VectorRoot = vectorizeTree(ArrayRef(), &VectorizableTree[0]); // If the vectorized tree can be rewritten in a smaller type, we truncate the // vectorized root. InstCombine will then rewrite the entire expression. 
We diff --git a/test/Transforms/SLPVectorizer/X86/jumbled-load.ll b/test/Transforms/SLPVectorizer/X86/jumbled-load.ll index 06e051a90b0..1c70a291a86 100644 --- a/test/Transforms/SLPVectorizer/X86/jumbled-load.ll +++ b/test/Transforms/SLPVectorizer/X86/jumbled-load.ll @@ -1,38 +1,31 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -S -mtriple=x86_64-unknown -mattr=+avx -slp-vectorizer | FileCheck %s +; RUN: opt < %s -S -mtriple=x86_64-unknown -mattr=+avx -slp-threshold=-10 -slp-vectorizer | FileCheck %s define i32 @jumbled-load(i32* noalias nocapture %in, i32* noalias nocapture %inn, i32* noalias nocapture %out) { ; CHECK-LABEL: @jumbled-load( ; CHECK-NEXT: [[IN_ADDR:%.*]] = getelementptr inbounds i32, i32* %in, i64 0 -; CHECK-NEXT: [[LOAD_1:%.*]] = load i32, i32* [[IN_ADDR]], align 4 ; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 3 -; CHECK-NEXT: [[LOAD_2:%.*]] = load i32, i32* [[GEP_1]], align 4 ; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 1 -; CHECK-NEXT: [[LOAD_3:%.*]] = load i32, i32* [[GEP_2]], align 4 ; CHECK-NEXT: [[GEP_3:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 2 -; CHECK-NEXT: [[LOAD_4:%.*]] = load i32, i32* [[GEP_3]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[IN_ADDR]] to <4 x i32>* +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> ; CHECK-NEXT: [[INN_ADDR:%.*]] = getelementptr inbounds i32, i32* %inn, i64 0 -; CHECK-NEXT: [[LOAD_5:%.*]] = load i32, i32* [[INN_ADDR]], align 4 ; CHECK-NEXT: [[GEP_4:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 2 -; CHECK-NEXT: [[LOAD_6:%.*]] = load i32, i32* [[GEP_4]], align 4 ; CHECK-NEXT: [[GEP_5:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 3 -; CHECK-NEXT: [[LOAD_7:%.*]] = load i32, i32* [[GEP_5]], align 4 ; CHECK-NEXT: [[GEP_6:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 1 -; CHECK-NEXT: [[LOAD_8:%.*]] = load i32, i32* [[GEP_6]], align 4 -; CHECK-NEXT: [[MUL_1:%.*]] = mul i32 [[LOAD_3]], [[LOAD_5]] -; CHECK-NEXT: [[MUL_2:%.*]] = mul i32 [[LOAD_2]], [[LOAD_8]] -; CHECK-NEXT: [[MUL_3:%.*]] = mul i32 [[LOAD_4]], [[LOAD_7]] -; CHECK-NEXT: [[MUL_4:%.*]] = mul i32 [[LOAD_1]], [[LOAD_6]] +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[INN_ADDR]] to <4 x i32>* +; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = mul <4 x i32> [[TMP3]], [[TMP6]] ; CHECK-NEXT: [[GEP_7:%.*]] = getelementptr inbounds i32, i32* %out, i64 0 -; CHECK-NEXT: store i32 [[MUL_1]], i32* [[GEP_7]], align 4 ; CHECK-NEXT: [[GEP_8:%.*]] = getelementptr inbounds i32, i32* %out, i64 1 -; CHECK-NEXT: store i32 [[MUL_2]], i32* [[GEP_8]], align 4 ; CHECK-NEXT: [[GEP_9:%.*]] = getelementptr inbounds i32, i32* %out, i64 2 -; CHECK-NEXT: store i32 [[MUL_3]], i32* [[GEP_9]], align 4 ; CHECK-NEXT: [[GEP_10:%.*]] = getelementptr inbounds i32, i32* %out, i64 3 -; CHECK-NEXT: store i32 [[MUL_4]], i32* [[GEP_10]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[GEP_7]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4 ; CHECK-NEXT: ret i32 undef ; %in.addr = getelementptr inbounds i32, i32* %in, i64 0 diff --git a/test/Transforms/SLPVectorizer/X86/reduction_loads.ll b/test/Transforms/SLPVectorizer/X86/reduction_loads.ll index 
76b3b9174a5..1ffbe0d87bf 100644 --- a/test/Transforms/SLPVectorizer/X86/reduction_loads.ll +++ b/test/Transforms/SLPVectorizer/X86/reduction_loads.ll @@ -31,8 +31,8 @@ define i32 @test(i32* nocapture readonly %p) { ; CHECK-NEXT: [[BIN_RDX2:%.*]] = add <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]] ; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> ; CHECK-NEXT: [[BIN_RDX4:%.*]] = add <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0 -; CHECK-NEXT: [[ADD_7:%.*]] = add i32 [[TMP3]], [[SUM]] +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0 +; CHECK-NEXT: [[ADD_7:%.*]] = add i32 [[TMP4]], [[SUM]] ; CHECK-NEXT: br i1 true, label %for.end, label %for.body ; CHECK: for.end: ; CHECK-NEXT: ret i32 [[ADD_7]] diff --git a/test/Transforms/SLPVectorizer/X86/store-jumbled.ll b/test/Transforms/SLPVectorizer/X86/store-jumbled.ll index 1b2c76384e0..3f772c70145 100644 --- a/test/Transforms/SLPVectorizer/X86/store-jumbled.ll +++ b/test/Transforms/SLPVectorizer/X86/store-jumbled.ll @@ -1,38 +1,31 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -S -mtriple=x86_64-unknown -mattr=+avx -slp-vectorizer | FileCheck %s +; RUN: opt < %s -S -mtriple=x86_64-unknown -mattr=+avx -slp-threshold=-10 -slp-vectorizer | FileCheck %s define i32 @jumbled-load(i32* noalias nocapture %in, i32* noalias nocapture %inn, i32* noalias nocapture %out) { ; CHECK-LABEL: @jumbled-load( ; CHECK-NEXT: [[IN_ADDR:%.*]] = getelementptr inbounds i32, i32* [[IN:%.*]], i64 0 -; CHECK-NEXT: [[LOAD_1:%.*]] = load i32, i32* [[IN_ADDR]], align 4 ; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 1 -; CHECK-NEXT: [[LOAD_2:%.*]] = load i32, i32* [[GEP_1]], align 4 ; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 2 -; CHECK-NEXT: [[LOAD_3:%.*]] = load i32, i32* [[GEP_2]], align 4 ; CHECK-NEXT: [[GEP_3:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 3 -; CHECK-NEXT: [[LOAD_4:%.*]] = load i32, i32* [[GEP_3]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[IN_ADDR]] to <4 x i32>* +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> ; CHECK-NEXT: [[INN_ADDR:%.*]] = getelementptr inbounds i32, i32* [[INN:%.*]], i64 0 -; CHECK-NEXT: [[LOAD_5:%.*]] = load i32, i32* [[INN_ADDR]], align 4 ; CHECK-NEXT: [[GEP_4:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 1 -; CHECK-NEXT: [[LOAD_6:%.*]] = load i32, i32* [[GEP_4]], align 4 ; CHECK-NEXT: [[GEP_5:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 2 -; CHECK-NEXT: [[LOAD_7:%.*]] = load i32, i32* [[GEP_5]], align 4 ; CHECK-NEXT: [[GEP_6:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 3 -; CHECK-NEXT: [[LOAD_8:%.*]] = load i32, i32* [[GEP_6]], align 4 -; CHECK-NEXT: [[MUL_1:%.*]] = mul i32 [[LOAD_1]], [[LOAD_5]] -; CHECK-NEXT: [[MUL_2:%.*]] = mul i32 [[LOAD_2]], [[LOAD_6]] -; CHECK-NEXT: [[MUL_3:%.*]] = mul i32 [[LOAD_3]], [[LOAD_7]] -; CHECK-NEXT: [[MUL_4:%.*]] = mul i32 [[LOAD_4]], [[LOAD_8]] +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[INN_ADDR]] to <4 x i32>* +; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = mul <4 x i32> [[TMP3]], [[TMP6]] ; CHECK-NEXT: [[GEP_7:%.*]] = getelementptr 
inbounds i32, i32* [[OUT:%.*]], i64 0 ; CHECK-NEXT: [[GEP_8:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 1 ; CHECK-NEXT: [[GEP_9:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 2 ; CHECK-NEXT: [[GEP_10:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 3 -; CHECK-NEXT: store i32 [[MUL_1]], i32* [[GEP_9]], align 4 -; CHECK-NEXT: store i32 [[MUL_2]], i32* [[GEP_7]], align 4 -; CHECK-NEXT: store i32 [[MUL_3]], i32* [[GEP_10]], align 4 -; CHECK-NEXT: store i32 [[MUL_4]], i32* [[GEP_8]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[GEP_7]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4 ; CHECK-NEXT: ret i32 undef ; %in.addr = getelementptr inbounds i32, i32* %in, i64 0
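The store-jumbled.ll body is truncated in this dump, but together with jumbled-load.ll above its CHECK lines show the intended codegen: one wide load of the sorted lanes followed by a shufflevector that restores program order. The sketch below is standalone code, not LLVM; buildShuffleMask and the offset vectors are illustrative assumptions. It models the mask construction added to BoUpSLP::vectorizeTree, where each scalar in user order is looked up in the sorted tree entry and its lane index becomes the next mask element.

// Standalone model of the shuffle-mask construction added to
// BoUpSLP::vectorizeTree: Mask[i] is the lane of the sorted (vectorized) load
// that holds the scalar appearing i-th in program order, so
// shufflevector(wide_load, undef, Mask) reproduces the original order.
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

static std::vector<int> buildShuffleMask(const std::vector<int64_t> &UserOrder,
                                         const std::vector<int64_t> &SortedOrder) {
  std::vector<int> Mask;
  for (int64_t Off : UserOrder)
    for (std::size_t Lane = 0; Lane < SortedOrder.size(); ++Lane)
      if (SortedOrder[Lane] == Off) {
        Mask.push_back(static_cast<int>(Lane));
        break;
      }
  return Mask;
}

int main() {
  // Element offsets in program order for the jumbled-load.ll input: 0, 3, 1, 2.
  std::vector<int64_t> UserOrder = {0, 3, 1, 2};
  // The tree entry keeps the sorted order 0, 1, 2, 3 (the wide load's lanes).
  std::vector<int64_t> SortedOrder = {0, 1, 2, 3};
  for (int Lane : buildShuffleMask(UserOrder, SortedOrder))
    std::cout << Lane << ' '; // prints: 0 3 1 2
  std::cout << '\n';
}

This reordering is also why getEntryCost above adds a TargetTransformInfo::SK_PermuteSingleSrc shuffle to VecLdCost for NeedToShuffle entries: the extra shuffle is not free, and the patch only takes this path when the sorted accesses form a single consecutive load.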