//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Vectorize/LoopVectorize.h"
+#include "VPlan.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Vectorize.h"
#include <algorithm>
+#include <functional>
#include <map>
#include <tuple>
class LoopVectorizationLegality;
class LoopVectorizationCostModel;
class LoopVectorizationRequirements;
+class VPInterleaveRecipe;
+class VPReplicateRecipe;
+class VPWidenIntOrFpInductionRecipe;
+class VPWidenRecipe;
/// Returns true if the given loop body has a cycle, excluding the loop
/// itself.
: ConstantFP::get(Ty, C);
}
+} // end anonymous namespace
+
+namespace llvm {
/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
AddedSafetyChecks(false) {}
/// Create a new empty loop. Unlink the old loop and connect the new one.
- void createVectorizedLoopSkeleton();
+ /// Return the pre-header block of the new loop.
+ BasicBlock *createVectorizedLoopSkeleton();
- /// Vectorize a single instruction within the innermost loop.
- void vectorizeInstruction(Instruction &I);
+ /// Widen a single instruction within the innermost loop.
+ void widenInstruction(Instruction &I);
/// Fix the vectorized code, taking care of header phi's, live-outs, and more.
void fixVectorizedLoop();
virtual ~InnerLoopVectorizer() {}
-protected:
- /// A small list of PHINodes.
- typedef SmallVector<PHINode *, 4> PhiVector;
-
/// A type for vectorized values in the new loop. Each value from the
/// original loop, when vectorized, is represented by UF vector values in the
/// new unrolled loop, where UF is the unroll factor.
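+  /// For example, with VF = 4 and UF = 2, an original i32 value is
+  /// represented by two <4 x i32> values, one per unroll part.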
typedef SmallVector<Value *, 2> VectorParts;
+ /// A helper function that computes the predicate of the block BB, assuming
+ /// that the header block of the loop is set to True. It returns the *entry*
+ /// mask for the block BB.
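+  /// For example, for a block with a single predecessor guarded by branch
+  /// condition %c, the entry mask is the predecessor's mask ANDed elementwise
+  /// with the vectorized value of %c (or its negation for the false edge).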
+ VectorParts createBlockInMask(BasicBlock *BB);
+
+  /// Widen a single PHINode in a block. This method handles the induction
+ /// variable canonicalization. It supports both VF = 1 for unrolled loops and
+ /// arbitrary length vectors.
+ void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);
+
+  /// A helper function to scalarize a single Instruction in the innermost
+  /// loop. Generates the scalar instance of \p Instr for the unroll part and
+  /// vector lane given by \p Instance. If \p IfPredicateInstr is set, the
+  /// generated instance is recorded as a predicated instruction.
+ void scalarizeInstruction(Instruction *Instr, const VPIteration &Instance,
+ bool IfPredicateInstr);
+
+ /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
+ /// is provided, the integer induction variable will first be truncated to
+ /// the corresponding type.
+ void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);
+
+ /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
+ /// vector or scalar value on-demand if one is not yet available. When
+ /// vectorizing a loop, we visit the definition of an instruction before its
+ /// uses. When visiting the definition, we either vectorize or scalarize the
+ /// instruction, creating an entry for it in the corresponding map. (In some
+ /// cases, such as induction variables, we will create both vector and scalar
+ /// entries.) Then, as we encounter uses of the definition, we derive values
+ /// for each scalar or vector use unless such a value is already available.
+ /// For example, if we scalarize a definition and one of its uses is vector,
+ /// we build the required vector on-demand with an insertelement sequence
+ /// when visiting the use. Otherwise, if the use is scalar, we can use the
+ /// existing scalar definition.
+ ///
+ /// Return a value in the new loop corresponding to \p V from the original
+ /// loop at unroll index \p Part. If the value has already been vectorized,
+ /// the corresponding vector entry in VectorLoopValueMap is returned. If,
+ /// however, the value has a scalar entry in VectorLoopValueMap, we construct
+ /// a new vector value on-demand by inserting the scalar values into a vector
+ /// with an insertelement sequence. If the value has been neither vectorized
+ /// nor scalarized, it must be loop invariant, so we simply broadcast the
+ /// value into a vector.
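+  ///
+  /// For example, assuming VF = 4, a scalarized definition %x with a vector
+  /// use is packed on demand, conceptually (lane value names are
+  /// illustrative):
+  ///   %v0 = insertelement <4 x i32> undef, i32 %x.0, i32 0
+  ///   %v1 = insertelement <4 x i32> %v0, i32 %x.1, i32 1
+  ///   %v2 = insertelement <4 x i32> %v1, i32 %x.2, i32 2
+  ///   %v3 = insertelement <4 x i32> %v2, i32 %x.3, i32 3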
+ Value *getOrCreateVectorValue(Value *V, unsigned Part);
+
+ /// Return a value in the new loop corresponding to \p V from the original
+ /// loop at unroll and vector indices \p Instance. If the value has been
+ /// vectorized but not scalarized, the necessary extractelement instruction
+ /// will be generated.
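+  /// For example, assuming VF = 4, requesting {Part 0, Lane 2} of a value
+  /// vectorized as %wide yields, conceptually:
+  ///   %s = extractelement <4 x i32> %wide, i32 2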
+ Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);
+
+ /// Construct the vector value of a scalarized value \p V one lane at a time.
+ void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);
+
+ /// Try to vectorize the interleaved access group that \p Instr belongs to.
+ void vectorizeInterleaveGroup(Instruction *Instr);
+
+protected:
+ /// A small list of PHINodes.
+ typedef SmallVector<PHINode *, 4> PhiVector;
+
/// A type for scalarized values in the new loop. Each value from the
/// original loop, when scalarized, is represented by UF x VF scalar values
/// in the new unrolled loop, where UF is the unroll factor and VF is the
/// the block that was created for it.
void sinkScalarOperands(Instruction *PredInst);
- /// Predicate conditional instructions that require predication on their
- /// respective conditions.
- void predicateInstructions();
-
/// Shrinks vector element sizes to the smallest bitwidth they can be legally
/// represented as.
void truncateToMinimalBitwidths();
- /// A helper function that computes the predicate of the block BB, assuming
- /// that the header block of the loop is set to True. It returns the *entry*
- /// mask for the block BB.
- VectorParts createBlockInMask(BasicBlock *BB);
/// A helper function that computes the predicate of the edge between SRC
/// and DST.
VectorParts createEdgeMask(BasicBlock *Src, BasicBlock *Dst);
- /// Vectorize a single PHINode in a block. This method handles the induction
- /// variable canonicalization. It supports both VF = 1 for unrolled loops and
- /// arbitrary length vectors.
- void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);
-
/// Insert the new loop to the loop hierarchy and pass manager
/// and update the analysis passes.
void updateAnalysis();
- /// This instruction is un-vectorizable. Implement it as a sequence
- /// of scalars. If \p IfPredicateInstr is true we need to 'hide' each
- /// scalarized instruction behind an if block predicated on the control
- /// dependence of the instruction.
- void scalarizeInstruction(Instruction *Instr, bool IfPredicateInstr = false);
-
/// Vectorize Load and Store instructions.
virtual void vectorizeMemoryInstruction(Instruction *Instr);
void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
Value *Step, Instruction *EntryVal);
- /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
- /// is provided, the integer induction variable will first be truncated to
- /// the corresponding type.
- void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);
-
/// Returns true if an instruction \p I should be scalarized instead of
/// vectorized for the chosen vectorization factor.
bool shouldScalarizeInstruction(Instruction *I) const;
/// Returns true if we should generate a scalar version of \p IV.
bool needsScalarInduction(Instruction *IV) const;
- /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
- /// vector or scalar value on-demand if one is not yet available. When
- /// vectorizing a loop, we visit the definition of an instruction before its
- /// uses. When visiting the definition, we either vectorize or scalarize the
- /// instruction, creating an entry for it in the corresponding map. (In some
- /// cases, such as induction variables, we will create both vector and scalar
- /// entries.) Then, as we encounter uses of the definition, we derive values
- /// for each scalar or vector use unless such a value is already available.
- /// For example, if we scalarize a definition and one of its uses is vector,
- /// we build the required vector on-demand with an insertelement sequence
- /// when visiting the use. Otherwise, if the use is scalar, we can use the
- /// existing scalar definition.
- ///
- /// Return a value in the new loop corresponding to \p V from the original
- /// loop at unroll index \p Part. If the value has already been vectorized,
- /// the corresponding vector entry in VectorLoopValueMap is returned. If,
- /// however, the value has a scalar entry in VectorLoopValueMap, we construct
- /// a new vector value on-demand by inserting the scalar values into a vector
- /// with an insertelement sequence. If the value has been neither vectorized
- /// nor scalarized, it must be loop invariant, so we simply broadcast the
- /// value into a vector.
- Value *getOrCreateVectorValue(Value *V, unsigned Part);
-
- /// Return a value in the new loop corresponding to \p V from the original
- /// loop at unroll index \p Part and vector index \p Lane. If the value has
- /// been vectorized but not scalarized, the necessary extractelement
- /// instruction will be generated.
- Value *getOrCreateScalarValue(Value *V, unsigned Part, unsigned Lane);
-
- /// Try to vectorize the interleaved access group that \p Instr belongs to.
- void vectorizeInterleaveGroup(Instruction *Instr);
-
/// Generate a shuffle sequence that will reverse the vector Vec.
virtual Value *reverseVector(Value *Vec);
/// the instruction.
void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
- /// This is a helper class for maintaining vectorization state. It's used for
- /// mapping values from the original loop to their corresponding values in
- /// the new loop. Two mappings are maintained: one for vectorized values and
- /// one for scalarized values. Vectorized values are represented with UF
- /// vector values in the new loop, and scalarized values are represented with
- /// UF x VF scalar values in the new loop. UF and VF are the unroll and
- /// vectorization factors, respectively.
- ///
- /// Entries can be added to either map with setVectorValue and setScalarValue,
- /// which assert that an entry was not already added before. If an entry is to
- /// replace an existing one, call resetVectorValue. This is currently needed
- /// to modify the mapped values during "fix-up" operations that occur once the
- /// first phase of widening is complete. These operations include type
- /// truncation and the second phase of recurrence widening.
- ///
- /// Entries from either map can be retrieved using the getVectorValue and
- /// getScalarValue functions, which assert that the desired value exists.
-
- struct ValueMap {
-
- /// Construct an empty map with the given unroll and vectorization factors.
- ValueMap(unsigned UF, unsigned VF) : UF(UF), VF(VF) {}
-
- /// \return True if the map has any vector entry for \p Key.
- bool hasAnyVectorValue(Value *Key) const {
- return VectorMapStorage.count(Key);
- }
-
- /// \return True if the map has a vector entry for \p Key and \p Part.
- bool hasVectorValue(Value *Key, unsigned Part) const {
- assert(Part < UF && "Queried Vector Part is too large.");
- if (!hasAnyVectorValue(Key))
- return false;
- const VectorParts &Entry = VectorMapStorage.find(Key)->second;
- assert(Entry.size() == UF && "VectorParts has wrong dimensions.");
- return Entry[Part] != nullptr;
- }
-
- /// \return True if the map has any scalar entry for \p Key.
- bool hasAnyScalarValue(Value *Key) const {
- return ScalarMapStorage.count(Key);
- }
-
- /// \return True if the map has a scalar entry for \p Key, \p Part and
- /// \p Part.
- bool hasScalarValue(Value *Key, unsigned Part, unsigned Lane) const {
- assert(Part < UF && "Queried Scalar Part is too large.");
- assert(Lane < VF && "Queried Scalar Lane is too large.");
- if (!hasAnyScalarValue(Key))
- return false;
- const ScalarParts &Entry = ScalarMapStorage.find(Key)->second;
- assert(Entry.size() == UF && "ScalarParts has wrong dimensions.");
- assert(Entry[Part].size() == VF && "ScalarParts has wrong dimensions.");
- return Entry[Part][Lane] != nullptr;
- }
-
- /// Retrieve the existing vector value that corresponds to \p Key and
- /// \p Part.
- Value *getVectorValue(Value *Key, unsigned Part) {
- assert(hasVectorValue(Key, Part) && "Getting non-existent value.");
- return VectorMapStorage[Key][Part];
- }
-
- /// Retrieve the existing scalar value that corresponds to \p Key, \p Part
- /// and \p Lane.
- Value *getScalarValue(Value *Key, unsigned Part, unsigned Lane) {
- assert(hasScalarValue(Key, Part, Lane) && "Getting non-existent value.");
- return ScalarMapStorage[Key][Part][Lane];
- }
-
- /// Set a vector value associated with \p Key and \p Part. Assumes such a
- /// value is not already set. If it is, use resetVectorValue() instead.
- void setVectorValue(Value *Key, unsigned Part, Value *Vector) {
- assert(!hasVectorValue(Key, Part) && "Vector value already set for part");
- if (!VectorMapStorage.count(Key)) {
- VectorParts Entry(UF);
- VectorMapStorage[Key] = Entry;
- }
- VectorMapStorage[Key][Part] = Vector;
- }
-
- /// Set a scalar value associated with \p Key for \p Part and \p Lane.
- /// Assumes such a value is not already set.
- void setScalarValue(Value *Key, unsigned Part, unsigned Lane,
- Value *Scalar) {
- assert(!hasScalarValue(Key, Part, Lane) && "Scalar value already set");
- if (!ScalarMapStorage.count(Key)) {
- ScalarParts Entry(UF);
- for (unsigned Part = 0; Part < UF; ++Part)
- Entry[Part].resize(VF, nullptr);
- // TODO: Consider storing uniform values only per-part, as they occupy
- // lane 0 only, keeping the other VF-1 redundant entries null.
- ScalarMapStorage[Key] = Entry;
- }
- ScalarMapStorage[Key][Part][Lane] = Scalar;
- }
-
- /// Reset the vector value associated with \p Key for the given \p Part.
- /// This function can be used to update values that have already been
- /// vectorized. This is the case for "fix-up" operations including type
- /// truncation and the second phase of recurrence vectorization.
- void resetVectorValue(Value *Key, unsigned Part, Value *Vector) {
- assert(hasVectorValue(Key, Part) && "Vector value not set for part");
- VectorMapStorage[Key][Part] = Vector;
- }
-
- private:
- /// The unroll factor. Each entry in the vector map contains UF vector
- /// values.
- unsigned UF;
-
- /// The vectorization factor. Each entry in the scalar map contains UF x VF
- /// scalar values.
- unsigned VF;
-
- /// The vector and scalar map storage. We use std::map and not DenseMap
- /// because insertions to DenseMap invalidate its iterators.
- std::map<Value *, VectorParts> VectorMapStorage;
- std::map<Value *, ScalarParts> ScalarMapStorage;
- };
-
/// The original loop.
Loop *OrigLoop;
/// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
/// vector elements.
unsigned VF;
-protected:
/// The vectorization unroll factor to use. Each scalar is vectorized to this
/// many different vector instructions.
unsigned UF;
/// vectorized loop. A key value can map to either vector values, scalar
/// values or both kinds of values, depending on whether the key was
/// vectorized and scalarized.
- ValueMap VectorLoopValueMap;
+ VectorizerValueMap VectorLoopValueMap;
- /// Store instructions that should be predicated, as a pair
- /// <StoreInst, Predicate>
- SmallVector<std::pair<Instruction *, Value *>, 4> PredicatedInstructions;
+  /// Holds the instructions that were predicated.
+ SmallVector<Instruction *, 4> PredicatedInstructions;
EdgeMaskCacheTy EdgeMaskCache;
BlockMaskCacheTy BlockMaskCache;
/// Trip count of the original loop.
// Holds the end values for each induction variable. We save the end values
// so we can later fix-up the external users of the induction variables.
DenseMap<PHINode *, Value *> IVEndValues;
+
+ friend class LoopVectorizationPlanner;
};
class InnerLoopUnroller : public InnerLoopVectorizer {
UnrollFactor, LVL, CM) {}
private:
- void vectorizeMemoryInstruction(Instruction *Instr) override;
Value *getBroadcastInstrs(Value *V) override;
Value *getStepVector(Value *Val, int StartIdx, Value *Step,
Instruction::BinaryOps Opcode =
}
}
+} // namespace llvm
+
+namespace {
+
/// \brief The group of interleaved loads/stores sharing the same stride and
/// close to each other.
///
/// \returns True if it is more profitable to scalarize instruction \p I for
/// vectorization factor \p VF.
bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
+    assert(VF > 1 && "Scalarization profitability is relevant only for VF > 1.");
auto Scalars = InstsToScalarize.find(VF);
assert(Scalars != InstsToScalarize.end() &&
"VF not yet analyzed for scalarization profitability");
return Legal->isInductionVariable(Op);
}
+ /// Collects the instructions to scalarize for each predicated instruction in
+ /// the loop.
+ void collectInstsToScalarize(unsigned VF);
+
+ /// Collect Uniform and Scalar values for the given \p VF.
+  /// The sets depend on the CM decision for Load/Store instructions that may
+  /// be vectorized as interleaved, gather-scatter, or scalarized accesses.
+ void collectUniformsAndScalars(unsigned VF) {
+ // Do the analysis once.
+ if (VF == 1 || Uniforms.count(VF))
+ return;
+ setCostBasedWideningDecision(VF);
+ collectLoopUniforms(VF);
+ collectLoopScalars(VF);
+ }
+
private:
/// \return An upper bound for the vectorization factor, larger than zero.
/// One is returned if vectorization should best be avoided due to cost.
int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
unsigned VF);
- /// Collects the instructions to scalarize for each predicated instruction in
- /// the loop.
- void collectInstsToScalarize(unsigned VF);
-
/// Collect the instructions that are uniform after vectorization. An
/// instruction is uniform if we represent it with a single scalar value in
/// the vectorized loop corresponding to each vector iteration. Examples of
/// iteration of the original scalar loop.
void collectLoopScalars(unsigned VF);
- /// Collect Uniform and Scalar values for the given \p VF.
- /// The sets depend on CM decision for Load/Store instructions
- /// that may be vectorized as interleave, gather-scatter or scalarized.
- void collectUniformsAndScalars(unsigned VF) {
- // Do the analysis once.
- if (VF == 1 || Uniforms.count(VF))
- return;
- setCostBasedWideningDecision(VF);
- collectLoopUniforms(VF);
- collectLoopScalars(VF);
- }
-
/// Keeps cost model vectorization decision and cost for instructions.
/// Right now it is used for memory instructions only.
typedef DenseMap<std::pair<Instruction *, unsigned>,
SmallPtrSet<const Value *, 16> VecValuesToIgnore;
};
+} // end anonymous namespace
+
+namespace llvm {
/// LoopVectorizationPlanner - drives the vectorization process after having
/// passed Legality checks.
+/// The planner builds and optimizes the Vectorization Plans, which record the
+/// decisions on how to vectorize the given loop. In particular, they represent
+/// the control flow of the vectorized version, the replication of instructions
+/// that are to be scalarized, and the interleaved access groups.
class LoopVectorizationPlanner {
+ /// The loop that we evaluate.
+ Loop *OrigLoop;
+
+ /// Loop Info analysis.
+ LoopInfo *LI;
+
+ /// Target Library Info.
+ const TargetLibraryInfo *TLI;
+
+ /// Target Transform Info.
+ const TargetTransformInfo *TTI;
+
+ /// The legality analysis.
+ LoopVectorizationLegality *Legal;
+
+  /// The profitability analysis.
+ LoopVectorizationCostModel &CM;
+
+ SmallVector<VPlan *, 4> VPlans;
+
+ unsigned BestVF;
+ unsigned BestUF;
+
public:
- LoopVectorizationPlanner(Loop *OrigLoop, LoopInfo *LI,
+ LoopVectorizationPlanner(Loop *L, LoopInfo *LI, const TargetLibraryInfo *TLI,
+ const TargetTransformInfo *TTI,
LoopVectorizationLegality *Legal,
LoopVectorizationCostModel &CM)
- : OrigLoop(OrigLoop), LI(LI), Legal(Legal), CM(CM) {}
-
- ~LoopVectorizationPlanner() {}
+ : OrigLoop(L), LI(LI), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM),
+ BestVF(0), BestUF(0) {}
+
+  ~LoopVectorizationPlanner() {
+    for (VPlan *Plan : VPlans)
+      delete Plan;
+    VPlans.clear();
+  }
/// Plan how to best vectorize, return the best VF and its cost.
LoopVectorizationCostModel::VectorizationFactor plan(bool OptForSize,
unsigned UserVF);
- /// Generate the IR code for the vectorized loop.
- void executePlan(InnerLoopVectorizer &ILV);
+ /// Finalize the best decision and dispose of all other VPlans.
+ void setBestPlan(unsigned VF, unsigned UF);
+
+ /// Generate the IR code for the body of the vectorized loop according to the
+ /// best selected VPlan.
+ void executePlan(InnerLoopVectorizer &LB, DominatorTree *DT);
+
+ void printPlans(raw_ostream &O) {
+ for (VPlan *Plan : VPlans)
+ O << *Plan;
+ }
protected:
/// Collect the instructions from the original loop that would be trivially
void collectTriviallyDeadInstructions(
SmallPtrSetImpl<Instruction *> &DeadInstructions);
-private:
- /// The loop that we evaluate.
- Loop *OrigLoop;
+ /// A range of powers-of-2 vectorization factors with fixed start and
+ /// adjustable end. The range includes start and excludes end, e.g.,:
+ /// [1, 9) = {1, 2, 4, 8}
+ struct VFRange {
+ const unsigned Start; // A power of 2.
+ unsigned End; // Need not be a power of 2. If End <= Start range is empty.
+ };
- /// Loop Info analysis.
- LoopInfo *LI;
+ /// Test a \p Predicate on a \p Range of VF's. Return the value of applying
+ /// \p Predicate on Range.Start, possibly decreasing Range.End such that the
+ /// returned value holds for the entire \p Range.
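+  /// For example, if \p Range is [1, 9) and \p Predicate holds for VF = 1, 2
+  /// and 4 but not for VF = 8, the call returns true and clamps Range.End
+  /// to 8.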
+ bool getDecisionAndClampRange(const std::function<bool(unsigned)> &Predicate,
+ VFRange &Range);
- /// The legality analysis.
- LoopVectorizationLegality *Legal;
+ /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive,
+ /// according to the information gathered by Legal when it checked if it is
+ /// legal to vectorize the loop.
+ void buildVPlans(unsigned MinVF, unsigned MaxVF);
- /// The profitablity analysis.
- LoopVectorizationCostModel &CM;
+private:
+  /// Check if \p I belongs to an Interleave Group that is relevant within the
+  /// given VF \p Range. If so, \p I is expected to be the primary member of
+  /// the IG: build and return a new VPInterleaveRecipe for the group, which
+  /// also handles the IG's adjunct members. Otherwise \return nullptr.
+  /// \p Range.End may be decreased to ensure the same decision from
+  /// \p Range.Start to \p Range.End.
+ VPInterleaveRecipe *tryToInterleaveMemory(Instruction *I, VFRange &Range);
+
+  /// Check if an induction recipe should be constructed for \p I within the given
+ /// VF \p Range. If so build and return it. If not, return null. \p Range.End
+ /// may be decreased to ensure same decision from \p Range.Start to
+ /// \p Range.End.
+ VPWidenIntOrFpInductionRecipe *tryToOptimizeInduction(Instruction *I,
+ VFRange &Range);
+
+  /// Check if \p I can be widened within the given VF \p Range. If \p I can be
+ /// widened for Range.Start, extend \p LastWidenRecipe to include \p I if
+ /// possible or else build a new VPWidenRecipe for it, and return the
+ /// VPWidenRecipe that includes \p I. If \p I cannot be widened for
+ /// Range.Start \return null. Range.End may be decreased to ensure same
+ /// decision from \p Range.Start to \p Range.End.
+ VPWidenRecipe *tryToWiden(Instruction *I, VPWidenRecipe *LastWidenRecipe,
+ VFRange &Range);
+
+  /// Build a VPReplicateRecipe for \p I and enclose it within a Region if it
+ /// is predicated. \return \p VPBB augmented with this new recipe if \p I is
+ /// not predicated, otherwise \return a new VPBasicBlock that succeeds the new
+ /// Region. Update the packing decision of predicated instructions if they
+ /// feed \p I. Range.End may be decreased to ensure same recipe behavior from
+ /// \p Range.Start to \p Range.End.
+ VPBasicBlock *handleReplication(
+ Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
+ DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe);
+
+ /// Create a replicating region for instruction \p I that requires
+ /// predication. \p PredRecipe is a VPReplicateRecipe holding \p I.
+ VPRegionBlock *createReplicateRegion(Instruction *I,
+ VPRecipeBase *PredRecipe);
+
+ /// Build a VPlan according to the information gathered by Legal. \return a
+ /// VPlan for vectorization factors \p Range.Start and up to \p Range.End
+ /// exclusive, possibly decreasing \p Range.End.
+ VPlan *buildVPlan(VFRange &Range);
};
+} // namespace llvm
+
+namespace {
+
/// \brief This holds vectorization requirements that must be verified late in
/// the process. The requirements are set by legalize and costmodel. Once
/// vectorization has been determined to be possible and profitable the
// iteration. If EntryVal is uniform, we only need to generate the first
// lane. Otherwise, we generate all VF values.
unsigned Lanes =
- Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1 : VF;
-
+ Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1
+ : VF;
// Compute the scalar steps and save the results in VectorLoopValueMap.
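+  // For example, with VF = 4 and UF = 2 and a non-uniform EntryVal, Part 0
+  // uses start indices 0..3 and Part 1 uses 4..7; each index is multiplied
+  // by Step and added to ScalarIV to produce the per-lane scalar step.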
for (unsigned Part = 0; Part < UF; ++Part) {
for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane);
auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
- VectorLoopValueMap.setScalarValue(EntryVal, Part, Lane, Add);
+ VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
}
}
}
// vector form, we will construct the vector values on demand.
if (VectorLoopValueMap.hasAnyScalarValue(V)) {
- Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, Part, 0);
+ Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});
// If we've scalarized a value, that value should be an instruction.
auto *I = cast<Instruction>(V);
// of the Part unroll iteration. Otherwise, the last instruction is the one
// we created for the last vector lane of the Part unroll iteration.
unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1;
- auto *LastInst =
- cast<Instruction>(VectorLoopValueMap.getScalarValue(V, Part, LastLane));
+ auto *LastInst = cast<Instruction>(
+ VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));
// Set the insert point after the last scalarized instruction. This ensures
// the insertelement sequence will directly follow the scalar definitions.
Value *VectorValue = nullptr;
if (Cost->isUniformAfterVectorization(I, VF)) {
VectorValue = getBroadcastInstrs(ScalarValue);
+ VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
} else {
- VectorValue = UndefValue::get(VectorType::get(V->getType(), VF));
+ // Initialize packing with insertelements to start from undef.
+ Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF));
+ VectorLoopValueMap.setVectorValue(V, Part, Undef);
for (unsigned Lane = 0; Lane < VF; ++Lane)
- VectorValue = Builder.CreateInsertElement(
- VectorValue, getOrCreateScalarValue(V, Part, Lane),
- Builder.getInt32(Lane));
+ packScalarIntoVectorValue(V, {Part, Lane});
+ VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
}
- VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
Builder.restoreIP(OldIP);
return VectorValue;
}
return B;
}
-Value *InnerLoopVectorizer::getOrCreateScalarValue(Value *V, unsigned Part,
- unsigned Lane) {
-
+Value *
+InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
+ const VPIteration &Instance) {
// If the value is not an instruction contained in the loop, it should
// already be scalar.
if (OrigLoop->isLoopInvariant(V))
return V;
- assert(Lane > 0 ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)
- : true && "Uniform values only have lane zero");
+  assert((Instance.Lane == 0 ||
+          !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)) &&
+         "Uniform values only have lane zero");
// If the value from the original loop has not been vectorized, it is
// represented by UF x VF scalar values in the new loop. Return the requested
// scalar value.
- if (VectorLoopValueMap.hasScalarValue(V, Part, Lane))
- return VectorLoopValueMap.getScalarValue(V, Part, Lane);
+ if (VectorLoopValueMap.hasScalarValue(V, Instance))
+ return VectorLoopValueMap.getScalarValue(V, Instance);
// If the value has not been scalarized, get its entry in VectorLoopValueMap
// for the given unroll part. If this entry is not a vector type (i.e., the
// vectorization factor is one), there is no need to generate an
// extractelement instruction.
- auto *U = getOrCreateVectorValue(V, Part);
+ auto *U = getOrCreateVectorValue(V, Instance.Part);
if (!U->getType()->isVectorTy()) {
assert(VF == 1 && "Value not scalarized has non-vector type");
return U;
// Otherwise, the value from the original loop has been vectorized and is
// represented by UF vector values. Extract and return the requested scalar
// value from the appropriate vector lane.
- return Builder.CreateExtractElement(U, Builder.getInt32(Lane));
+ return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
+}
+
+void InnerLoopVectorizer::packScalarIntoVectorValue(
+ Value *V, const VPIteration &Instance) {
+ assert(V != Induction && "The new induction variable should not be used.");
+ assert(!V->getType()->isVectorTy() && "Can't pack a vector");
+ assert(!V->getType()->isVoidTy() && "Type does not produce a value");
+
+ Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
+ Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
+ VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
+ Builder.getInt32(Instance.Lane));
+ VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
}
Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
Index += (VF - 1) * Group->getFactor();
for (unsigned Part = 0; Part < UF; Part++) {
- Value *NewPtr = getOrCreateScalarValue(Ptr, Part, 0);
+ Value *NewPtr = getOrCreateScalarValue(Ptr, {Part, 0});
// Notice current instruction could be any index. Need to adjust the address
// to the member of index 0.
Alignment = DL.getABITypeAlignment(ScalarDataTy);
unsigned AddressSpace = getMemInstAddressSpace(Instr);
- // Scalarize the memory instruction if necessary.
- if (Decision == LoopVectorizationCostModel::CM_Scalarize)
- return scalarizeInstruction(Instr, Legal->isScalarWithPredication(Instr));
-
// Determine if the pointer operand of the access is either consecutive or
// reverse consecutive.
int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
// Handle consecutive loads/stores.
if (ConsecutiveStride)
- Ptr = getOrCreateScalarValue(Ptr, 0, 0);
+ Ptr = getOrCreateScalarValue(Ptr, {0, 0});
VectorParts Mask = createBlockInMask(Instr->getParent());
// Handle Stores:
}
void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
+ const VPIteration &Instance,
bool IfPredicateInstr) {
assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
- DEBUG(dbgs() << "LV: Scalarizing"
- << (IfPredicateInstr ? " and predicating:" : ":") << *Instr
- << '\n');
- // Holds vector parameters or scalars, in case of uniform vals.
- SmallVector<VectorParts, 4> Params;
setDebugLocFromInst(Builder, Instr);
// Does this instruction return a value ?
bool IsVoidRetTy = Instr->getType()->isVoidTy();
- VectorParts Cond;
- if (IfPredicateInstr)
- Cond = createBlockInMask(Instr->getParent());
-
- // Determine the number of scalars we need to generate for each unroll
- // iteration. If the instruction is uniform, we only need to generate the
- // first lane. Otherwise, we generate all VF values.
- unsigned Lanes = Cost->isUniformAfterVectorization(Instr, VF) ? 1 : VF;
-
- // For each vector unroll 'part':
- for (unsigned Part = 0; Part < UF; ++Part) {
- // For each scalar that we create:
- for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
-
- // Start if-block.
- Value *Cmp = nullptr;
- if (IfPredicateInstr) {
- Cmp = Cond[Part];
- if (!Cmp) // Block in mask is all-one.
- Cmp = Builder.getTrue();
- else if (Cmp->getType()->isVectorTy())
- Cmp = Builder.CreateExtractElement(Cmp, Builder.getInt32(Lane));
- }
-
- Instruction *Cloned = Instr->clone();
- if (!IsVoidRetTy)
- Cloned->setName(Instr->getName() + ".cloned");
+ Instruction *Cloned = Instr->clone();
+ if (!IsVoidRetTy)
+ Cloned->setName(Instr->getName() + ".cloned");
- // Replace the operands of the cloned instructions with their scalar
- // equivalents in the new loop.
- for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
- auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Part, Lane);
- Cloned->setOperand(op, NewOp);
- }
- addNewMetadata(Cloned, Instr);
+ // Replace the operands of the cloned instructions with their scalar
+ // equivalents in the new loop.
+ for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
+ auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Instance);
+ Cloned->setOperand(op, NewOp);
+ }
+ addNewMetadata(Cloned, Instr);
- // Place the cloned scalar in the new loop.
- Builder.Insert(Cloned);
+ // Place the cloned scalar in the new loop.
+ Builder.Insert(Cloned);
- // Add the cloned scalar to the scalar map entry.
- VectorLoopValueMap.setScalarValue(Instr, Part, Lane, Cloned);
+ // Add the cloned scalar to the scalar map entry.
+ VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);
- // If we just cloned a new assumption, add it the assumption cache.
- if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
- if (II->getIntrinsicID() == Intrinsic::assume)
- AC->registerAssumption(II);
+  // If we just cloned a new assumption, add it to the assumption cache.
+ if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
+ if (II->getIntrinsicID() == Intrinsic::assume)
+ AC->registerAssumption(II);
- // End if-block.
- if (IfPredicateInstr)
- PredicatedInstructions.push_back(std::make_pair(Cloned, Cmp));
- }
- }
+ // End if-block.
+ if (IfPredicateInstr)
+ PredicatedInstructions.push_back(Cloned);
}
PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
LVer->prepareNoAliasMetadata();
}
-void InnerLoopVectorizer::createVectorizedLoopSkeleton() {
+BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
/*
In this function we generate a new loop. The new loop will contain
the vectorized instructions while the old loop will continue to run the
LoopVectorizeHints Hints(Lp, true, *ORE);
Hints.setAlreadyVectorized();
+
+ return LoopVectorPreHeader;
}
// Fix up external users of the induction variable. At this point, we are
IVEndValues[Entry.first], LoopMiddleBlock);
fixLCSSAPHIs();
- predicateInstructions();
+ for (Instruction *PI : PredicatedInstructions)
+ sinkScalarOperands(&*PI);
// Remove redundant induction instructions.
cse(LoopVectorBody);
} while (Changed);
}
-void InnerLoopVectorizer::predicateInstructions() {
-
- // For each instruction I marked for predication on value C, split I into its
- // own basic block to form an if-then construct over C. Since I may be fed by
- // an extractelement instruction or other scalar operand, we try to
- // iteratively sink its scalar operands into the predicated block. If I feeds
- // an insertelement instruction, we try to move this instruction into the
- // predicated block as well. For non-void types, a phi node will be created
- // for the resulting value (either vector or scalar).
- //
- // So for some predicated instruction, e.g. the conditional sdiv in:
- //
- // for.body:
- // ...
- // %add = add nsw i32 %mul, %0
- // %cmp5 = icmp sgt i32 %2, 7
- // br i1 %cmp5, label %if.then, label %if.end
- //
- // if.then:
- // %div = sdiv i32 %0, %1
- // br label %if.end
- //
- // if.end:
- // %x.0 = phi i32 [ %div, %if.then ], [ %add, %for.body ]
- //
- // the sdiv at this point is scalarized and if-converted using a select.
- // The inactive elements in the vector are not used, but the predicated
- // instruction is still executed for all vector elements, essentially:
- //
- // vector.body:
- // ...
- // %17 = add nsw <2 x i32> %16, %wide.load
- // %29 = extractelement <2 x i32> %wide.load, i32 0
- // %30 = extractelement <2 x i32> %wide.load51, i32 0
- // %31 = sdiv i32 %29, %30
- // %32 = insertelement <2 x i32> undef, i32 %31, i32 0
- // %35 = extractelement <2 x i32> %wide.load, i32 1
- // %36 = extractelement <2 x i32> %wide.load51, i32 1
- // %37 = sdiv i32 %35, %36
- // %38 = insertelement <2 x i32> %32, i32 %37, i32 1
- // %predphi = select <2 x i1> %26, <2 x i32> %38, <2 x i32> %17
- //
- // Predication will now re-introduce the original control flow to avoid false
- // side-effects by the sdiv instructions on the inactive elements, yielding
- // (after cleanup):
- //
- // vector.body:
- // ...
- // %5 = add nsw <2 x i32> %4, %wide.load
- // %8 = icmp sgt <2 x i32> %wide.load52, <i32 7, i32 7>
- // %9 = extractelement <2 x i1> %8, i32 0
- // br i1 %9, label %pred.sdiv.if, label %pred.sdiv.continue
- //
- // pred.sdiv.if:
- // %10 = extractelement <2 x i32> %wide.load, i32 0
- // %11 = extractelement <2 x i32> %wide.load51, i32 0
- // %12 = sdiv i32 %10, %11
- // %13 = insertelement <2 x i32> undef, i32 %12, i32 0
- // br label %pred.sdiv.continue
- //
- // pred.sdiv.continue:
- // %14 = phi <2 x i32> [ undef, %vector.body ], [ %13, %pred.sdiv.if ]
- // %15 = extractelement <2 x i1> %8, i32 1
- // br i1 %15, label %pred.sdiv.if54, label %pred.sdiv.continue55
- //
- // pred.sdiv.if54:
- // %16 = extractelement <2 x i32> %wide.load, i32 1
- // %17 = extractelement <2 x i32> %wide.load51, i32 1
- // %18 = sdiv i32 %16, %17
- // %19 = insertelement <2 x i32> %14, i32 %18, i32 1
- // br label %pred.sdiv.continue55
- //
- // pred.sdiv.continue55:
- // %20 = phi <2 x i32> [ %14, %pred.sdiv.continue ], [ %19, %pred.sdiv.if54 ]
- // %predphi = select <2 x i1> %8, <2 x i32> %20, <2 x i32> %5
-
- for (auto KV : PredicatedInstructions) {
- BasicBlock::iterator I(KV.first);
- BasicBlock *Head = I->getParent();
- auto *T = SplitBlockAndInsertIfThen(KV.second, &*I, /*Unreachable=*/false,
- /*BranchWeights=*/nullptr, DT, LI);
- I->moveBefore(T);
- sinkScalarOperands(&*I);
-
- BasicBlock *PredicatedBlock = I->getParent();
- Twine BBNamePrefix = Twine("pred.") + I->getOpcodeName();
- PredicatedBlock->setName(BBNamePrefix + ".if");
- PredicatedBlock->getSingleSuccessor()->setName(BBNamePrefix + ".continue");
-
- // If the instruction is non-void create a Phi node at reconvergence point.
- if (!I->getType()->isVoidTy()) {
- Value *IncomingTrue = nullptr;
- Value *IncomingFalse = nullptr;
-
- if (I->hasOneUse() && isa<InsertElementInst>(*I->user_begin())) {
- // If the predicated instruction is feeding an insert-element, move it
- // into the Then block; Phi node will be created for the vector.
- InsertElementInst *IEI = cast<InsertElementInst>(*I->user_begin());
- IEI->moveBefore(T);
- IncomingTrue = IEI; // the new vector with the inserted element.
- IncomingFalse = IEI->getOperand(0); // the unmodified vector
- } else {
- // Phi node will be created for the scalar predicated instruction.
- IncomingTrue = &*I;
- IncomingFalse = UndefValue::get(I->getType());
- }
-
- BasicBlock *PostDom = I->getParent()->getSingleSuccessor();
- assert(PostDom && "Then block has multiple successors");
- PHINode *Phi =
- PHINode::Create(IncomingTrue->getType(), 2, "", &PostDom->front());
- IncomingTrue->replaceAllUsesWith(Phi);
- Phi->addIncoming(IncomingFalse, Head);
- Phi->addIncoming(IncomingTrue, I->getParent());
- }
- }
-
- DEBUG(DT->verifyDomTree());
-}
-
InnerLoopVectorizer::VectorParts
InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
// Look for cached value.
std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
llvm_unreachable("Unknown induction");
case InductionDescriptor::IK_IntInduction:
case InductionDescriptor::IK_FpInduction:
- return widenIntOrFpInduction(P);
+ llvm_unreachable("Integer/fp induction is handled elsewhere.");
case InductionDescriptor::IK_PtrInduction: {
// Handle the pointer induction variable case.
assert(P->getType()->isPointerTy() && "Unexpected type.");
Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
Value *SclrGep = II.transform(Builder, GlobalIdx, PSE.getSE(), DL);
SclrGep->setName("next.gep");
- VectorLoopValueMap.setScalarValue(P, Part, Lane, SclrGep);
+ VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
}
}
return;
return !CInt || CInt->isZero();
}
-void InnerLoopVectorizer::vectorizeInstruction(Instruction &I) {
- // Scalarize instructions that should remain scalar after vectorization.
- if (VF > 1 &&
- !(isa<BranchInst>(&I) || isa<PHINode>(&I) || isa<DbgInfoIntrinsic>(&I)) &&
- shouldScalarizeInstruction(&I)) {
- scalarizeInstruction(&I, Legal->isScalarWithPredication(&I));
- return;
- }
-
+void InnerLoopVectorizer::widenInstruction(Instruction &I) {
switch (I.getOpcode()) {
case Instruction::Br:
- // Nothing to do for PHIs and BR, since we already took care of the
- // loop control flow instructions.
- break;
- case Instruction::PHI: {
- // Vectorize PHINodes.
- widenPHIInstruction(&I, UF, VF);
- break;
- } // End of PHI.
+ case Instruction::PHI:
+ llvm_unreachable("This instruction is handled by a different recipe.");
case Instruction::GetElementPtr: {
// Construct a vector GEP by widening the operands of the scalar GEP as
// necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
case Instruction::SDiv:
case Instruction::SRem:
case Instruction::URem:
- // Scalarize with predication if this instruction may divide by zero and
- // block execution is conditional, otherwise fallthrough.
- if (Legal->isScalarWithPredication(&I)) {
- scalarizeInstruction(&I, true);
- break;
- }
- LLVM_FALLTHROUGH;
case Instruction::Add:
case Instruction::FAdd:
case Instruction::Sub:
// We have to take the 'vectorized' value and pick the first lane.
// Instcombine will make this a no-op.
- auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), 0, 0);
+ auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), {0, 0});
for (unsigned Part = 0; Part < UF; ++Part) {
Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part);
auto *CI = dyn_cast<CastInst>(&I);
setDebugLocFromInst(Builder, CI);
- // Optimize the special case where the source is a constant integer
- // induction variable. Notice that we can only optimize the 'trunc' case
- // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
- // (c) other casts depend on pointer size.
- if (Cost->isOptimizableIVTruncate(CI, VF)) {
- widenIntOrFpInduction(cast<PHINode>(CI->getOperand(0)),
- cast<TruncInst>(CI));
- break;
- }
-
/// Vectorize casts.
Type *DestTy =
(VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF);
Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
- if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
- ID == Intrinsic::lifetime_start)) {
- scalarizeInstruction(&I);
- break;
- }
+
// The flag shows whether we use an intrinsic or a usual call for the
// vectorized version of the instruction.
// Is it beneficial to perform intrinsic call compared to lib call?
unsigned CallCost = getVectorCallCost(CI, VF, *TTI, TLI, NeedToScalarize);
bool UseVectorIntrinsic =
ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost;
- if (!UseVectorIntrinsic && NeedToScalarize) {
- scalarizeInstruction(&I);
- break;
- }
+ assert((UseVectorIntrinsic || !NeedToScalarize) &&
+ "Instruction should be scalarized elsewhere.");
for (unsigned Part = 0; Part < UF; ++Part) {
SmallVector<Value *, 4> Args;
}
default:
- // All other instructions are unsupported. Scalarize them.
- scalarizeInstruction(&I);
- break;
+ // All other instructions are scalarized.
+ DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
+ llvm_unreachable("Unhandled instruction!");
} // end of switch.
}
assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) &&
"Entry does not dominate exit.");
- DT->addNewBlock(LI->getLoopFor(LoopVectorBody)->getHeader(),
- LoopVectorPreHeader);
DT->addNewBlock(LoopMiddleBlock,
LI->getLoopFor(LoopVectorBody)->getLoopLatch());
DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]);
DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]);
-
DEBUG(DT->verifyDomTree());
}
LoopVectorizationCostModel::expectedCost(unsigned VF) {
VectorizationCostTy Cost;
- // Collect Uniform and Scalar instructions after vectorization with VF.
- collectUniformsAndScalars(VF);
-
- // Collect the instructions (and their associated costs) that will be more
- // profitable to scalarize.
- collectInstsToScalarize(VF);
-
// For each block.
for (BasicBlock *BB : TheLoop->blocks()) {
VectorizationCostTy BlockCost;
// Collect the instructions (and their associated costs) that will be more
// profitable to scalarize.
CM.selectUserVectorizationFactor(UserVF);
+ buildVPlans(UserVF, UserVF);
+ DEBUG(printPlans(dbgs()));
return {UserVF, 0};
}
unsigned MaxVF = MaybeMaxVF.getValue();
assert(MaxVF != 0 && "MaxVF is zero.");
+
+ for (unsigned VF = 1; VF <= MaxVF; VF *= 2) {
+ // Collect Uniform and Scalar instructions after vectorization with VF.
+ CM.collectUniformsAndScalars(VF);
+
+ // Collect the instructions (and their associated costs) that will be more
+ // profitable to scalarize.
+ if (VF > 1)
+ CM.collectInstsToScalarize(VF);
+ }
+
+ buildVPlans(1, MaxVF);
+ DEBUG(printPlans(dbgs()));
if (MaxVF == 1)
return NoVectorization;
return CM.selectVectorizationFactor(MaxVF);
}
-void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV) {
+void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) {
+ DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF << '\n');
+ BestVF = VF;
+ BestUF = UF;
+
+ for (auto *VPlanIter = VPlans.begin(); VPlanIter != VPlans.end();) {
+ VPlan *Plan = *VPlanIter;
+ if (Plan->hasVF(VF))
+ ++VPlanIter;
+ else {
+ VPlanIter = VPlans.erase(VPlanIter);
+ delete Plan;
+ }
+ }
+  assert(VPlans.size() == 1 && "Best VF does not have a single VPlan.");
+}
+
+void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
+ DominatorTree *DT) {
// Perform the actual loop transformation.
// 1. Create a new empty loop. Unlink the old loop and connect the new one.
- ILV.createVectorizedLoopSkeleton();
+ VPTransformState State{
+ BestVF, BestUF, LI, DT, ILV.Builder, ILV.VectorLoopValueMap, &ILV};
+ State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
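+  // The returned pre-header of the new loop skeleton becomes the block from
+  // which VPlan execution lays out the vectorized loop body below.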
//===------------------------------------------------===//
//
//===------------------------------------------------===//
// 2. Copy and widen instructions from the old loop into the new loop.
-
- // Move instructions to handle first-order recurrences.
- DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
- for (auto &Entry : SinkAfter) {
- Entry.first->removeFromParent();
- Entry.first->insertAfter(Entry.second);
- DEBUG(dbgs() << "Sinking" << *Entry.first << " after" << *Entry.second
- << " to vectorize a 1st order recurrence.\n");
- }
-
- // Collect instructions from the original loop that will become trivially dead
- // in the vectorized loop. We don't need to vectorize these instructions. For
- // example, original induction update instructions can become dead because we
- // separately emit induction "steps" when generating code for the new loop.
- // Similarly, we create a new latch condition when setting up the structure
- // of the new loop, so the old one can become dead.
- SmallPtrSet<Instruction *, 4> DeadInstructions;
- collectTriviallyDeadInstructions(DeadInstructions);
-
- // Scan the loop in a topological order to ensure that defs are vectorized
- // before users.
- LoopBlocksDFS DFS(OrigLoop);
- DFS.perform(LI);
-
- // Vectorize all instructions in the original loop that will not become
- // trivially dead when vectorized.
- for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO()))
- for (Instruction &I : *BB)
- if (!DeadInstructions.count(&I))
- ILV.vectorizeInstruction(I);
+ assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
+ VPlan *Plan = *VPlans.begin();
+ Plan->execute(&State);
// 3. Fix the vectorized code: take care of header phi's, live-outs,
// predication, updating analyses.
DeadInstructions.insert(IndUpdate);
}
}
-
-void InnerLoopUnroller::vectorizeMemoryInstruction(Instruction *Instr) {
- auto *SI = dyn_cast<StoreInst>(Instr);
- bool IfPredicateInstr = (SI && Legal->blockNeedsPredication(SI->getParent()));
-
- return scalarizeInstruction(Instr, IfPredicateInstr);
-}
-
Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
}
}
+namespace {
+/// VPWidenRecipe is a recipe for producing a copy of vector type for each
+/// Instruction in its ingredients independently, in order. This recipe covers
+/// most of the traditional vectorization cases where each ingredient transforms
+/// into a vectorized version of itself.
+class VPWidenRecipe : public VPRecipeBase {
+private:
+ /// Hold the ingredients by pointing to their original BasicBlock location.
+ BasicBlock::iterator Begin;
+ BasicBlock::iterator End;
+
+public:
+ VPWidenRecipe(Instruction *I) : VPRecipeBase(VPWidenSC) {
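+    // Establish the half-open range [I, std::next(I)): End is set to I's
+    // iterator and then post-incremented, leaving Begin pointing at I.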
+ End = I->getIterator();
+ Begin = End++;
+ }
+
+ ~VPWidenRecipe() {}
+
+ /// Method to support type inquiry through isa, cast, and dyn_cast.
+ static inline bool classof(const VPRecipeBase *V) {
+ return V->getVPRecipeID() == VPRecipeBase::VPWidenSC;
+ }
+
+ /// Produce widened copies of all Ingredients.
+ void execute(VPTransformState &State) override {
+ for (auto &Instr : make_range(Begin, End))
+ State.ILV->widenInstruction(Instr);
+ }
+
+ /// Augment the recipe to include Instr, if it lies at its End.
+ bool appendInstruction(Instruction *Instr) {
+ if (End != Instr->getIterator())
+ return false;
+ End++;
+ return true;
+ }
+
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent) const override {
+ O << " +\n" << Indent << "\"WIDEN\\l\"";
+ for (auto &Instr : make_range(Begin, End))
+ O << " +\n" << Indent << "\" " << VPlanIngredient(&Instr) << "\\l\"";
+ }
+};
+
+/// A recipe for handling phi nodes of integer and floating-point inductions,
+/// producing their vector and scalar values.
+class VPWidenIntOrFpInductionRecipe : public VPRecipeBase {
+private:
+ PHINode *IV;
+ TruncInst *Trunc;
+
+public:
+ VPWidenIntOrFpInductionRecipe(PHINode *IV, TruncInst *Trunc = nullptr)
+ : VPRecipeBase(VPWidenIntOrFpInductionSC), IV(IV), Trunc(Trunc) {}
+
+ ~VPWidenIntOrFpInductionRecipe() {}
+
+ /// Method to support type inquiry through isa, cast, and dyn_cast.
+ static inline bool classof(const VPRecipeBase *V) {
+ return V->getVPRecipeID() == VPRecipeBase::VPWidenIntOrFpInductionSC;
+ }
+
+ /// Generate the vectorized and scalarized versions of the phi node as
+ /// needed by their users.
+ void execute(VPTransformState &State) override {
+ assert(!State.Instance && "Int or FP induction being replicated.");
+ State.ILV->widenIntOrFpInduction(IV, Trunc);
+ }
+
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent) const override {
+ O << " +\n" << Indent << "\"WIDEN-INDUCTION";
+ if (Trunc) {
+ O << "\\l\"";
+ O << " +\n" << Indent << "\" " << VPlanIngredient(IV) << "\\l\"";
+ O << " +\n" << Indent << "\" " << VPlanIngredient(Trunc) << "\\l\"";
+ } else
+ O << " " << VPlanIngredient(IV) << "\\l\"";
+ }
+};
+
+/// A recipe for handling all phi nodes except for integer and FP inductions.
+class VPWidenPHIRecipe : public VPRecipeBase {
+private:
+ PHINode *Phi;
+
+public:
+ VPWidenPHIRecipe(PHINode *Phi) : VPRecipeBase(VPWidenPHISC), Phi(Phi) {}
+
+ ~VPWidenPHIRecipe() {}
+
+ /// Method to support type inquiry through isa, cast, and dyn_cast.
+ static inline bool classof(const VPRecipeBase *V) {
+ return V->getVPRecipeID() == VPRecipeBase::VPWidenPHISC;
+ }
+
+ /// Generate the phi/select nodes.
+ void execute(VPTransformState &State) override {
+ State.ILV->widenPHIInstruction(Phi, State.UF, State.VF);
+ }
+
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent) const override {
+ O << " +\n" << Indent << "\"WIDEN-PHI " << VPlanIngredient(Phi) << "\\l\"";
+ }
+};
+
+/// VPInterleaveRecipe is a recipe for transforming an interleave group of load
+/// or stores into one wide load/store and shuffles.
+class VPInterleaveRecipe : public VPRecipeBase {
+private:
+ const InterleaveGroup *IG;
+
+public:
+ VPInterleaveRecipe(const InterleaveGroup *IG)
+ : VPRecipeBase(VPInterleaveSC), IG(IG) {}
+
+ ~VPInterleaveRecipe() {}
+
+ /// Method to support type inquiry through isa, cast, and dyn_cast.
+ static inline bool classof(const VPRecipeBase *V) {
+ return V->getVPRecipeID() == VPRecipeBase::VPInterleaveSC;
+ }
+
+ /// Generate the wide load or store, and shuffles.
+ void execute(VPTransformState &State) override {
+ assert(!State.Instance && "Interleave group being replicated.");
+ State.ILV->vectorizeInterleaveGroup(IG->getInsertPos());
+ }
+
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent) const override;
+
+ const InterleaveGroup *getInterleaveGroup() { return IG; }
+};
+
+/// VPReplicateRecipe replicates a given instruction producing multiple scalar
+/// copies of the original scalar type, one per lane, instead of producing a
+/// single copy of widened type for all lanes. If the instruction is known to
+/// be uniform, only one copy, for lane zero, will be generated per part.
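+/// For example, with VF = 4 and UF = 1, replicating a non-uniform udiv
+/// produces four scalar udiv copies, one per lane, while a uniform udiv
+/// produces a single copy for lane zero.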
+class VPReplicateRecipe : public VPRecipeBase {
+private:
+ /// The instruction being replicated.
+ Instruction *Ingredient;
+
+  /// Indicator if only a single replica, for lane zero of each part, is needed.
+ bool IsUniform;
+
+ /// Indicator if the replicas are also predicated.
+ bool IsPredicated;
+
+ /// Indicator if the scalar values should also be packed into a vector.
+ bool AlsoPack;
+
+public:
+ VPReplicateRecipe(Instruction *I, bool IsUniform, bool IsPredicated = false)
+ : VPRecipeBase(VPReplicateSC), Ingredient(I), IsUniform(IsUniform),
+ IsPredicated(IsPredicated) {
+ // Retain the previous behavior of predicateInstructions(), where an
+ // insert-element of a predicated instruction got hoisted into the
+ // predicated basic block iff it was its only user. This is achieved by
+ // having predicated instructions also pack their values into a vector by
+ // default unless they have a replicated user which uses their scalar value.
+ AlsoPack = IsPredicated && !I->use_empty();
+ }
+
+ ~VPReplicateRecipe() {}
+
+ /// Method to support type inquiry through isa, cast, and dyn_cast.
+ static inline bool classof(const VPRecipeBase *V) {
+ return V->getVPRecipeID() == VPRecipeBase::VPReplicateSC;
+ }
+
+ /// Generate replicas of the desired Ingredient. Replicas will be generated
+ /// for all parts and lanes unless a specific part and lane are specified in
+ /// the \p State.
+ void execute(VPTransformState &State) override;
+
+ void setAlsoPack(bool Pack) { AlsoPack = Pack; }
+
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent) const override {
+ O << " +\n"
+ << Indent << "\"" << (IsUniform ? "CLONE " : "REPLICATE ")
+ << VPlanIngredient(Ingredient);
+ if (AlsoPack)
+ O << " (S->V)";
+ O << "\\l\"";
+ }
+};
+
+/// A recipe for generating conditional branches on the bits of a mask.
+class VPBranchOnMaskRecipe : public VPRecipeBase {
+private:
+ /// The input IR basic block used to obtain the mask providing the condition
+ /// bits for the branch.
+ BasicBlock *MaskedBasicBlock;
+
+public:
+ VPBranchOnMaskRecipe(BasicBlock *BB)
+ : VPRecipeBase(VPBranchOnMaskSC), MaskedBasicBlock(BB) {}
+
+ /// Method to support type inquiry through isa, cast, and dyn_cast.
+ static inline bool classof(const VPRecipeBase *V) {
+ return V->getVPRecipeID() == VPRecipeBase::VPBranchOnMaskSC;
+ }
+
+ /// Generate the extraction of the appropriate bit from the block mask and the
+ /// conditional branch.
+ void execute(VPTransformState &State) override;
+
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent) const override {
+ O << " +\n"
+ << Indent << "\"BRANCH-ON-MASK-OF " << MaskedBasicBlock->getName()
+ << "\\l\"";
+ }
+};
+
+/// VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when
+/// control converges back from a Branch-on-Mask. The phi nodes are needed in
+/// order to merge values that are set under such a branch and feed their uses.
+/// The phi nodes can be scalar or vector depending on the users of the value.
+/// This recipe works in concert with VPBranchOnMaskRecipe.
+class VPPredInstPHIRecipe : public VPRecipeBase {
+private:
+ Instruction *PredInst;
+
+public:
+  /// Construct a VPPredInstPHIRecipe given \p PredInst whose value needs a
+  /// phi node after merging back from a Branch-on-Mask.
+ VPPredInstPHIRecipe(Instruction *PredInst)
+ : VPRecipeBase(VPPredInstPHISC), PredInst(PredInst) {}
+
+ ~VPPredInstPHIRecipe() {}
+
+ /// Method to support type inquiry through isa, cast, and dyn_cast.
+ static inline bool classof(const VPRecipeBase *V) {
+ return V->getVPRecipeID() == VPRecipeBase::VPPredInstPHISC;
+ }
+
+ /// Generates phi nodes for live-outs as needed to retain SSA form.
+ void execute(VPTransformState &State) override;
+
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent) const override {
+ O << " +\n"
+ << Indent << "\"PHI-PREDICATED-INSTRUCTION " << VPlanIngredient(PredInst)
+ << "\\l\"";
+ }
+};
+} // end anonymous namespace
+
+bool LoopVectorizationPlanner::getDecisionAndClampRange(
+ const std::function<bool(unsigned)> &Predicate, VFRange &Range) {
+ assert(Range.End > Range.Start && "Trying to test an empty VF range.");
+ bool PredicateAtRangeStart = Predicate(Range.Start);
+
+ for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2)
+ if (Predicate(TmpVF) != PredicateAtRangeStart) {
+ Range.End = TmpVF;
+ break;
+ }
+
+ return PredicateAtRangeStart;
+}
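+
+// Usage sketch (illustrative only, not part of the interface): for
+// Range = [2, 16) and a predicate that holds for VF=2 and VF=4 but not for
+// VF=8, the call returns the decision at Range.Start and clamps the range:
+//
+//   VFRange Range = {2, 16};
+//   bool Decision = getDecisionAndClampRange(
+//       [](unsigned VF) { return VF <= 4; }, Range);
+//   // Decision == true, Range.End == 8; VF=8 onward get a separate VPlan.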
+
+/// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
+/// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
+/// of VF's starting at a given VF and extending it as much as possible. Each
+/// vectorization decision can potentially shorten this sub-range during
+/// buildVPlan().
+void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) {
+ for (unsigned VF = MinVF; VF < MaxVF + 1;) {
+ VFRange SubRange = {VF, MaxVF + 1};
+ VPlan *Plan = buildVPlan(SubRange);
+ VPlans.push_back(Plan);
+ VF = SubRange.End;
+ }
+}
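+
+// Worked example (illustrative): with MinVF=2 and MaxVF=16, the first call to
+// buildVPlan() receives SubRange = [2, 17). If some vectorization decision
+// changes at VF=8, SubRange.End is clamped to 8 and one VPlan covers {2, 4};
+// the loop then builds the next VPlan starting at VF=8 for [8, 17).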
+
+VPInterleaveRecipe *
+LoopVectorizationPlanner::tryToInterleaveMemory(Instruction *I,
+ VFRange &Range) {
+ const InterleaveGroup *IG = Legal->getInterleavedAccessGroup(I);
+ if (!IG)
+ return nullptr;
+
+ // Now check if IG is relevant for VF's in the given range.
+ auto isIGMember = [&](Instruction *I) -> std::function<bool(unsigned)> {
+ return [=](unsigned VF) -> bool {
+ return (VF >= 2 && // Query is illegal for VF == 1
+ CM.getWideningDecision(I, VF) ==
+ LoopVectorizationCostModel::CM_Interleave);
+ };
+ };
+ if (!getDecisionAndClampRange(isIGMember(I), Range))
+ return nullptr;
+
+  // I is a member of an InterleaveGroup for VF's in the (possibly trimmed)
+  // range. If it's the primary member of the IG, construct a
+  // VPInterleaveRecipe. Otherwise it's an adjunct member; do not construct
+  // any Recipe for it.
+ assert(I == IG->getInsertPos() &&
+ "Generating a recipe for an adjunct member of an interleave group");
+
+ return new VPInterleaveRecipe(IG);
+}
+
+VPWidenIntOrFpInductionRecipe *
+LoopVectorizationPlanner::tryToOptimizeInduction(Instruction *I,
+ VFRange &Range) {
+ if (PHINode *Phi = dyn_cast<PHINode>(I)) {
+ // Check if this is an integer or fp induction. If so, build the recipe that
+ // produces its scalar and vector values.
+ InductionDescriptor II = Legal->getInductionVars()->lookup(Phi);
+ if (II.getKind() == InductionDescriptor::IK_IntInduction ||
+ II.getKind() == InductionDescriptor::IK_FpInduction)
+ return new VPWidenIntOrFpInductionRecipe(Phi);
+
+ return nullptr;
+ }
+
+ // Optimize the special case where the source is a constant integer
+ // induction variable. Notice that we can only optimize the 'trunc' case
+ // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
+ // (c) other casts depend on pointer size.
+
+ // Determine whether \p K is a truncation based on an induction variable that
+ // can be optimized.
+ auto isOptimizableIVTruncate =
+ [&](Instruction *K) -> std::function<bool(unsigned)> {
+ return
+ [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); };
+ };
+
+ if (isa<TruncInst>(I) &&
+ getDecisionAndClampRange(isOptimizableIVTruncate(I), Range))
+ return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
+ cast<TruncInst>(I));
+ return nullptr;
+}
+
+VPWidenRecipe *LoopVectorizationPlanner::tryToWiden(
+ Instruction *I, VPWidenRecipe *LastWidenRecipe, VFRange &Range) {
+
+ if (Legal->isScalarWithPredication(I))
+ return nullptr;
+
+ static DenseSet<unsigned> VectorizableOpcodes = {
+ Instruction::Br, Instruction::PHI, Instruction::GetElementPtr,
+ Instruction::UDiv, Instruction::SDiv, Instruction::SRem,
+ Instruction::URem, Instruction::Add, Instruction::FAdd,
+ Instruction::Sub, Instruction::FSub, Instruction::Mul,
+ Instruction::FMul, Instruction::FDiv, Instruction::FRem,
+ Instruction::Shl, Instruction::LShr, Instruction::AShr,
+ Instruction::And, Instruction::Or, Instruction::Xor,
+ Instruction::Select, Instruction::ICmp, Instruction::FCmp,
+ Instruction::Store, Instruction::Load, Instruction::ZExt,
+ Instruction::SExt, Instruction::FPToUI, Instruction::FPToSI,
+ Instruction::FPExt, Instruction::PtrToInt, Instruction::IntToPtr,
+ Instruction::SIToFP, Instruction::UIToFP, Instruction::Trunc,
+ Instruction::FPTrunc, Instruction::BitCast, Instruction::Call};
+
+ if (!VectorizableOpcodes.count(I->getOpcode()))
+ return nullptr;
+
+ if (CallInst *CI = dyn_cast<CallInst>(I)) {
+ Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+ if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
+ ID == Intrinsic::lifetime_start))
+ return nullptr;
+ }
+
+ auto willWiden = [&](unsigned VF) -> bool {
+ if (!isa<PHINode>(I) && (CM.isScalarAfterVectorization(I, VF) ||
+ CM.isProfitableToScalarize(I, VF)))
+ return false;
+ if (CallInst *CI = dyn_cast<CallInst>(I)) {
+ Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+      // The following case may be scalarized depending on the VF.
+      // The flag indicates whether we use an intrinsic or a plain call for
+      // the vectorized version of the instruction, whichever is cheaper
+      // according to the cost model.
+ bool NeedToScalarize;
+ unsigned CallCost = getVectorCallCost(CI, VF, *TTI, TLI, NeedToScalarize);
+ bool UseVectorIntrinsic =
+ ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost;
+ return UseVectorIntrinsic || !NeedToScalarize;
+ }
+ if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
+ LoopVectorizationCostModel::InstWidening Decision =
+ CM.getWideningDecision(I, VF);
+ assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
+ "CM decision should be taken at this point.");
+ assert(Decision != LoopVectorizationCostModel::CM_Interleave &&
+ "Interleave memory opportunity should be caught earlier.");
+ return Decision != LoopVectorizationCostModel::CM_Scalarize;
+ }
+ return true;
+ };
+
+ if (!getDecisionAndClampRange(willWiden, Range))
+ return nullptr;
+
+ // Success: widen this instruction. We optimize the common case where
+ // consecutive instructions can be represented by a single recipe.
+ if (LastWidenRecipe && LastWidenRecipe->appendInstruction(I))
+ return LastWidenRecipe;
+ return new VPWidenRecipe(I);
+}
+
+VPBasicBlock *LoopVectorizationPlanner::handleReplication(
+ Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
+ DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe) {
+
+ bool IsUniform = getDecisionAndClampRange(
+ [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); },
+ Range);
+
+ bool IsPredicated = Legal->isScalarWithPredication(I);
+ auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated);
+
+ // Find if I uses a predicated instruction. If so, it will use its scalar
+ // value. Avoid hoisting the insert-element which packs the scalar value into
+ // a vector value, as that happens iff all users use the vector value.
+ for (auto &Op : I->operands())
+ if (auto *PredInst = dyn_cast<Instruction>(Op))
+ if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end())
+ PredInst2Recipe[PredInst]->setAlsoPack(false);
+
+  // Finalize the recipe for I: if it is not predicated, simply append it to
+  // the current VPBasicBlock.
+ if (!IsPredicated) {
+ DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
+ VPBB->appendRecipe(Recipe);
+ return VPBB;
+ }
+ DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
+ assert(VPBB->getSuccessors().empty() &&
+ "VPBB has successors when handling predicated replication.");
+ // Record predicated instructions for above packing optimizations.
+ PredInst2Recipe[I] = Recipe;
+ VPBlockBase *Region = VPBB->setOneSuccessor(createReplicateRegion(I, Recipe));
+ return cast<VPBasicBlock>(Region->setOneSuccessor(new VPBasicBlock()));
+}
+
+VPRegionBlock *
+LoopVectorizationPlanner::createReplicateRegion(Instruction *Instr,
+ VPRecipeBase *PredRecipe) {
+ // Instructions marked for predication are replicated and placed under an
+ // if-then construct to prevent side-effects.
+
+ // Build the triangular if-then region.
+ std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
+ assert(Instr->getParent() && "Predicated instruction not in any basic block");
+ auto *BOMRecipe = new VPBranchOnMaskRecipe(Instr->getParent());
+ auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
+ auto *PHIRecipe =
+ Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr);
+ auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
+ auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
+ VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
+
+ // Note: first set Entry as region entry and then connect successors starting
+ // from it in order, to propagate the "parent" of each VPBasicBlock.
+ Entry->setTwoSuccessors(Pred, Exit);
+ Pred->setOneSuccessor(Exit);
+
+ return Region;
+}
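+
+// For example, a predicated load yields a replicate region of the following
+// triangular shape (a sketch of the blocks created above):
+//
+//          pred.load.entry        (VPBranchOnMaskRecipe)
+//             |         \
+//             |    pred.load.if   (PredRecipe: the replicated instruction)
+//             |         /
+//          pred.load.continue     (VPPredInstPHIRecipe, for non-void results)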
+
+VPlan *LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
+
+ DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
+ DenseMap<Instruction *, Instruction *> SinkAfterInverse;
+
+ // Collect instructions from the original loop that will become trivially dead
+ // in the vectorized loop. We don't need to vectorize these instructions. For
+ // example, original induction update instructions can become dead because we
+ // separately emit induction "steps" when generating code for the new loop.
+ // Similarly, we create a new latch condition when setting up the structure
+ // of the new loop, so the old one can become dead.
+ SmallPtrSet<Instruction *, 4> DeadInstructions;
+ collectTriviallyDeadInstructions(DeadInstructions);
+
+ // Hold a mapping from predicated instructions to their recipes, in order to
+ // fix their AlsoPack behavior if a user is determined to replicate and use a
+ // scalar instead of vector value.
+ DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;
+
+ // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
+ VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
+ VPlan *Plan = new VPlan(VPBB);
+
+ // Scan the body of the loop in a topological order to visit each basic block
+ // after having visited its predecessor basic blocks.
+ LoopBlocksDFS DFS(OrigLoop);
+ DFS.perform(LI);
+
+ for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
+    // Relevant instructions from basic block BB will be grouped into
+    // VPRecipes, which together fill a new VPBasicBlock.
+ unsigned VPBBsForBB = 0;
+ auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
+ VPBB->setOneSuccessor(FirstVPBBForBB);
+ VPBB = FirstVPBBForBB;
+ VPWidenRecipe *LastWidenRecipe = nullptr;
+
+ std::vector<Instruction *> Ingredients;
+
+    // Organize the ingredients to vectorize from the current basic block in
+    // the right order.
+ for (Instruction &I : *BB) {
+ Instruction *Instr = &I;
+
+ // First filter out irrelevant instructions, to ensure no recipes are
+ // built for them.
+ if (isa<BranchInst>(Instr) || isa<DbgInfoIntrinsic>(Instr) ||
+ DeadInstructions.count(Instr))
+ continue;
+
+      // If Instr is an adjunct member of an InterleaveGroup for Range.Start,
+      // i.e. a member other than the group's insert position, do not
+      // construct any Recipe for it.
+ const InterleaveGroup *IG = Legal->getInterleavedAccessGroup(Instr);
+ if (IG && Instr != IG->getInsertPos() &&
+ Range.Start >= 2 && // Query is illegal for VF == 1
+ CM.getWideningDecision(Instr, Range.Start) ==
+ LoopVectorizationCostModel::CM_Interleave)
+ continue;
+
+ // Move instructions to handle first-order recurrences, step 1: avoid
+ // handling this instruction until after we've handled the instruction it
+ // should follow.
+ auto SAIt = SinkAfter.find(Instr);
+ if (SAIt != SinkAfter.end()) {
+ DEBUG(dbgs() << "Sinking" << *SAIt->first << " after" << *SAIt->second
+ << " to vectorize a 1st order recurrence.\n");
+ SinkAfterInverse[SAIt->second] = Instr;
+ continue;
+ }
+
+ Ingredients.push_back(Instr);
+
+ // Move instructions to handle first-order recurrences, step 2: push the
+ // instruction to be sunk at its insertion point.
+ auto SAInvIt = SinkAfterInverse.find(Instr);
+ if (SAInvIt != SinkAfterInverse.end())
+ Ingredients.push_back(SAInvIt->second);
+ }
+
+ // Introduce each ingredient into VPlan.
+ for (Instruction *Instr : Ingredients) {
+ VPRecipeBase *Recipe = nullptr;
+
+ // Check if Instr should belong to an interleave memory recipe, or already
+ // does. In the latter case Instr is irrelevant.
+ if ((Recipe = tryToInterleaveMemory(Instr, Range))) {
+ VPBB->appendRecipe(Recipe);
+ continue;
+ }
+
+ // Check if Instr should form some PHI recipe.
+ if ((Recipe = tryToOptimizeInduction(Instr, Range))) {
+ VPBB->appendRecipe(Recipe);
+ continue;
+ }
+ if (PHINode *Phi = dyn_cast<PHINode>(Instr)) {
+ VPBB->appendRecipe(new VPWidenPHIRecipe(Phi));
+ continue;
+ }
+
+ // Check if Instr is to be widened by a general VPWidenRecipe, after
+ // having first checked for specific widening recipes that deal with
+ // Interleave Groups, Inductions and Phi nodes.
+ if ((Recipe = tryToWiden(Instr, LastWidenRecipe, Range))) {
+ if (Recipe != LastWidenRecipe)
+ VPBB->appendRecipe(Recipe);
+ LastWidenRecipe = cast<VPWidenRecipe>(Recipe);
+ continue;
+ }
+
+      // Otherwise, if all widening options failed, the instruction is to be
+      // replicated. This may create a successor for VPBB.
+ VPBasicBlock *NextVPBB =
+ handleReplication(Instr, Range, VPBB, PredInst2Recipe);
+ if (NextVPBB != VPBB) {
+ VPBB = NextVPBB;
+ VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
+ : "");
+ }
+ }
+ }
+
+  // Discard the empty dummy pre-entry VPBasicBlock. Note that other
+  // VPBasicBlocks may also be empty, such as the last one (VPBB), reflecting
+  // original basic-blocks with no recipes.
+ VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
+ assert(PreEntry->empty() && "Expecting empty pre-entry block.");
+ VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
+ PreEntry->disconnectSuccessor(Entry);
+ delete PreEntry;
+
+ std::string PlanName;
+ raw_string_ostream RSO(PlanName);
+ unsigned VF = Range.Start;
+ Plan->addVF(VF);
+ RSO << "Initial VPlan for VF={" << VF;
+ for (VF *= 2; VF < Range.End; VF *= 2) {
+ Plan->addVF(VF);
+ RSO << "," << VF;
+ }
+ RSO << "},UF>=1";
+ RSO.flush();
+ Plan->setName(PlanName);
+
+ return Plan;
+}
+
+void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent) const {
+ O << " +\n"
+ << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
+ IG->getInsertPos()->printAsOperand(O, false);
+ O << "\\l\"";
+ for (unsigned i = 0; i < IG->getFactor(); ++i)
+ if (Instruction *I = IG->getMember(i))
+ O << " +\n"
+ << Indent << "\" " << VPlanIngredient(I) << " " << i << "\\l\"";
+}
+
+void VPReplicateRecipe::execute(VPTransformState &State) {
+
+ if (State.Instance) { // Generate a single instance.
+ State.ILV->scalarizeInstruction(Ingredient, *State.Instance, IsPredicated);
+ // Insert scalar instance packing it into a vector.
+ if (AlsoPack && State.VF > 1) {
+ // If we're constructing lane 0, initialize to start from undef.
+ if (State.Instance->Lane == 0) {
+ Value *Undef =
+ UndefValue::get(VectorType::get(Ingredient->getType(), State.VF));
+ State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef);
+ }
+ State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance);
+ }
+ return;
+ }
+
+  // Generate scalar instances for all VF lanes of all UF parts, unless the
+  // instruction is uniform, in which case generate only the first lane for
+  // each of the UF parts.
+ unsigned EndLane = IsUniform ? 1 : State.VF;
+ for (unsigned Part = 0; Part < State.UF; ++Part)
+ for (unsigned Lane = 0; Lane < EndLane; ++Lane)
+ State.ILV->scalarizeInstruction(Ingredient, {Part, Lane}, IsPredicated);
+}
+
+void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
+ assert(State.Instance && "Branch on Mask works only on single instance.");
+
+ unsigned Part = State.Instance->Part;
+ unsigned Lane = State.Instance->Lane;
+
+ auto Cond = State.ILV->createBlockInMask(MaskedBasicBlock);
+
+ Value *ConditionBit = Cond[Part];
+ if (!ConditionBit) // Block in mask is all-one.
+ ConditionBit = State.Builder.getTrue();
+ else if (ConditionBit->getType()->isVectorTy())
+ ConditionBit = State.Builder.CreateExtractElement(
+ ConditionBit, State.Builder.getInt32(Lane));
+
+ // Replace the temporary unreachable terminator with a new conditional branch,
+ // whose two destinations will be set later when they are created.
+ auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
+ assert(isa<UnreachableInst>(CurrentTerminator) &&
+ "Expected to replace unreachable terminator with conditional branch.");
+ auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
+ CondBr->setSuccessor(0, nullptr);
+ ReplaceInstWithInst(CurrentTerminator, CondBr);
+
+ DEBUG(dbgs() << "\nLV: vectorizing BranchOnMask recipe "
+ << MaskedBasicBlock->getName());
+}
+
+void VPPredInstPHIRecipe::execute(VPTransformState &State) {
+ assert(State.Instance && "Predicated instruction PHI works per instance.");
+ Instruction *ScalarPredInst = cast<Instruction>(
+ State.ValueMap.getScalarValue(PredInst, *State.Instance));
+ BasicBlock *PredicatedBB = ScalarPredInst->getParent();
+ BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
+ assert(PredicatingBB && "Predicated block has no single predecessor.");
+
+ // By current pack/unpack logic we need to generate only a single phi node: if
+ // a vector value for the predicated instruction exists at this point it means
+ // the instruction has vector users only, and a phi for the vector value is
+ // needed. In this case the recipe of the predicated instruction is marked to
+ // also do that packing, thereby "hoisting" the insert-element sequence.
+ // Otherwise, a phi node for the scalar value is needed.
+ unsigned Part = State.Instance->Part;
+ if (State.ValueMap.hasVectorValue(PredInst, Part)) {
+ Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
+ InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
+ PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
+ VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
+ VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
+ State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache.
+ } else {
+ Type *PredInstType = PredInst->getType();
+ PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
+ Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB);
+ Phi->addIncoming(ScalarPredInst, PredicatedBB);
+ State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi);
+ }
+}
+
bool LoopVectorizePass::processLoop(Loop *L) {
assert(L->empty() && "Only process inner loops.");
CM.collectValuesToIgnore();
// Use the planner for vectorization.
- LoopVectorizationPlanner LVP(L, LI, &LVL, CM);
+ LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM);
// Get user vectorization factor.
unsigned UserVF = Hints.getWidth();
DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
}
+ LVP.setBestPlan(VF.Width, IC);
+
using namespace ore;
if (!VectorizeLoop) {
assert(IC > 1 && "interleave count should not be 1 or 0");
// interleave it.
InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
&CM);
- LVP.executePlan(Unroller);
+ LVP.executePlan(Unroller, DT);
ORE->emit(OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
L->getHeader())
// If we decided that it is *legal* to vectorize the loop, then do it.
InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
&LVL, &CM);
- LVP.executePlan(LB);
+ LVP.executePlan(LB, DT);
++LoopsVectorized;
// Add metadata to disable runtime unrolling a scalar loop when there are
--- /dev/null
+//===- VPlan.h - Represent A Vectorizer Plan ------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the declarations of the Vectorization Plan base classes:
+/// 1. VPBasicBlock and VPRegionBlock that inherit from a common pure virtual
+/// VPBlockBase, together implementing a Hierarchical CFG;
+/// 2. Specializations of GraphTraits that allow VPBlockBase graphs to be
+/// treated as proper graphs for generic algorithms;
+/// 3. Pure virtual VPRecipeBase serving as the base class for recipes contained
+/// within VPBasicBlocks;
+/// 4. The VPlan class holding a candidate for vectorization;
+/// 5. The VPlanPrinter class providing a way to print a plan in dot format.
+/// These are documented in docs/VectorizationPlan.rst.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLAN_H
+#define LLVM_TRANSFORMS_VECTORIZE_VPLAN_H
+
+#include "llvm/ADT/GraphTraits.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/ilist.h"
+#include "llvm/ADT/ilist_node.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/Support/raw_ostream.h"
+
+// The (re)use of existing LoopVectorize classes is subject to future VPlan
+// refactoring.
+namespace {
+// Forward declarations.
+class LoopVectorizationLegality;
+class LoopVectorizationCostModel;
+} // namespace
+
+namespace llvm {
+
+// Forward declarations.
+class BasicBlock;
+class InnerLoopVectorizer;
+class VPBasicBlock;
+
+/// In what follows, the term "input IR" refers to code that is fed into the
+/// vectorizer whereas the term "output IR" refers to code that is generated by
+/// the vectorizer.
+
+/// VPIteration represents a single point in the iteration space of the output
+/// (vectorized and/or unrolled) IR loop.
+struct VPIteration {
+ unsigned Part; ///< in [0..UF)
+ unsigned Lane; ///< in [0..VF)
+};
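+
+// For example, with UF=2 and VF=4, VPIteration{1, 2} designates lane 2 within
+// the second unrolled part of the output loop.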
+
+/// This is a helper struct for maintaining vectorization state. It's used for
+/// mapping values from the original loop to their corresponding values in
+/// the new loop. Two mappings are maintained: one for vectorized values and
+/// one for scalarized values. Vectorized values are represented with UF
+/// vector values in the new loop, and scalarized values are represented with
+/// UF x VF scalar values in the new loop. UF and VF are the unroll and
+/// vectorization factors, respectively.
+///
+/// Entries can be added to either map with setVectorValue and setScalarValue,
+/// which assert that an entry was not already added before. If an entry is to
+/// replace an existing one, call resetVectorValue and resetScalarValue. This is
+/// currently needed to modify the mapped values during "fix-up" operations that
+/// occur once the first phase of widening is complete. These operations include
+/// type truncation and the second phase of recurrence widening.
+///
+/// Entries from either map can be retrieved using the getVectorValue and
+/// getScalarValue functions, which assert that the desired value exists.
+struct VectorizerValueMap {
+private:
+ /// The unroll factor. Each entry in the vector map contains UF vector values.
+ unsigned UF;
+
+ /// The vectorization factor. Each entry in the scalar map contains UF x VF
+ /// scalar values.
+ unsigned VF;
+
+ /// The vector and scalar map storage. We use std::map and not DenseMap
+ /// because insertions to DenseMap invalidate its iterators.
+ typedef SmallVector<Value *, 2> VectorParts;
+ typedef SmallVector<SmallVector<Value *, 4>, 2> ScalarParts;
+ std::map<Value *, VectorParts> VectorMapStorage;
+ std::map<Value *, ScalarParts> ScalarMapStorage;
+
+public:
+ /// Construct an empty map with the given unroll and vectorization factors.
+ VectorizerValueMap(unsigned UF, unsigned VF) : UF(UF), VF(VF) {}
+
+ /// \return True if the map has any vector entry for \p Key.
+ bool hasAnyVectorValue(Value *Key) const {
+ return VectorMapStorage.count(Key);
+ }
+
+ /// \return True if the map has a vector entry for \p Key and \p Part.
+ bool hasVectorValue(Value *Key, unsigned Part) const {
+ assert(Part < UF && "Queried Vector Part is too large.");
+ if (!hasAnyVectorValue(Key))
+ return false;
+ const VectorParts &Entry = VectorMapStorage.find(Key)->second;
+ assert(Entry.size() == UF && "VectorParts has wrong dimensions.");
+ return Entry[Part] != nullptr;
+ }
+
+ /// \return True if the map has any scalar entry for \p Key.
+ bool hasAnyScalarValue(Value *Key) const {
+ return ScalarMapStorage.count(Key);
+ }
+
+ /// \return True if the map has a scalar entry for \p Key and \p Instance.
+ bool hasScalarValue(Value *Key, const VPIteration &Instance) const {
+ assert(Instance.Part < UF && "Queried Scalar Part is too large.");
+ assert(Instance.Lane < VF && "Queried Scalar Lane is too large.");
+ if (!hasAnyScalarValue(Key))
+ return false;
+ const ScalarParts &Entry = ScalarMapStorage.find(Key)->second;
+ assert(Entry.size() == UF && "ScalarParts has wrong dimensions.");
+ assert(Entry[Instance.Part].size() == VF &&
+ "ScalarParts has wrong dimensions.");
+ return Entry[Instance.Part][Instance.Lane] != nullptr;
+ }
+
+ /// Retrieve the existing vector value that corresponds to \p Key and
+ /// \p Part.
+ Value *getVectorValue(Value *Key, unsigned Part) {
+ assert(hasVectorValue(Key, Part) && "Getting non-existent value.");
+ return VectorMapStorage[Key][Part];
+ }
+
+ /// Retrieve the existing scalar value that corresponds to \p Key and
+ /// \p Instance.
+ Value *getScalarValue(Value *Key, const VPIteration &Instance) {
+ assert(hasScalarValue(Key, Instance) && "Getting non-existent value.");
+ return ScalarMapStorage[Key][Instance.Part][Instance.Lane];
+ }
+
+ /// Set a vector value associated with \p Key and \p Part. Assumes such a
+ /// value is not already set. If it is, use resetVectorValue() instead.
+ void setVectorValue(Value *Key, unsigned Part, Value *Vector) {
+ assert(!hasVectorValue(Key, Part) && "Vector value already set for part");
+ if (!VectorMapStorage.count(Key)) {
+ VectorParts Entry(UF);
+ VectorMapStorage[Key] = Entry;
+ }
+ VectorMapStorage[Key][Part] = Vector;
+ }
+
+ /// Set a scalar value associated with \p Key and \p Instance. Assumes such a
+ /// value is not already set.
+ void setScalarValue(Value *Key, const VPIteration &Instance, Value *Scalar) {
+ assert(!hasScalarValue(Key, Instance) && "Scalar value already set");
+ if (!ScalarMapStorage.count(Key)) {
+ ScalarParts Entry(UF);
+ // TODO: Consider storing uniform values only per-part, as they occupy
+ // lane 0 only, keeping the other VF-1 redundant entries null.
+ for (unsigned Part = 0; Part < UF; ++Part)
+ Entry[Part].resize(VF, nullptr);
+ ScalarMapStorage[Key] = Entry;
+ }
+ ScalarMapStorage[Key][Instance.Part][Instance.Lane] = Scalar;
+ }
+
+ /// Reset the vector value associated with \p Key for the given \p Part.
+ /// This function can be used to update values that have already been
+ /// vectorized. This is the case for "fix-up" operations including type
+ /// truncation and the second phase of recurrence vectorization.
+ void resetVectorValue(Value *Key, unsigned Part, Value *Vector) {
+ assert(hasVectorValue(Key, Part) && "Vector value not set for part");
+ VectorMapStorage[Key][Part] = Vector;
+ }
+
+ /// Reset the scalar value associated with \p Key for \p Part and \p Lane.
+ /// This function can be used to update values that have already been
+ /// scalarized. This is the case for "fix-up" operations including scalar phi
+ /// nodes for scalarized and predicated instructions.
+ void resetScalarValue(Value *Key, const VPIteration &Instance,
+ Value *Scalar) {
+ assert(hasScalarValue(Key, Instance) &&
+ "Scalar value not set for part and lane");
+ ScalarMapStorage[Key][Instance.Part][Instance.Lane] = Scalar;
+ }
+};
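+
+// Usage sketch (illustrative only; Key, WideVal and TruncatedVal are
+// placeholder Value pointers): setting a widened value and later replacing it
+// during a fix-up phase such as type truncation:
+//
+//   VectorizerValueMap Map(/*UF=*/2, /*VF=*/4);
+//   Map.setVectorValue(Key, /*Part=*/0, WideVal);  // initial definition
+//   if (Map.hasVectorValue(Key, 0))
+//     Map.resetVectorValue(Key, 0, TruncatedVal);  // fix-up replaces it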
+
+/// VPTransformState holds information passed down when "executing" a VPlan,
+/// needed for generating the output IR.
+struct VPTransformState {
+
+ VPTransformState(unsigned VF, unsigned UF, class LoopInfo *LI,
+ class DominatorTree *DT, IRBuilder<> &Builder,
+ VectorizerValueMap &ValueMap, InnerLoopVectorizer *ILV)
+ : VF(VF), UF(UF), Instance(), LI(LI), DT(DT), Builder(Builder),
+ ValueMap(ValueMap), ILV(ILV) {}
+
+ /// The chosen Vectorization and Unroll Factors of the loop being vectorized.
+ unsigned VF;
+ unsigned UF;
+
+  /// Hold the indices to generate specific scalar instructions. An empty
+  /// Optional indicates that all instances are to be generated, using either
+  /// scalar or vector instructions.
+ Optional<VPIteration> Instance;
+
+ /// Hold state information used when constructing the CFG of the output IR,
+ /// traversing the VPBasicBlocks and generating corresponding IR BasicBlocks.
+ struct CFGState {
+ /// The previous VPBasicBlock visited. Initially set to null.
+ VPBasicBlock *PrevVPBB;
+ /// The previous IR BasicBlock created or used. Initially set to the new
+ /// header BasicBlock.
+ BasicBlock *PrevBB;
+ /// The last IR BasicBlock in the output IR. Set to the new latch
+ /// BasicBlock, used for placing the newly created BasicBlocks.
+ BasicBlock *LastBB;
+ /// A mapping of each VPBasicBlock to the corresponding BasicBlock. In case
+ /// of replication, maps the BasicBlock of the last replica created.
+ SmallDenseMap<VPBasicBlock *, BasicBlock *> VPBB2IRBB;
+
+ CFGState() : PrevVPBB(nullptr), PrevBB(nullptr), LastBB(nullptr) {}
+ } CFG;
+
+ /// Hold a pointer to LoopInfo to register new basic blocks in the loop.
+ class LoopInfo *LI;
+
+ /// Hold a pointer to Dominator Tree to register new basic blocks in the loop.
+ class DominatorTree *DT;
+
+ /// Hold a reference to the IRBuilder used to generate output IR code.
+ IRBuilder<> &Builder;
+
+ /// Hold a reference to the Value state information used when generating the
+ /// Values of the output IR.
+ VectorizerValueMap &ValueMap;
+
+ /// Hold a pointer to InnerLoopVectorizer to reuse its IR generation methods.
+ class InnerLoopVectorizer *ILV;
+};
+
+/// VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
+/// A VPBlockBase can be either a VPBasicBlock or a VPRegionBlock.
+class VPBlockBase {
+private:
+ const unsigned char SubclassID; ///< Subclass identifier (for isa/dyn_cast).
+
+ /// An optional name for the block.
+ std::string Name;
+
+ /// The immediate VPRegionBlock which this VPBlockBase belongs to, or null if
+ /// it is a topmost VPBlockBase.
+ class VPRegionBlock *Parent;
+
+ /// List of predecessor blocks.
+ SmallVector<VPBlockBase *, 1> Predecessors;
+
+ /// List of successor blocks.
+ SmallVector<VPBlockBase *, 1> Successors;
+
+ /// Add \p Successor as the last successor to this block.
+ void appendSuccessor(VPBlockBase *Successor) {
+ assert(Successor && "Cannot add nullptr successor!");
+ Successors.push_back(Successor);
+ }
+
+ /// Add \p Predecessor as the last predecessor to this block.
+ void appendPredecessor(VPBlockBase *Predecessor) {
+ assert(Predecessor && "Cannot add nullptr predecessor!");
+ Predecessors.push_back(Predecessor);
+ }
+
+ /// Remove \p Predecessor from the predecessors of this block.
+ void removePredecessor(VPBlockBase *Predecessor) {
+ auto Pos = std::find(Predecessors.begin(), Predecessors.end(), Predecessor);
+ assert(Pos && "Predecessor does not exist");
+ Predecessors.erase(Pos);
+ }
+
+ /// Remove \p Successor from the successors of this block.
+ void removeSuccessor(VPBlockBase *Successor) {
+ auto Pos = std::find(Successors.begin(), Successors.end(), Successor);
+ assert(Pos && "Successor does not exist");
+ Successors.erase(Pos);
+ }
+
+protected:
+ VPBlockBase(const unsigned char SC, const std::string &N)
+ : SubclassID(SC), Name(N), Parent(nullptr) {}
+
+public:
+  /// An enumeration for keeping track of the concrete subclass of VPBlockBase
+  /// that is actually instantiated. Values of this enumeration are kept in the
+ /// SubclassID field of the VPBlockBase objects. They are used for concrete
+ /// type identification.
+ typedef enum { VPBasicBlockSC, VPRegionBlockSC } VPBlockTy;
+
+ typedef SmallVectorImpl<VPBlockBase *> VPBlocksTy;
+
+ virtual ~VPBlockBase() {}
+
+ const std::string &getName() const { return Name; }
+
+ void setName(const Twine &newName) { Name = newName.str(); }
+
+ /// \return an ID for the concrete type of this object.
+ /// This is used to implement the classof checks. This should not be used
+ /// for any other purpose, as the values may change as LLVM evolves.
+ unsigned getVPBlockID() const { return SubclassID; }
+
+ const VPRegionBlock *getParent() const { return Parent; }
+
+ void setParent(VPRegionBlock *P) { Parent = P; }
+
+ /// \return the VPBasicBlock that is the entry of this VPBlockBase,
+ /// recursively, if the latter is a VPRegionBlock. Otherwise, if this
+ /// VPBlockBase is a VPBasicBlock, it is returned.
+ const VPBasicBlock *getEntryBasicBlock() const;
+ VPBasicBlock *getEntryBasicBlock();
+
+ /// \return the VPBasicBlock that is the exit of this VPBlockBase,
+ /// recursively, if the latter is a VPRegionBlock. Otherwise, if this
+ /// VPBlockBase is a VPBasicBlock, it is returned.
+ const VPBasicBlock *getExitBasicBlock() const;
+ VPBasicBlock *getExitBasicBlock();
+
+ const VPBlocksTy &getSuccessors() const { return Successors; }
+ VPBlocksTy &getSuccessors() { return Successors; }
+
+ const VPBlocksTy &getPredecessors() const { return Predecessors; }
+ VPBlocksTy &getPredecessors() { return Predecessors; }
+
+ /// \return the successor of this VPBlockBase if it has a single successor.
+ /// Otherwise return a null pointer.
+ VPBlockBase *getSingleSuccessor() const {
+ return (Successors.size() == 1 ? *Successors.begin() : nullptr);
+ }
+
+ /// \return the predecessor of this VPBlockBase if it has a single
+ /// predecessor. Otherwise return a null pointer.
+ VPBlockBase *getSinglePredecessor() const {
+ return (Predecessors.size() == 1 ? *Predecessors.begin() : nullptr);
+ }
+
+ /// An Enclosing Block of a block B is any block containing B, including B
+ /// itself. \return the closest enclosing block starting from "this", which
+ /// has successors. \return the root enclosing block if all enclosing blocks
+ /// have no successors.
+ VPBlockBase *getEnclosingBlockWithSuccessors();
+
+ /// \return the closest enclosing block starting from "this", which has
+ /// predecessors. \return the root enclosing block if all enclosing blocks
+ /// have no predecessors.
+ VPBlockBase *getEnclosingBlockWithPredecessors();
+
+ /// \return the successors either attached directly to this VPBlockBase or, if
+ /// this VPBlockBase is the exit block of a VPRegionBlock and has no
+ /// successors of its own, search recursively for the first enclosing
+ /// VPRegionBlock that has successors and return them. If no such
+ /// VPRegionBlock exists, return the (empty) successors of the topmost
+ /// VPBlockBase reached.
+ const VPBlocksTy &getHierarchicalSuccessors() {
+ return getEnclosingBlockWithSuccessors()->getSuccessors();
+ }
+
+ /// \return the hierarchical successor of this VPBlockBase if it has a single
+ /// hierarchical successor. Otherwise return a null pointer.
+ VPBlockBase *getSingleHierarchicalSuccessor() {
+ return getEnclosingBlockWithSuccessors()->getSingleSuccessor();
+ }
+
+ /// \return the predecessors either attached directly to this VPBlockBase or,
+ /// if this VPBlockBase is the entry block of a VPRegionBlock and has no
+ /// predecessors of its own, search recursively for the first enclosing
+ /// VPRegionBlock that has predecessors and return them. If no such
+ /// VPRegionBlock exists, return the (empty) predecessors of the topmost
+ /// VPBlockBase reached.
+ const VPBlocksTy &getHierarchicalPredecessors() {
+ return getEnclosingBlockWithPredecessors()->getPredecessors();
+ }
+
+ /// \return the hierarchical predecessor of this VPBlockBase if it has a
+ /// single hierarchical predecessor. Otherwise return a null pointer.
+ VPBlockBase *getSingleHierarchicalPredecessor() {
+ return getEnclosingBlockWithPredecessors()->getSinglePredecessor();
+ }
+
+ /// Sets a given VPBlockBase \p Successor as the single successor and \return
+ /// \p Successor. The parent of this Block is copied to be the parent of
+ /// \p Successor.
+ VPBlockBase *setOneSuccessor(VPBlockBase *Successor) {
+ assert(Successors.empty() && "Setting one successor when others exist.");
+ appendSuccessor(Successor);
+ Successor->appendPredecessor(this);
+ Successor->Parent = Parent;
+ return Successor;
+ }
+
+ /// Sets two given VPBlockBases \p IfTrue and \p IfFalse to be the two
+ /// successors. The parent of this Block is copied to be the parent of both
+ /// \p IfTrue and \p IfFalse.
+ void setTwoSuccessors(VPBlockBase *IfTrue, VPBlockBase *IfFalse) {
+ assert(Successors.empty() && "Setting two successors when others exist.");
+ appendSuccessor(IfTrue);
+ appendSuccessor(IfFalse);
+ IfTrue->appendPredecessor(this);
+ IfFalse->appendPredecessor(this);
+ IfTrue->Parent = Parent;
+ IfFalse->Parent = Parent;
+ }
+
+ void disconnectSuccessor(VPBlockBase *Successor) {
+ assert(Successor && "Successor to disconnect is null.");
+ removeSuccessor(Successor);
+ Successor->removePredecessor(this);
+ }
+
+  /// The method which generates the output IR that corresponds to this
+  /// VPBlockBase, thereby "executing" the VPlan.
+ virtual void execute(struct VPTransformState *State) = 0;
+
+ /// Delete all blocks reachable from a given VPBlockBase, inclusive.
+ static void deleteCFG(VPBlockBase *Entry);
+};
+
+/// VPRecipeBase is a base class modeling a sequence of one or more output IR
+/// instructions.
+class VPRecipeBase : public ilist_node_with_parent<VPRecipeBase, VPBasicBlock> {
+ friend VPBasicBlock;
+
+private:
+ const unsigned char SubclassID; ///< Subclass identifier (for isa/dyn_cast).
+
+ /// Each VPRecipe belongs to a single VPBasicBlock.
+ VPBasicBlock *Parent;
+
+public:
+ /// An enumeration for keeping track of the concrete subclass of VPRecipeBase
+ /// that is actually instantiated. Values of this enumeration are kept in the
+ /// SubclassID field of the VPRecipeBase objects. They are used for concrete
+ /// type identification.
+ typedef enum {
+ VPBranchOnMaskSC,
+ VPInterleaveSC,
+ VPPredInstPHISC,
+ VPReplicateSC,
+ VPWidenIntOrFpInductionSC,
+ VPWidenPHISC,
+ VPWidenSC,
+ } VPRecipeTy;
+
+ VPRecipeBase(const unsigned char SC) : SubclassID(SC), Parent(nullptr) {}
+
+ virtual ~VPRecipeBase() {}
+
+ /// \return an ID for the concrete type of this object.
+ /// This is used to implement the classof checks. This should not be used
+ /// for any other purpose, as the values may change as LLVM evolves.
+ unsigned getVPRecipeID() const { return SubclassID; }
+
+ /// \return the VPBasicBlock which this VPRecipe belongs to.
+ VPBasicBlock *getParent() { return Parent; }
+ const VPBasicBlock *getParent() const { return Parent; }
+
+ /// The method which generates the output IR instructions that correspond to
+ /// this VPRecipe, thereby "executing" the VPlan.
+ virtual void execute(struct VPTransformState &State) = 0;
+
+ /// Each recipe prints itself.
+ virtual void print(raw_ostream &O, const Twine &Indent) const = 0;
+};
+
+/// VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph. It
+/// holds a sequence of zero or more VPRecipes, each representing a sequence of
+/// output IR instructions.
+class VPBasicBlock : public VPBlockBase {
+public:
+ typedef iplist<VPRecipeBase> RecipeListTy;
+
+private:
+ /// The VPRecipes held in the order of output instructions to generate.
+ RecipeListTy Recipes;
+
+public:
+  /// Recipe iterators...
+ typedef RecipeListTy::iterator iterator;
+ typedef RecipeListTy::const_iterator const_iterator;
+ typedef RecipeListTy::reverse_iterator reverse_iterator;
+ typedef RecipeListTy::const_reverse_iterator const_reverse_iterator;
+
+ //===--------------------------------------------------------------------===//
+ /// Recipe iterator methods
+ ///
+ inline iterator begin() { return Recipes.begin(); }
+ inline const_iterator begin() const { return Recipes.begin(); }
+ inline iterator end() { return Recipes.end(); }
+ inline const_iterator end() const { return Recipes.end(); }
+
+ inline reverse_iterator rbegin() { return Recipes.rbegin(); }
+ inline const_reverse_iterator rbegin() const { return Recipes.rbegin(); }
+ inline reverse_iterator rend() { return Recipes.rend(); }
+ inline const_reverse_iterator rend() const { return Recipes.rend(); }
+
+ inline size_t size() const { return Recipes.size(); }
+ inline bool empty() const { return Recipes.empty(); }
+ inline const VPRecipeBase &front() const { return Recipes.front(); }
+ inline VPRecipeBase &front() { return Recipes.front(); }
+ inline const VPRecipeBase &back() const { return Recipes.back(); }
+ inline VPRecipeBase &back() { return Recipes.back(); }
+
+ /// \brief Returns a pointer to a member of the recipe list.
+ static RecipeListTy VPBasicBlock::*getSublistAccess(VPRecipeBase *) {
+ return &VPBasicBlock::Recipes;
+ }
+
+ VPBasicBlock(const Twine &Name = "", VPRecipeBase *Recipe = nullptr)
+ : VPBlockBase(VPBasicBlockSC, Name.str()) {
+ if (Recipe)
+ appendRecipe(Recipe);
+ }
+
+ ~VPBasicBlock() { Recipes.clear(); }
+
+ /// Method to support type inquiry through isa, cast, and dyn_cast.
+ static inline bool classof(const VPBlockBase *V) {
+ return V->getVPBlockID() == VPBlockBase::VPBasicBlockSC;
+ }
+
+ /// Augment the existing recipes of a VPBasicBlock with an additional
+ /// \p Recipe as the last recipe.
+ void appendRecipe(VPRecipeBase *Recipe) {
+ assert(Recipe && "No recipe to append.");
+ assert(!Recipe->Parent && "Recipe already in VPlan");
+ Recipe->Parent = this;
+    Recipes.push_back(Recipe);
+ }
+
+ /// The method which generates the output IR instructions that correspond to
+ /// this VPBasicBlock, thereby "executing" the VPlan.
+ void execute(struct VPTransformState *State) override;
+
+private:
+ /// Create an IR BasicBlock to hold the output instructions generated by this
+ /// VPBasicBlock, and return it. Update the CFGState accordingly.
+ BasicBlock *createEmptyBasicBlock(VPTransformState::CFGState &CFG);
+};
+
+/// VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks
+/// which form a Single-Entry-Single-Exit subgraph of the output IR CFG.
+/// A VPRegionBlock may indicate that its contents are to be replicated several
+/// times. This is designed to support predicated scalarization, in which a
+/// scalar if-then code structure needs to be generated VF * UF times. Having
+/// this replication indicator helps to keep a single model for multiple
+/// candidate VF's. The actual replication takes place only once the desired VF
+/// and UF have been determined.
+class VPRegionBlock : public VPBlockBase {
+private:
+ /// Hold the Single Entry of the SESE region modelled by the VPRegionBlock.
+ VPBlockBase *Entry;
+
+ /// Hold the Single Exit of the SESE region modelled by the VPRegionBlock.
+ VPBlockBase *Exit;
+
+ /// An indicator whether this region is to generate multiple replicated
+ /// instances of output IR corresponding to its VPBlockBases.
+ bool IsReplicator;
+
+public:
+ VPRegionBlock(VPBlockBase *Entry, VPBlockBase *Exit,
+ const std::string &Name = "", bool IsReplicator = false)
+ : VPBlockBase(VPRegionBlockSC, Name), Entry(Entry), Exit(Exit),
+ IsReplicator(IsReplicator) {
+ assert(Entry->getPredecessors().empty() && "Entry block has predecessors.");
+ assert(Exit->getSuccessors().empty() && "Exit block has successors.");
+ Entry->setParent(this);
+ Exit->setParent(this);
+ }
+
+ ~VPRegionBlock() {
+ if (Entry)
+ deleteCFG(Entry);
+ }
+
+ /// Method to support type inquiry through isa, cast, and dyn_cast.
+ static inline bool classof(const VPBlockBase *V) {
+ return V->getVPBlockID() == VPBlockBase::VPRegionBlockSC;
+ }
+
+ const VPBlockBase *getEntry() const { return Entry; }
+ VPBlockBase *getEntry() { return Entry; }
+
+ const VPBlockBase *getExit() const { return Exit; }
+ VPBlockBase *getExit() { return Exit; }
+
+ /// An indicator whether this region is to generate multiple replicated
+ /// instances of output IR corresponding to its VPBlockBases.
+ bool isReplicator() const { return IsReplicator; }
+
+ /// The method which generates the output IR instructions that correspond to
+ /// this VPRegionBlock, thereby "executing" the VPlan.
+ void execute(struct VPTransformState *State) override;
+};
+
+/// VPlan models a candidate for vectorization, encoding various decisions
+/// taken to produce efficient output IR, including which branches,
+/// basic-blocks and output IR instructions to generate, and their cost. VPlan
+/// holds a Hierarchical-CFG of VPBasicBlocks and VPRegionBlocks rooted at an
+/// Entry VPBlock.
+class VPlan {
+private:
+ /// Hold the single entry to the Hierarchical CFG of the VPlan.
+ VPBlockBase *Entry;
+
+ /// Holds the VFs applicable to this VPlan.
+ SmallSet<unsigned, 2> VFs;
+
+ /// Holds the name of the VPlan, for printing.
+ std::string Name;
+
+public:
+ VPlan(VPBlockBase *Entry = nullptr) : Entry(Entry) {}
+
+ ~VPlan() {
+ if (Entry)
+ VPBlockBase::deleteCFG(Entry);
+ }
+
+ /// Generate the IR code for this VPlan.
+ void execute(struct VPTransformState *State);
+
+ VPBlockBase *getEntry() { return Entry; }
+ const VPBlockBase *getEntry() const { return Entry; }
+
+ VPBlockBase *setEntry(VPBlockBase *Block) { return Entry = Block; }
+
+ void addVF(unsigned VF) { VFs.insert(VF); }
+
+ bool hasVF(unsigned VF) { return VFs.count(VF); }
+
+ const std::string &getName() const { return Name; }
+
+ void setName(const Twine &newName) { Name = newName.str(); }
+
+private:
+ /// Add to the given dominator tree the header block and every new basic block
+ /// that was created between it and the latch block, inclusive.
+ static void updateDominatorTree(class DominatorTree *DT,
+ BasicBlock *LoopPreHeaderBB,
+ BasicBlock *LoopLatchBB);
+};
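+
+// Usage sketch (illustrative only): building and dumping a trivial plan.
+//
+//   VPBasicBlock *Entry = new VPBasicBlock("entry");
+//   VPlan Plan(Entry); // The plan owns Entry and deletes it on destruction.
+//   Plan.addVF(4);
+//   Plan.setName("TrivialPlan");
+//   dbgs() << Plan;    // Printed in dot format via VPlanPrinter below.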
+
+/// VPlanPrinter prints a given VPlan to a given output stream. The printing is
+/// indented and follows the dot format.
+class VPlanPrinter {
+ friend inline raw_ostream &operator<<(raw_ostream &OS, VPlan &Plan);
+ friend inline raw_ostream &operator<<(raw_ostream &OS,
+ const struct VPlanIngredient &I);
+
+private:
+ raw_ostream &OS;
+ VPlan &Plan;
+ unsigned Depth;
+ unsigned TabWidth = 2;
+ std::string Indent;
+
+ unsigned BID = 0;
+
+ SmallDenseMap<const VPBlockBase *, unsigned> BlockID;
+
+ /// Handle indentation.
+ void bumpIndent(int b) { Indent = std::string((Depth += b) * TabWidth, ' '); }
+
+ /// Print a given \p Block of the Plan.
+ void dumpBlock(const VPBlockBase *Block);
+
+ /// Print the information related to the CFG edges going out of a given
+ /// \p Block, followed by printing the successor blocks themselves.
+ void dumpEdges(const VPBlockBase *Block);
+
+ /// Print a given \p BasicBlock, including its VPRecipes, followed by printing
+ /// its successor blocks.
+ void dumpBasicBlock(const VPBasicBlock *BasicBlock);
+
+ /// Print a given \p Region of the Plan.
+ void dumpRegion(const VPRegionBlock *Region);
+
+ unsigned getOrCreateBID(const VPBlockBase *Block) {
+ return BlockID.count(Block) ? BlockID[Block] : BlockID[Block] = BID++;
+ }
+
+ const Twine getOrCreateName(const VPBlockBase *Block);
+
+ const Twine getUID(const VPBlockBase *Block);
+
+ /// Print the information related to a CFG edge between two VPBlockBases.
+ void drawEdge(const VPBlockBase *From, const VPBlockBase *To, bool Hidden,
+ const Twine &Label);
+
+ VPlanPrinter(raw_ostream &O, VPlan &P) : OS(O), Plan(P) {}
+
+ void dump();
+
+ static void printAsIngredient(raw_ostream &O, Value *V);
+};
+
+struct VPlanIngredient {
+ Value *V;
+ VPlanIngredient(Value *V) : V(V) {}
+};
+
+inline raw_ostream &operator<<(raw_ostream &OS, const VPlanIngredient &I) {
+ VPlanPrinter::printAsIngredient(OS, I.V);
+ return OS;
+}
+
+inline raw_ostream &operator<<(raw_ostream &OS, VPlan &Plan) {
+ VPlanPrinter Printer(OS, Plan);
+ Printer.dump();
+ return OS;
+}
+
+//===--------------------------------------------------------------------===//
+// GraphTraits specializations for VPlan/VPRegionBlock Control-Flow Graphs //
+//===--------------------------------------------------------------------===//
+
+// Provide specializations of GraphTraits to be able to treat a VPBlockBase as a
+// graph of VPBlockBase nodes...
+
+template <> struct GraphTraits<VPBlockBase *> {
+ typedef VPBlockBase *NodeRef;
+ typedef SmallVectorImpl<VPBlockBase *>::iterator ChildIteratorType;
+
+ static NodeRef getEntryNode(NodeRef N) { return N; }
+
+ static inline ChildIteratorType child_begin(NodeRef N) {
+ return N->getSuccessors().begin();
+ }
+
+ static inline ChildIteratorType child_end(NodeRef N) {
+ return N->getSuccessors().end();
+ }
+};
+
+template <> struct GraphTraits<const VPBlockBase *> {
+ typedef const VPBlockBase *NodeRef;
+ typedef SmallVectorImpl<VPBlockBase *>::const_iterator ChildIteratorType;
+
+ static NodeRef getEntryNode(NodeRef N) { return N; }
+
+ static inline ChildIteratorType child_begin(NodeRef N) {
+ return N->getSuccessors().begin();
+ }
+
+ static inline ChildIteratorType child_end(NodeRef N) {
+ return N->getSuccessors().end();
+ }
+};
+
+// Provide specializations of GraphTraits to be able to treat a VPBlockBase as
+// a graph of VPBlockBase nodes, and to walk it in inverse order. Inverse order
+// for a VPBlockBase is obtained by traversing its predecessors instead of its
+// successors.
+//
+
+template <> struct GraphTraits<Inverse<VPBlockBase *>> {
+ typedef VPBlockBase *NodeRef;
+ typedef SmallVectorImpl<VPBlockBase *>::iterator ChildIteratorType;
+
+ static Inverse<VPBlockBase *> getEntryNode(Inverse<VPBlockBase *> B) {
+ return B;
+ }
+
+ static inline ChildIteratorType child_begin(NodeRef N) {
+ return N->getPredecessors().begin();
+ }
+
+ static inline ChildIteratorType child_end(NodeRef N) {
+ return N->getPredecessors().end();
+ }
+};
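+
+// With these specializations, generic graph algorithms operate directly on
+// the Hierarchical CFG; for example (illustrative only):
+//
+//   #include "llvm/ADT/DepthFirstIterator.h"
+//   for (VPBlockBase *Block : depth_first(Plan.getEntry()))
+//     dbgs() << Block->getName() << "\n";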
+
+} // namespace llvm
+
+#endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_H