diff options
author | Chandler Carruth <chandlerc@gmail.com> | 2017-08-20 23:17:11 +0000 |
---|---|---|
committer | Chandler Carruth <chandlerc@gmail.com> | 2017-08-20 23:17:11 +0000 |
commit | 3f7e6da69674b23fa4e4db7efff18366b8e43dc9 (patch) | |
tree | b8330386af9349596443ea26c48e749a7204fd8b /lib/Transforms/Vectorize/LoopVectorize.cpp | |
parent | b3bdcc1c1bf9b58c51d1a9fb617ab18ec8610357 (diff) |
Revert r311077: [LV] Using VPlan ...
This causes LLVM to assert fail on PPC64 and crash / infloop in other
cases. Filed http://llvm.org/PR34248 with reproducer attached.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@311304 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')
-rw-r--r-- | lib/Transforms/Vectorize/LoopVectorize.cpp | 1583 |
1 files changed, 523 insertions, 1060 deletions
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 6c417ba340f..44ffcfd3de0 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -47,7 +47,6 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Vectorize/LoopVectorize.h" -#include "VPlan.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/Hashing.h" #include "llvm/ADT/MapVector.h" @@ -99,7 +98,6 @@ #include "llvm/Transforms/Utils/LoopVersioning.h" #include "llvm/Transforms/Vectorize.h" #include <algorithm> -#include <functional> #include <map> #include <tuple> @@ -252,10 +250,6 @@ class LoopVectorizeHints; class LoopVectorizationLegality; class LoopVectorizationCostModel; class LoopVectorizationRequirements; -class VPInterleaveRecipe; -class VPReplicateRecipe; -class VPWidenIntOrFpInductionRecipe; -class VPWidenRecipe; /// Returns true if the given loop body has a cycle, excluding the loop /// itself. @@ -368,9 +362,6 @@ static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) { : ConstantFP::get(Ty, C); } -} // end anonymous namespace - -namespace llvm { /// InnerLoopVectorizer vectorizes loops which contain only one basic /// block to a specified vectorization factor (VF). /// This class performs the widening of scalars into vectors, or multiple @@ -402,11 +393,10 @@ public: AddedSafetyChecks(false) {} /// Create a new empty loop. Unlink the old loop and connect the new one. - /// Return the pre-header block of the new loop. - BasicBlock *createVectorizedLoopSkeleton(); + void createVectorizedLoopSkeleton(); - /// Widen a single instruction within the innermost loop. - void widenInstruction(Instruction &I); + /// Vectorize a single instruction within the innermost loop. + void vectorizeInstruction(Instruction &I); /// Fix the vectorized code, taking care of header phi's, live-outs, and more. void fixVectorizedLoop(); @@ -416,72 +406,15 @@ public: virtual ~InnerLoopVectorizer() {} +protected: + /// A small list of PHINodes. + typedef SmallVector<PHINode *, 4> PhiVector; + /// A type for vectorized values in the new loop. Each value from the /// original loop, when vectorized, is represented by UF vector values in the /// new unrolled loop, where UF is the unroll factor. typedef SmallVector<Value *, 2> VectorParts; - /// A helper function that computes the predicate of the block BB, assuming - /// that the header block of the loop is set to True. It returns the *entry* - /// mask for the block BB. - VectorParts createBlockInMask(BasicBlock *BB); - - /// Vectorize a single PHINode in a block. This method handles the induction - /// variable canonicalization. It supports both VF = 1 for unrolled loops and - /// arbitrary length vectors. - void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF); - - /// A helper function to scalarize a single Instruction in the innermost loop. - /// Generates a sequence of scalar instances for each lane between \p MinLane - /// and \p MaxLane, times each part between \p MinPart and \p MaxPart, - /// inclusive.. - void scalarizeInstruction(Instruction *Instr, const VPIteration &Instance, - bool IfPredicateInstr); - - /// Widen an integer or floating-point induction variable \p IV. If \p Trunc - /// is provided, the integer induction variable will first be truncated to - /// the corresponding type. - void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr); - - /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a - /// vector or scalar value on-demand if one is not yet available. When - /// vectorizing a loop, we visit the definition of an instruction before its - /// uses. When visiting the definition, we either vectorize or scalarize the - /// instruction, creating an entry for it in the corresponding map. (In some - /// cases, such as induction variables, we will create both vector and scalar - /// entries.) Then, as we encounter uses of the definition, we derive values - /// for each scalar or vector use unless such a value is already available. - /// For example, if we scalarize a definition and one of its uses is vector, - /// we build the required vector on-demand with an insertelement sequence - /// when visiting the use. Otherwise, if the use is scalar, we can use the - /// existing scalar definition. - /// - /// Return a value in the new loop corresponding to \p V from the original - /// loop at unroll index \p Part. If the value has already been vectorized, - /// the corresponding vector entry in VectorLoopValueMap is returned. If, - /// however, the value has a scalar entry in VectorLoopValueMap, we construct - /// a new vector value on-demand by inserting the scalar values into a vector - /// with an insertelement sequence. If the value has been neither vectorized - /// nor scalarized, it must be loop invariant, so we simply broadcast the - /// value into a vector. - Value *getOrCreateVectorValue(Value *V, unsigned Part); - - /// Return a value in the new loop corresponding to \p V from the original - /// loop at unroll and vector indices \p Instance. If the value has been - /// vectorized but not scalarized, the necessary extractelement instruction - /// will be generated. - Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance); - - /// Construct the vector value of a scalarized value \p V one lane at a time. - void packScalarIntoVectorValue(Value *V, const VPIteration &Instance); - - /// Try to vectorize the interleaved access group that \p Instr belongs to. - void vectorizeInterleaveGroup(Instruction *Instr); - -protected: - /// A small list of PHINodes. - typedef SmallVector<PHINode *, 4> PhiVector; - /// A type for scalarized values in the new loop. Each value from the /// original loop, when scalarized, is represented by UF x VF scalar values /// in the new unrolled loop, where UF is the unroll factor and VF is the @@ -524,18 +457,37 @@ protected: /// the block that was created for it. void sinkScalarOperands(Instruction *PredInst); + /// Predicate conditional instructions that require predication on their + /// respective conditions. + void predicateInstructions(); + /// Shrinks vector element sizes to the smallest bitwidth they can be legally /// represented as. void truncateToMinimalBitwidths(); + /// A helper function that computes the predicate of the block BB, assuming + /// that the header block of the loop is set to True. It returns the *entry* + /// mask for the block BB. + VectorParts createBlockInMask(BasicBlock *BB); /// A helper function that computes the predicate of the edge between SRC /// and DST. VectorParts createEdgeMask(BasicBlock *Src, BasicBlock *Dst); + /// Vectorize a single PHINode in a block. This method handles the induction + /// variable canonicalization. It supports both VF = 1 for unrolled loops and + /// arbitrary length vectors. + void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF); + /// Insert the new loop to the loop hierarchy and pass manager /// and update the analysis passes. void updateAnalysis(); + /// This instruction is un-vectorizable. Implement it as a sequence + /// of scalars. If \p IfPredicateInstr is true we need to 'hide' each + /// scalarized instruction behind an if block predicated on the control + /// dependence of the instruction. + void scalarizeInstruction(Instruction *Instr, bool IfPredicateInstr = false); + /// Vectorize Load and Store instructions, virtual void vectorizeMemoryInstruction(Instruction *Instr); @@ -569,6 +521,11 @@ protected: void createVectorIntOrFpInductionPHI(const InductionDescriptor &II, Value *Step, Instruction *EntryVal); + /// Widen an integer or floating-point induction variable \p IV. If \p Trunc + /// is provided, the integer induction variable will first be truncated to + /// the corresponding type. + void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr); + /// Returns true if an instruction \p I should be scalarized instead of /// vectorized for the chosen vectorization factor. bool shouldScalarizeInstruction(Instruction *I) const; @@ -576,6 +533,38 @@ protected: /// Returns true if we should generate a scalar version of \p IV. bool needsScalarInduction(Instruction *IV) const; + /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a + /// vector or scalar value on-demand if one is not yet available. When + /// vectorizing a loop, we visit the definition of an instruction before its + /// uses. When visiting the definition, we either vectorize or scalarize the + /// instruction, creating an entry for it in the corresponding map. (In some + /// cases, such as induction variables, we will create both vector and scalar + /// entries.) Then, as we encounter uses of the definition, we derive values + /// for each scalar or vector use unless such a value is already available. + /// For example, if we scalarize a definition and one of its uses is vector, + /// we build the required vector on-demand with an insertelement sequence + /// when visiting the use. Otherwise, if the use is scalar, we can use the + /// existing scalar definition. + /// + /// Return a value in the new loop corresponding to \p V from the original + /// loop at unroll index \p Part. If the value has already been vectorized, + /// the corresponding vector entry in VectorLoopValueMap is returned. If, + /// however, the value has a scalar entry in VectorLoopValueMap, we construct + /// a new vector value on-demand by inserting the scalar values into a vector + /// with an insertelement sequence. If the value has been neither vectorized + /// nor scalarized, it must be loop invariant, so we simply broadcast the + /// value into a vector. + Value *getOrCreateVectorValue(Value *V, unsigned Part); + + /// Return a value in the new loop corresponding to \p V from the original + /// loop at unroll index \p Part and vector index \p Lane. If the value has + /// been vectorized but not scalarized, the necessary extractelement + /// instruction will be generated. + Value *getOrCreateScalarValue(Value *V, unsigned Part, unsigned Lane); + + /// Try to vectorize the interleaved access group that \p Instr belongs to. + void vectorizeInterleaveGroup(Instruction *Instr); + /// Generate a shuffle sequence that will reverse the vector Vec. virtual Value *reverseVector(Value *Vec); @@ -616,6 +605,127 @@ protected: /// the instruction. void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr); + /// This is a helper class for maintaining vectorization state. It's used for + /// mapping values from the original loop to their corresponding values in + /// the new loop. Two mappings are maintained: one for vectorized values and + /// one for scalarized values. Vectorized values are represented with UF + /// vector values in the new loop, and scalarized values are represented with + /// UF x VF scalar values in the new loop. UF and VF are the unroll and + /// vectorization factors, respectively. + /// + /// Entries can be added to either map with setVectorValue and setScalarValue, + /// which assert that an entry was not already added before. If an entry is to + /// replace an existing one, call resetVectorValue. This is currently needed + /// to modify the mapped values during "fix-up" operations that occur once the + /// first phase of widening is complete. These operations include type + /// truncation and the second phase of recurrence widening. + /// + /// Entries from either map can be retrieved using the getVectorValue and + /// getScalarValue functions, which assert that the desired value exists. + + struct ValueMap { + + /// Construct an empty map with the given unroll and vectorization factors. + ValueMap(unsigned UF, unsigned VF) : UF(UF), VF(VF) {} + + /// \return True if the map has any vector entry for \p Key. + bool hasAnyVectorValue(Value *Key) const { + return VectorMapStorage.count(Key); + } + + /// \return True if the map has a vector entry for \p Key and \p Part. + bool hasVectorValue(Value *Key, unsigned Part) const { + assert(Part < UF && "Queried Vector Part is too large."); + if (!hasAnyVectorValue(Key)) + return false; + const VectorParts &Entry = VectorMapStorage.find(Key)->second; + assert(Entry.size() == UF && "VectorParts has wrong dimensions."); + return Entry[Part] != nullptr; + } + + /// \return True if the map has any scalar entry for \p Key. + bool hasAnyScalarValue(Value *Key) const { + return ScalarMapStorage.count(Key); + } + + /// \return True if the map has a scalar entry for \p Key, \p Part and + /// \p Part. + bool hasScalarValue(Value *Key, unsigned Part, unsigned Lane) const { + assert(Part < UF && "Queried Scalar Part is too large."); + assert(Lane < VF && "Queried Scalar Lane is too large."); + if (!hasAnyScalarValue(Key)) + return false; + const ScalarParts &Entry = ScalarMapStorage.find(Key)->second; + assert(Entry.size() == UF && "ScalarParts has wrong dimensions."); + assert(Entry[Part].size() == VF && "ScalarParts has wrong dimensions."); + return Entry[Part][Lane] != nullptr; + } + + /// Retrieve the existing vector value that corresponds to \p Key and + /// \p Part. + Value *getVectorValue(Value *Key, unsigned Part) { + assert(hasVectorValue(Key, Part) && "Getting non-existent value."); + return VectorMapStorage[Key][Part]; + } + + /// Retrieve the existing scalar value that corresponds to \p Key, \p Part + /// and \p Lane. + Value *getScalarValue(Value *Key, unsigned Part, unsigned Lane) { + assert(hasScalarValue(Key, Part, Lane) && "Getting non-existent value."); + return ScalarMapStorage[Key][Part][Lane]; + } + + /// Set a vector value associated with \p Key and \p Part. Assumes such a + /// value is not already set. If it is, use resetVectorValue() instead. + void setVectorValue(Value *Key, unsigned Part, Value *Vector) { + assert(!hasVectorValue(Key, Part) && "Vector value already set for part"); + if (!VectorMapStorage.count(Key)) { + VectorParts Entry(UF); + VectorMapStorage[Key] = Entry; + } + VectorMapStorage[Key][Part] = Vector; + } + + /// Set a scalar value associated with \p Key for \p Part and \p Lane. + /// Assumes such a value is not already set. + void setScalarValue(Value *Key, unsigned Part, unsigned Lane, + Value *Scalar) { + assert(!hasScalarValue(Key, Part, Lane) && "Scalar value already set"); + if (!ScalarMapStorage.count(Key)) { + ScalarParts Entry(UF); + for (unsigned Part = 0; Part < UF; ++Part) + Entry[Part].resize(VF, nullptr); + // TODO: Consider storing uniform values only per-part, as they occupy + // lane 0 only, keeping the other VF-1 redundant entries null. + ScalarMapStorage[Key] = Entry; + } + ScalarMapStorage[Key][Part][Lane] = Scalar; + } + + /// Reset the vector value associated with \p Key for the given \p Part. + /// This function can be used to update values that have already been + /// vectorized. This is the case for "fix-up" operations including type + /// truncation and the second phase of recurrence vectorization. + void resetVectorValue(Value *Key, unsigned Part, Value *Vector) { + assert(hasVectorValue(Key, Part) && "Vector value not set for part"); + VectorMapStorage[Key][Part] = Vector; + } + + private: + /// The unroll factor. Each entry in the vector map contains UF vector + /// values. + unsigned UF; + + /// The vectorization factor. Each entry in the scalar map contains UF x VF + /// scalar values. + unsigned VF; + + /// The vector and scalar map storage. We use std::map and not DenseMap + /// because insertions to DenseMap invalidate its iterators. + std::map<Value *, VectorParts> VectorMapStorage; + std::map<Value *, ScalarParts> ScalarMapStorage; + }; + /// The original loop. Loop *OrigLoop; /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies @@ -648,6 +758,7 @@ protected: /// vector elements. unsigned VF; +protected: /// The vectorization unroll factor to use. Each scalar is vectorized to this /// many different vector instructions. unsigned UF; @@ -681,10 +792,11 @@ protected: /// vectorized loop. A key value can map to either vector values, scalar /// values or both kinds of values, depending on whether the key was /// vectorized and scalarized. - VectorizerValueMap VectorLoopValueMap; + ValueMap VectorLoopValueMap; - /// Store instructions that were predicated. - SmallVector<Instruction *, 4> PredicatedInstructions; + /// Store instructions that should be predicated, as a pair + /// <StoreInst, Predicate> + SmallVector<std::pair<Instruction *, Value *>, 4> PredicatedInstructions; EdgeMaskCacheTy EdgeMaskCache; BlockMaskCacheTy BlockMaskCache; /// Trip count of the original loop. @@ -704,8 +816,6 @@ protected: // Holds the end values for each induction variable. We save the end values // so we can later fix-up the external users of the induction variables. DenseMap<PHINode *, Value *> IVEndValues; - - friend class LoopVectorizationPlanner; }; class InnerLoopUnroller : public InnerLoopVectorizer { @@ -721,6 +831,7 @@ public: UnrollFactor, LVL, CM) {} private: + void vectorizeMemoryInstruction(Instruction *Instr) override; Value *getBroadcastInstrs(Value *V) override; Value *getStepVector(Value *Val, int StartIdx, Value *Step, Instruction::BinaryOps Opcode = @@ -797,10 +908,6 @@ void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To, } } -} // namespace llvm - -namespace { - /// \brief The group of interleaved loads/stores sharing the same stride and /// close to each other. /// @@ -1822,7 +1929,6 @@ public: /// \returns True if it is more profitable to scalarize instruction \p I for /// vectorization factor \p VF. bool isProfitableToScalarize(Instruction *I, unsigned VF) const { - assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1."); auto Scalars = InstsToScalarize.find(VF); assert(Scalars != InstsToScalarize.end() && "VF not yet analyzed for scalarization profitability"); @@ -1936,22 +2042,6 @@ public: return Legal->isInductionVariable(Op); } - /// Collects the instructions to scalarize for each predicated instruction in - /// the loop. - void collectInstsToScalarize(unsigned VF); - - /// Collect Uniform and Scalar values for the given \p VF. - /// The sets depend on CM decision for Load/Store instructions - /// that may be vectorized as interleave, gather-scatter or scalarized. - void collectUniformsAndScalars(unsigned VF) { - // Do the analysis once. - if (VF == 1 || Uniforms.count(VF)) - return; - setCostBasedWideningDecision(VF); - collectLoopUniforms(VF); - collectLoopScalars(VF); - } - private: /// \return An upper bound for the vectorization factor, larger than zero. /// One is returned if vectorization should best be avoided due to cost. @@ -2053,6 +2143,10 @@ private: int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, unsigned VF); + /// Collects the instructions to scalarize for each predicated instruction in + /// the loop. + void collectInstsToScalarize(unsigned VF); + /// Collect the instructions that are uniform after vectorization. An /// instruction is uniform if we represent it with a single scalar value in /// the vectorized loop corresponding to each vector iteration. Examples of @@ -2071,6 +2165,18 @@ private: /// iteration of the original scalar loop. void collectLoopScalars(unsigned VF); + /// Collect Uniform and Scalar values for the given \p VF. + /// The sets depend on CM decision for Load/Store instructions + /// that may be vectorized as interleave, gather-scatter or scalarized. + void collectUniformsAndScalars(unsigned VF) { + // Do the analysis once. + if (VF == 1 || Uniforms.count(VF)) + return; + setCostBasedWideningDecision(VF); + collectLoopUniforms(VF); + collectLoopScalars(VF); + } + /// Keeps cost model vectorization decision and cost for instructions. /// Right now it is used for memory instructions only. typedef DenseMap<std::pair<Instruction *, unsigned>, @@ -2108,71 +2214,23 @@ public: SmallPtrSet<const Value *, 16> VecValuesToIgnore; }; -} // end anonymous namespace - -namespace llvm { -/// InnerLoopVectorizer vectorizes loops which contain only one basic /// LoopVectorizationPlanner - drives the vectorization process after having /// passed Legality checks. -/// The planner builds and optimizes the Vectorization Plans which record the -/// decisions how to vectorize the given loop. In particular, represent the -/// control-flow of the vectorized version, the replication of instructions that -/// are to be scalarized, and interleave access groups. class LoopVectorizationPlanner { - /// The loop that we evaluate. - Loop *OrigLoop; - - /// Loop Info analysis. - LoopInfo *LI; - - /// Target Library Info. - const TargetLibraryInfo *TLI; - - /// Target Transform Info. - const TargetTransformInfo *TTI; - - /// The legality analysis. - LoopVectorizationLegality *Legal; - - /// The profitablity analysis. - LoopVectorizationCostModel &CM; - - SmallVector<VPlan *, 4> VPlans; - - unsigned BestVF; - unsigned BestUF; - public: - LoopVectorizationPlanner(Loop *L, LoopInfo *LI, const TargetLibraryInfo *TLI, - const TargetTransformInfo *TTI, + LoopVectorizationPlanner(Loop *OrigLoop, LoopInfo *LI, LoopVectorizationLegality *Legal, LoopVectorizationCostModel &CM) - : OrigLoop(L), LI(LI), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM), - BestVF(0), BestUF(0) {} - - ~LoopVectorizationPlanner() { - while (!VPlans.empty()) { - VPlan *Plan = VPlans.back(); - VPlans.pop_back(); - delete Plan; - } - } + : OrigLoop(OrigLoop), LI(LI), Legal(Legal), CM(CM) {} + + ~LoopVectorizationPlanner() {} /// Plan how to best vectorize, return the best VF and its cost. LoopVectorizationCostModel::VectorizationFactor plan(bool OptForSize, unsigned UserVF); - /// Finalize the best decision and dispose of all other VPlans. - void setBestPlan(unsigned VF, unsigned UF); - - /// Generate the IR code for the body of the vectorized loop according to the - /// best selected VPlan. - void executePlan(InnerLoopVectorizer &LB, DominatorTree *DT); - - void printPlans(raw_ostream &O) { - for (VPlan *Plan : VPlans) - O << *Plan; - } + /// Generate the IR code for the vectorized loop. + void executePlan(InnerLoopVectorizer &ILV); protected: /// Collect the instructions from the original loop that would be trivially @@ -2180,77 +2238,20 @@ protected: void collectTriviallyDeadInstructions( SmallPtrSetImpl<Instruction *> &DeadInstructions); - /// A range of powers-of-2 vectorization factors with fixed start and - /// adjustable end. The range includes start and excludes end, e.g.,: - /// [1, 9) = {1, 2, 4, 8} - struct VFRange { - const unsigned Start; // A power of 2. - unsigned End; // Need not be a power of 2. If End <= Start range is empty. - }; +private: + /// The loop that we evaluate. + Loop *OrigLoop; - /// Test a \p Predicate on a \p Range of VF's. Return the value of applying - /// \p Predicate on Range.Start, possibly decreasing Range.End such that the - /// returned value holds for the entire \p Range. - bool getDecisionAndClampRange(const std::function<bool(unsigned)> &Predicate, - VFRange &Range); + /// Loop Info analysis. + LoopInfo *LI; - /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive, - /// according to the information gathered by Legal when it checked if it is - /// legal to vectorize the loop. - void buildVPlans(unsigned MinVF, unsigned MaxVF); + /// The legality analysis. + LoopVectorizationLegality *Legal; -private: - /// Check if \I belongs to an Interleave Group within the given VF \p Range, - /// \return true in the first returned value if so and false otherwise. - /// Build a new VPInterleaveGroup Recipe if \I is the primary member of an IG - /// for \p Range.Start, and provide it as the second returned value. - /// Note that if \I is an adjunct member of an IG for \p Range.Start, the - /// \return value is <true, nullptr>, as it is handled by another recipe. - /// \p Range.End may be decreased to ensure same decision from \p Range.Start - /// to \p Range.End. - VPInterleaveRecipe *tryToInterleaveMemory(Instruction *I, VFRange &Range); - - /// Check if an induction recipe should be constructed for \I within the given - /// VF \p Range. If so build and return it. If not, return null. \p Range.End - /// may be decreased to ensure same decision from \p Range.Start to - /// \p Range.End. - VPWidenIntOrFpInductionRecipe *tryToOptimizeInduction(Instruction *I, - VFRange &Range); - - /// Check if \I can be widened within the given VF \p Range. If \I can be - /// widened for Range.Start, extend \p LastWidenRecipe to include \p I if - /// possible or else build a new VPWidenRecipe for it, and return the - /// VPWidenRecipe that includes \p I. If \p I cannot be widened for - /// Range.Start \return null. Range.End may be decreased to ensure same - /// decision from \p Range.Start to \p Range.End. - VPWidenRecipe *tryToWiden(Instruction *I, VPWidenRecipe *LastWidenRecipe, - VFRange &Range); - - /// Build a VPReplicationRecipe for \p I and enclose it within a Region if it - /// is predicated. \return \p VPBB augmented with this new recipe if \p I is - /// not predicated, otherwise \return a new VPBasicBlock that succeeds the new - /// Region. Update the packing decision of predicated instructions if they - /// feed \p I. Range.End may be decreased to ensure same recipe behavior from - /// \p Range.Start to \p Range.End. - VPBasicBlock *handleReplication( - Instruction *I, VFRange &Range, VPBasicBlock *VPBB, - DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe); - - /// Create a replicating region for instruction \p I that requires - /// predication. \p PredRecipe is a VPReplicateRecipe holding \p I. - VPRegionBlock *createReplicateRegion(Instruction *I, - VPRecipeBase *PredRecipe); - - /// Build a VPlan according to the information gathered by Legal. \return a - /// VPlan for vectorization factors \p Range.Start and up to \p Range.End - /// exclusive, possibly decreasing \p Range.End. - VPlan *buildVPlan(VFRange &Range); + /// The profitablity analysis. + LoopVectorizationCostModel &CM; }; -} // namespace llvm - -namespace { - /// \brief This holds vectorization requirements that must be verified late in /// the process. The requirements are set by legalize and costmodel. Once /// vectorization has been determined to be possible and profitable the @@ -2671,15 +2672,15 @@ void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, // iteration. If EntryVal is uniform, we only need to generate the first // lane. Otherwise, we generate all VF values. unsigned Lanes = - Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1 - : VF; + Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1 : VF; + // Compute the scalar steps and save the results in VectorLoopValueMap. for (unsigned Part = 0; Part < UF; ++Part) { for (unsigned Lane = 0; Lane < Lanes; ++Lane) { auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane); auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step)); auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul)); - VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add); + VectorLoopValueMap.setScalarValue(EntryVal, Part, Lane, Add); } } } @@ -2717,7 +2718,7 @@ Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) { // vector form, we will construct the vector values on demand. if (VectorLoopValueMap.hasAnyScalarValue(V)) { - Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0}); + Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, Part, 0); // If we've scalarized a value, that value should be an instruction. auto *I = cast<Instruction>(V); @@ -2734,8 +2735,8 @@ Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) { // of the Part unroll iteration. Otherwise, the last instruction is the one // we created for the last vector lane of the Part unroll iteration. unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1; - auto *LastInst = cast<Instruction>( - VectorLoopValueMap.getScalarValue(V, {Part, LastLane})); + auto *LastInst = + cast<Instruction>(VectorLoopValueMap.getScalarValue(V, Part, LastLane)); // Set the insert point after the last scalarized instruction. This ensures // the insertelement sequence will directly follow the scalar definitions. @@ -2752,15 +2753,14 @@ Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) { Value *VectorValue = nullptr; if (Cost->isUniformAfterVectorization(I, VF)) { VectorValue = getBroadcastInstrs(ScalarValue); - VectorLoopValueMap.setVectorValue(V, Part, VectorValue); } else { - // Initialize packing with insertelements to start from undef. - Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF)); - VectorLoopValueMap.setVectorValue(V, Part, Undef); + VectorValue = UndefValue::get(VectorType::get(V->getType(), VF)); for (unsigned Lane = 0; Lane < VF; ++Lane) - packScalarIntoVectorValue(V, {Part, Lane}); - VectorValue = VectorLoopValueMap.getVectorValue(V, Part); + VectorValue = Builder.CreateInsertElement( + VectorValue, getOrCreateScalarValue(V, Part, Lane), + Builder.getInt32(Lane)); } + VectorLoopValueMap.setVectorValue(V, Part, VectorValue); Builder.restoreIP(OldIP); return VectorValue; } @@ -2772,29 +2772,28 @@ Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) { return B; } -Value * -InnerLoopVectorizer::getOrCreateScalarValue(Value *V, - const VPIteration &Instance) { +Value *InnerLoopVectorizer::getOrCreateScalarValue(Value *V, unsigned Part, + unsigned Lane) { + // If the value is not an instruction contained in the loop, it should // already be scalar. if (OrigLoop->isLoopInvariant(V)) return V; - assert(Instance.Lane > 0 - ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF) - : true && "Uniform values only have lane zero"); + assert(Lane > 0 ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF) + : true && "Uniform values only have lane zero"); // If the value from the original loop has not been vectorized, it is // represented by UF x VF scalar values in the new loop. Return the requested // scalar value. - if (VectorLoopValueMap.hasScalarValue(V, Instance)) - return VectorLoopValueMap.getScalarValue(V, Instance); + if (VectorLoopValueMap.hasScalarValue(V, Part, Lane)) + return VectorLoopValueMap.getScalarValue(V, Part, Lane); // If the value has not been scalarized, get its entry in VectorLoopValueMap // for the given unroll part. If this entry is not a vector type (i.e., the // vectorization factor is one), there is no need to generate an // extractelement instruction. - auto *U = getOrCreateVectorValue(V, Instance.Part); + auto *U = getOrCreateVectorValue(V, Part); if (!U->getType()->isVectorTy()) { assert(VF == 1 && "Value not scalarized has non-vector type"); return U; @@ -2803,20 +2802,7 @@ InnerLoopVectorizer::getOrCreateScalarValue(Value *V, // Otherwise, the value from the original loop has been vectorized and is // represented by UF vector values. Extract and return the requested scalar // value from the appropriate vector lane. - return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane)); -} - -void InnerLoopVectorizer::packScalarIntoVectorValue( - Value *V, const VPIteration &Instance) { - assert(V != Induction && "The new induction variable should not be used."); - assert(!V->getType()->isVectorTy() && "Can't pack a vector"); - assert(!V->getType()->isVoidTy() && "Type does not produce a value"); - - Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance); - Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part); - VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst, - Builder.getInt32(Instance.Lane)); - VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue); + return Builder.CreateExtractElement(U, Builder.getInt32(Lane)); } Value *InnerLoopVectorizer::reverseVector(Value *Vec) { @@ -2889,7 +2875,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) { Index += (VF - 1) * Group->getFactor(); for (unsigned Part = 0; Part < UF; Part++) { - Value *NewPtr = getOrCreateScalarValue(Ptr, {Part, 0}); + Value *NewPtr = getOrCreateScalarValue(Ptr, Part, 0); // Notice current instruction could be any index. Need to adjust the address // to the member of index 0. @@ -3015,6 +3001,10 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { Alignment = DL.getABITypeAlignment(ScalarDataTy); unsigned AddressSpace = getMemInstAddressSpace(Instr); + // Scalarize the memory instruction if necessary. + if (Decision == LoopVectorizationCostModel::CM_Scalarize) + return scalarizeInstruction(Instr, Legal->isScalarWithPredication(Instr)); + // Determine if the pointer operand of the access is either consecutive or // reverse consecutive. int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); @@ -3029,7 +3019,7 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { // Handle consecutive loads/stores. if (ConsecutiveStride) - Ptr = getOrCreateScalarValue(Ptr, {0, 0}); + Ptr = getOrCreateScalarValue(Ptr, 0, 0); VectorParts Mask = createBlockInMask(Instr->getParent()); // Handle Stores: @@ -3126,41 +3116,71 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { } void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, - const VPIteration &Instance, bool IfPredicateInstr) { assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); + DEBUG(dbgs() << "LV: Scalarizing" + << (IfPredicateInstr ? " and predicating:" : ":") << *Instr + << '\n'); + // Holds vector parameters or scalars, in case of uniform vals. + SmallVector<VectorParts, 4> Params; setDebugLocFromInst(Builder, Instr); // Does this instruction return a value ? bool IsVoidRetTy = Instr->getType()->isVoidTy(); - Instruction *Cloned = Instr->clone(); - if (!IsVoidRetTy) - Cloned->setName(Instr->getName() + ".cloned"); + VectorParts Cond; + if (IfPredicateInstr) + Cond = createBlockInMask(Instr->getParent()); + + // Determine the number of scalars we need to generate for each unroll + // iteration. If the instruction is uniform, we only need to generate the + // first lane. Otherwise, we generate all VF values. + unsigned Lanes = Cost->isUniformAfterVectorization(Instr, VF) ? 1 : VF; - // Replace the operands of the cloned instructions with their scalar - // equivalents in the new loop. - for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { - auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Instance); - Cloned->setOperand(op, NewOp); - } - addNewMetadata(Cloned, Instr); + // For each vector unroll 'part': + for (unsigned Part = 0; Part < UF; ++Part) { + // For each scalar that we create: + for (unsigned Lane = 0; Lane < Lanes; ++Lane) { - // Place the cloned scalar in the new loop. - Builder.Insert(Cloned); + // Start if-block. + Value *Cmp = nullptr; + if (IfPredicateInstr) { + Cmp = Cond[Part]; + if (!Cmp) // Block in mask is all-one. + Cmp = Builder.getTrue(); + else if (Cmp->getType()->isVectorTy()) + Cmp = Builder.CreateExtractElement(Cmp, Builder.getInt32(Lane)); + } - // Add the cloned scalar to the scalar map entry. - VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned); + Instruction *Cloned = Instr->clone(); + if (!IsVoidRetTy) + Cloned->setName(Instr->getName() + ".cloned"); - // If we just cloned a new assumption, add it the assumption cache. - if (auto *II = dyn_cast<IntrinsicInst>(Cloned)) - if (II->getIntrinsicID() == Intrinsic::assume) - AC->registerAssumption(II); + // Replace the operands of the cloned instructions with their scalar + // equivalents in the new loop. + for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) { + auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Part, Lane); + Cloned->setOperand(op, NewOp); + } + addNewMetadata(Cloned, Instr); - // End if-block. - if (IfPredicateInstr) - PredicatedInstructions.push_back(Cloned); + // Place the cloned scalar in the new loop. + Builder.Insert(Cloned); + + // Add the cloned scalar to the scalar map entry. + VectorLoopValueMap.setScalarValue(Instr, Part, Lane, Cloned); + + // If we just cloned a new assumption, add it the assumption cache. + if (auto *II = dyn_cast<IntrinsicInst>(Cloned)) + if (II->getIntrinsicID() == Intrinsic::assume) + AC->registerAssumption(II); + + // End if-block. + if (IfPredicateInstr) + PredicatedInstructions.push_back(std::make_pair(Cloned, Cmp)); + } + } } PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start, @@ -3364,7 +3384,7 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { LVer->prepareNoAliasMetadata(); } -BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { +void InnerLoopVectorizer::createVectorizedLoopSkeleton() { /* In this function we generate a new loop. The new loop will contain the vectorized instructions while the old loop will continue to run the @@ -3545,8 +3565,6 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { LoopVectorizeHints Hints(Lp, true, *ORE); Hints.setAlreadyVectorized(); - - return LoopVectorPreHeader; } // Fix up external users of the induction variable. At this point, we are @@ -3920,8 +3938,7 @@ void InnerLoopVectorizer::fixVectorizedLoop() { IVEndValues[Entry.first], LoopMiddleBlock); fixLCSSAPHIs(); - for (Instruction *PI : PredicatedInstructions) - sinkScalarOperands(&*PI); + predicateInstructions(); // Remove redundant induction instructions. cse(LoopVectorBody); @@ -4376,6 +4393,126 @@ void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { } while (Changed); } +void InnerLoopVectorizer::predicateInstructions() { + + // For each instruction I marked for predication on value C, split I into its + // own basic block to form an if-then construct over C. Since I may be fed by + // an extractelement instruction or other scalar operand, we try to + // iteratively sink its scalar operands into the predicated block. If I feeds + // an insertelement instruction, we try to move this instruction into the + // predicated block as well. For non-void types, a phi node will be created + // for the resulting value (either vector or scalar). + // + // So for some predicated instruction, e.g. the conditional sdiv in: + // + // for.body: + // ... + // %add = add nsw i32 %mul, %0 + // %cmp5 = icmp sgt i32 %2, 7 + // br i1 %cmp5, label %if.then, label %if.end + // + // if.then: + // %div = sdiv i32 %0, %1 + // br label %if.end + // + // if.end: + // %x.0 = phi i32 [ %div, %if.then ], [ %add, %for.body ] + // + // the sdiv at this point is scalarized and if-converted using a select. + // The inactive elements in the vector are not used, but the predicated + // instruction is still executed for all vector elements, essentially: + // + // vector.body: + // ... + // %17 = add nsw <2 x i32> %16, %wide.load + // %29 = extractelement <2 x i32> %wide.load, i32 0 + // %30 = extractelement <2 x i32> %wide.load51, i32 0 + // %31 = sdiv i32 %29, %30 + // %32 = insertelement <2 x i32> undef, i32 %31, i32 0 + // %35 = extractelement <2 x i32> %wide.load, i32 1 + // %36 = extractelement <2 x i32> %wide.load51, i32 1 + // %37 = sdiv i32 %35, %36 + // %38 = insertelement <2 x i32> %32, i32 %37, i32 1 + // %predphi = select <2 x i1> %26, <2 x i32> %38, <2 x i32> %17 + // + // Predication will now re-introduce the original control flow to avoid false + // side-effects by the sdiv instructions on the inactive elements, yielding + // (after cleanup): + // + // vector.body: + // ... + // %5 = add nsw <2 x i32> %4, %wide.load + // %8 = icmp sgt <2 x i32> %wide.load52, <i32 7, i32 7> + // %9 = extractelement <2 x i1> %8, i32 0 + // br i1 %9, label %pred.sdiv.if, label %pred.sdiv.continue + // + // pred.sdiv.if: + // %10 = extractelement <2 x i32> %wide.load, i32 0 + // %11 = extractelement <2 x i32> %wide.load51, i32 0 + // %12 = sdiv i32 %10, %11 + // %13 = insertelement <2 x i32> undef, i32 %12, i32 0 + // br label %pred.sdiv.continue + // + // pred.sdiv.continue: + // %14 = phi <2 x i32> [ undef, %vector.body ], [ %13, %pred.sdiv.if ] + // %15 = extractelement <2 x i1> %8, i32 1 + // br i1 %15, label %pred.sdiv.if54, label %pred.sdiv.continue55 + // + // pred.sdiv.if54: + // %16 = extractelement <2 x i32> %wide.load, i32 1 + // %17 = extractelement <2 x i32> %wide.load51, i32 1 + // %18 = sdiv i32 %16, %17 + // %19 = insertelement <2 x i32> %14, i32 %18, i32 1 + // br label %pred.sdiv.continue55 + // + // pred.sdiv.continue55: + // %20 = phi <2 x i32> [ %14, %pred.sdiv.continue ], [ %19, %pred.sdiv.if54 ] + // %predphi = select <2 x i1> %8, <2 x i32> %20, <2 x i32> %5 + + for (auto KV : PredicatedInstructions) { + BasicBlock::iterator I(KV.first); + BasicBlock *Head = I->getParent(); + auto *T = SplitBlockAndInsertIfThen(KV.second, &*I, /*Unreachable=*/false, + /*BranchWeights=*/nullptr, DT, LI); + I->moveBefore(T); + sinkScalarOperands(&*I); + + BasicBlock *PredicatedBlock = I->getParent(); + Twine BBNamePrefix = Twine("pred.") + I->getOpcodeName(); + PredicatedBlock->setName(BBNamePrefix + ".if"); + PredicatedBlock->getSingleSuccessor()->setName(BBNamePrefix + ".continue"); + + // If the instruction is non-void create a Phi node at reconvergence point. + if (!I->getType()->isVoidTy()) { + Value *IncomingTrue = nullptr; + Value *IncomingFalse = nullptr; + + if (I->hasOneUse() && isa<InsertElementInst>(*I->user_begin())) { + // If the predicated instruction is feeding an insert-element, move it + // into the Then block; Phi node will be created for the vector. + InsertElementInst *IEI = cast<InsertElementInst>(*I->user_begin()); + IEI->moveBefore(T); + IncomingTrue = IEI; // the new vector with the inserted element. + IncomingFalse = IEI->getOperand(0); // the unmodified vector + } else { + // Phi node will be created for the scalar predicated instruction. + IncomingTrue = &*I; + IncomingFalse = UndefValue::get(I->getType()); + } + + BasicBlock *PostDom = I->getParent()->getSingleSuccessor(); + assert(PostDom && "Then block has multiple successors"); + PHINode *Phi = + PHINode::Create(IncomingTrue->getType(), 2, "", &PostDom->front()); + IncomingTrue->replaceAllUsesWith(Phi); + Phi->addIncoming(IncomingFalse, Head); + Phi->addIncoming(IncomingTrue, I->getParent()); + } + } + + DEBUG(DT->verifyDomTree()); +} + InnerLoopVectorizer::VectorParts InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) { assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); @@ -4519,7 +4656,7 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, llvm_unreachable("Unknown induction"); case InductionDescriptor::IK_IntInduction: case InductionDescriptor::IK_FpInduction: - llvm_unreachable("Integer/fp induction is handled elsewhere."); + return widenIntOrFpInduction(P); case InductionDescriptor::IK_PtrInduction: { // Handle the pointer induction variable case. assert(P->getType()->isPointerTy() && "Unexpected type."); @@ -4538,7 +4675,7 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); Value *SclrGep = II.transform(Builder, GlobalIdx, PSE.getSE(), DL); SclrGep->setName("next.gep"); - VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep); + VectorLoopValueMap.setScalarValue(P, Part, Lane, SclrGep); } } return; @@ -4564,11 +4701,25 @@ static bool mayDivideByZero(Instruction &I) { return !CInt || CInt->isZero(); } -void InnerLoopVectorizer::widenInstruction(Instruction &I) { +void InnerLoopVectorizer::vectorizeInstruction(Instruction &I) { + // Scalarize instructions that should remain scalar after vectorization. + if (VF > 1 && + !(isa<BranchInst>(&I) || isa<PHINode>(&I) || isa<DbgInfoIntrinsic>(&I)) && + shouldScalarizeInstruction(&I)) { + scalarizeInstruction(&I, Legal->isScalarWithPredication(&I)); + return; + } + switch (I.getOpcode()) { case Instruction::Br: - case Instruction::PHI: - llvm_unreachable("This instruction is handled by a different recipe."); + // Nothing to do for PHIs and BR, since we already took care of the + // loop control flow instructions. + break; + case Instruction::PHI: { + // Vectorize PHINodes. + widenPHIInstruction(&I, UF, VF); + break; + } // End of PHI. case Instruction::GetElementPtr: { // Construct a vector GEP by widening the operands of the scalar GEP as // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP @@ -4641,6 +4792,13 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) { case Instruction::SDiv: case Instruction::SRem: case Instruction::URem: + // Scalarize with predication if this instruction may divide by zero and + // block execution is conditional, otherwise fallthrough. + if (Legal->isScalarWithPredication(&I)) { + scalarizeInstruction(&I, true); + break; + } + LLVM_FALLTHROUGH; case Instruction::Add: case Instruction::FAdd: case Instruction::Sub: @@ -4688,7 +4846,7 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) { // We have to take the 'vectorized' value and pick the first lane. // Instcombine will make this a no-op. - auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), {0, 0}); + auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), 0, 0); for (unsigned Part = 0; Part < UF; ++Part) { Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part); @@ -4747,6 +4905,16 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) { auto *CI = dyn_cast<CastInst>(&I); setDebugLocFromInst(Builder, CI); + // Optimize the special case where the source is a constant integer + // induction variable. Notice that we can only optimize the 'trunc' case + // because (a) FP conversions lose precision, (b) sext/zext may wrap, and + // (c) other casts depend on pointer size. + if (Cost->isOptimizableIVTruncate(CI, VF)) { + widenIntOrFpInduction(cast<PHINode>(CI->getOperand(0)), + cast<TruncInst>(CI)); + break; + } + /// Vectorize casts. Type *DestTy = (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF); @@ -4777,7 +4945,11 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) { Tys.push_back(ToVectorTy(ArgOperand->getType(), VF)); Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); - + if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || + ID == Intrinsic::lifetime_start)) { + scalarizeInstruction(&I); + break; + } // The flag shows whether we use Intrinsic or a usual Call for vectorized // version of the instruction. // Is it beneficial to perform intrinsic call compared to lib call? @@ -4785,8 +4957,10 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) { unsigned CallCost = getVectorCallCost(CI, VF, *TTI, TLI, NeedToScalarize); bool UseVectorIntrinsic = ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost; - assert((UseVectorIntrinsic || !NeedToScalarize) && - "Instruction should be scalarized elsewhere."); + if (!UseVectorIntrinsic && NeedToScalarize) { + scalarizeInstruction(&I); + break; + } for (unsigned Part = 0; Part < UF; ++Part) { SmallVector<Value *, 4> Args; @@ -4836,9 +5010,9 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) { } default: - // All other instructions are scalarized. - DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); - llvm_unreachable("Unhandled instruction!"); + // All other instructions are unsupported. Scalarize them. + scalarizeInstruction(&I); + break; } // end of switch. } @@ -4850,11 +5024,14 @@ void InnerLoopVectorizer::updateAnalysis() { assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) && "Entry does not dominate exit."); + DT->addNewBlock(LI->getLoopFor(LoopVectorBody)->getHeader(), + LoopVectorPreHeader); DT->addNewBlock(LoopMiddleBlock, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]); DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader); DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]); + DEBUG(DT->verifyDomTree()); } @@ -6794,6 +6971,13 @@ LoopVectorizationCostModel::VectorizationCostTy LoopVectorizationCostModel::expectedCost(unsigned VF) { VectorizationCostTy Cost; + // Collect Uniform and Scalar instructions after vectorization with VF. + collectUniformsAndScalars(VF); + + // Collect the instructions (and their associated costs) that will be more + // profitable to scalarize. + collectInstsToScalarize(VF); + // For each block. for (BasicBlock *BB : TheLoop->blocks()) { VectorizationCostTy BlockCost; @@ -7451,26 +7635,11 @@ LoopVectorizationPlanner::plan(bool OptForSize, unsigned UserVF) { // Collect the instructions (and their associated costs) that will be more // profitable to scalarize. CM.selectUserVectorizationFactor(UserVF); - buildVPlans(UserVF, UserVF); - DEBUG(printPlans(dbgs())); return {UserVF, 0}; } unsigned MaxVF = MaybeMaxVF.getValue(); assert(MaxVF != 0 && "MaxVF is zero."); - - for (unsigned VF = 1; VF <= MaxVF; VF *= 2) { - // Collect Uniform and Scalar instructions after vectorization with VF. - CM.collectUniformsAndScalars(VF); - - // Collect the instructions (and their associated costs) that will be more - // profitable to scalarize. - if (VF > 1) - CM.collectInstsToScalarize(VF); - } - - buildVPlans(1, MaxVF); - DEBUG(printPlans(dbgs())); if (MaxVF == 1) return NoVectorization; @@ -7478,31 +7647,11 @@ LoopVectorizationPlanner::plan(bool OptForSize, unsigned UserVF) { return CM.selectVectorizationFactor(MaxVF); } -void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) { - DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF << '\n'); - BestVF = VF; - BestUF = UF; - - for (auto *VPlanIter = VPlans.begin(); VPlanIter != VPlans.end();) { - VPlan *Plan = *VPlanIter; - if (Plan->hasVF(VF)) - ++VPlanIter; - else { - VPlanIter = VPlans.erase(VPlanIter); - delete Plan; - } - } - assert(VPlans.size() == 1 && "Best VF has not a single VPlan."); -} - -void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, - DominatorTree *DT) { +void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV) { // Perform the actual loop transformation. // 1. Create a new empty loop. Unlink the old loop and connect the new one. - VPTransformState State{ - BestVF, BestUF, LI, DT, ILV.Builder, ILV.VectorLoopValueMap, &ILV}; - State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); + ILV.createVectorizedLoopSkeleton(); //===------------------------------------------------===// // @@ -7513,9 +7662,36 @@ void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, //===------------------------------------------------===// // 2. Copy and widen instructions from the old loop into the new loop. - assert(VPlans.size() == 1 && "Not a single VPlan to execute."); - VPlan *Plan = *VPlans.begin(); - Plan->execute(&State); + + // Move instructions to handle first-order recurrences. + DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter(); + for (auto &Entry : SinkAfter) { + Entry.first->removeFromParent(); + Entry.first->insertAfter(Entry.second); + DEBUG(dbgs() << "Sinking" << *Entry.first << " after" << *Entry.second + << " to vectorize a 1st order recurrence.\n"); + } + + // Collect instructions from the original loop that will become trivially dead + // in the vectorized loop. We don't need to vectorize these instructions. For + // example, original induction update instructions can become dead because we + // separately emit induction "steps" when generating code for the new loop. + // Similarly, we create a new latch condition when setting up the structure + // of the new loop, so the old one can become dead. + SmallPtrSet<Instruction *, 4> DeadInstructions; + collectTriviallyDeadInstructions(DeadInstructions); + + // Scan the loop in a topological order to ensure that defs are vectorized + // before users. + LoopBlocksDFS DFS(OrigLoop); + DFS.perform(LI); + + // Vectorize all instructions in the original loop that will not become + // trivially dead when vectorized. + for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) + for (Instruction &I : *BB) + if (!DeadInstructions.count(&I)) + ILV.vectorizeInstruction(I); // 3. Fix the vectorized code: take care of header phi's, live-outs, // predication, updating analyses. @@ -7545,6 +7721,14 @@ void LoopVectorizationPlanner::collectTriviallyDeadInstructions( DeadInstructions.insert(IndUpdate); } } + +void InnerLoopUnroller::vectorizeMemoryInstruction(Instruction *Instr) { + auto *SI = dyn_cast<StoreInst>(Instr); + bool IfPredicateInstr = (SI && Legal->blockNeedsPredication(SI->getParent())); + + return scalarizeInstruction(Instr, IfPredicateInstr); +} + Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; } Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; } @@ -7600,725 +7784,6 @@ static void AddRuntimeUnrollDisableMetaData(Loop *L) { } } -namespace { -/// VPWidenRecipe is a recipe for producing a copy of vector type for each -/// Instruction in its ingredients independently, in order. This recipe covers -/// most of the traditional vectorization cases where each ingredient transforms -/// into a vectorized version of itself. -class VPWidenRecipe : public VPRecipeBase { -private: - /// Hold the ingredients by pointing to their original BasicBlock location. - BasicBlock::iterator Begin; - BasicBlock::iterator End; - -public: - VPWidenRecipe(Instruction *I) : VPRecipeBase(VPWidenSC) { - End = I->getIterator(); - Begin = End++; - } - - ~VPWidenRecipe() {} - - /// Method to support type inquiry through isa, cast, and dyn_cast. - static inline bool classof(const VPRecipeBase *V) { - return V->getVPRecipeID() == VPRecipeBase::VPWidenSC; - } - - /// Produce widened copies of all Ingredients. - void execute(VPTransformState &State) override { - for (auto &Instr : make_range(Begin, End)) - State.ILV->widenInstruction(Instr); - } - - /// Augment the recipe to include Instr, if it lies at its End. - bool appendInstruction(Instruction *Instr) { - if (End != Instr->getIterator()) - return false; - End++; - return true; - } - - /// Print the recipe. - void print(raw_ostream &O, const Twine &Indent) const override { - O << " +\n" << Indent << "\"WIDEN\\l\""; - for (auto &Instr : make_range(Begin, End)) - O << " +\n" << Indent << "\" " << VPlanIngredient(&Instr) << "\\l\""; - } -}; - -/// A recipe for handling phi nodes of integer and floating-point inductions, -/// producing their vector and scalar values. -class VPWidenIntOrFpInductionRecipe : public VPRecipeBase { -private: - PHINode *IV; - TruncInst *Trunc; - -public: - VPWidenIntOrFpInductionRecipe(PHINode *IV, TruncInst *Trunc = nullptr) - : VPRecipeBase(VPWidenIntOrFpInductionSC), IV(IV), Trunc(Trunc) {} - - ~VPWidenIntOrFpInductionRecipe() {} - - /// Method to support type inquiry through isa, cast, and dyn_cast. - static inline bool classof(const VPRecipeBase *V) { - return V->getVPRecipeID() == VPRecipeBase::VPWidenIntOrFpInductionSC; - } - - /// Generate the vectorized and scalarized versions of the phi node as - /// needed by their users. - void execute(VPTransformState &State) override { - assert(!State.Instance && "Int or FP induction being replicated."); - State.ILV->widenIntOrFpInduction(IV, Trunc); - } - - /// Print the recipe. - void print(raw_ostream &O, const Twine &Indent) const override { - O << " +\n" << Indent << "\"WIDEN-INDUCTION"; - if (Trunc) { - O << "\\l\""; - O << " +\n" << Indent << "\" " << VPlanIngredient(IV) << "\\l\""; - O << " +\n" << Indent << "\" " << VPlanIngredient(Trunc) << "\\l\""; - } else - O << " " << VPlanIngredient(IV) << "\\l\""; - } -}; - -/// A recipe for handling all phi nodes except for integer and FP inductions. -class VPWidenPHIRecipe : public VPRecipeBase { -private: - PHINode *Phi; - -public: - VPWidenPHIRecipe(PHINode *Phi) : VPRecipeBase(VPWidenPHISC), Phi(Phi) {} - - ~VPWidenPHIRecipe() {} - - /// Method to support type inquiry through isa, cast, and dyn_cast. - static inline bool classof(const VPRecipeBase *V) { - return V->getVPRecipeID() == VPRecipeBase::VPWidenPHISC; - } - - /// Generate the phi/select nodes. - void execute(VPTransformState &State) override { - State.ILV->widenPHIInstruction(Phi, State.UF, State.VF); - } - - /// Print the recipe. - void print(raw_ostream &O, const Twine &Indent) const override { - O << " +\n" << Indent << "\"WIDEN-PHI " << VPlanIngredient(Phi) << "\\l\""; - } -}; - -/// VPInterleaveRecipe is a recipe for transforming an interleave group of load -/// or stores into one wide load/store and shuffles. -class VPInterleaveRecipe : public VPRecipeBase { -private: - const InterleaveGroup *IG; - -public: - VPInterleaveRecipe(const InterleaveGroup *IG) - : VPRecipeBase(VPInterleaveSC), IG(IG) {} - - ~VPInterleaveRecipe() {} - - /// Method to support type inquiry through isa, cast, and dyn_cast. - static inline bool classof(const VPRecipeBase *V) { - return V->getVPRecipeID() == VPRecipeBase::VPInterleaveSC; - } - - /// Generate the wide load or store, and shuffles. - void execute(VPTransformState &State) override { - assert(!State.Instance && "Interleave group being replicated."); - State.ILV->vectorizeInterleaveGroup(IG->getInsertPos()); - } - - /// Print the recipe. - void print(raw_ostream &O, const Twine &Indent) const override; - - const InterleaveGroup *getInterleaveGroup() { return IG; } -}; - -/// VPReplicateRecipe replicates a given instruction producing multiple scalar -/// copies of the original scalar type, one per lane, instead of producing a -/// single copy of widened type for all lanes. If the instruction is known to be -/// uniform only one copy, per lane zero, will be generated. -class VPReplicateRecipe : public VPRecipeBase { -private: - /// The instruction being replicated. - Instruction *Ingredient; - - /// Indicator if only a single replica per lane is needed. - bool IsUniform; - - /// Indicator if the replicas are also predicated. - bool IsPredicated; - - /// Indicator if the scalar values should also be packed into a vector. - bool AlsoPack; - -public: - VPReplicateRecipe(Instruction *I, bool IsUniform, bool IsPredicated = false) - : VPRecipeBase(VPReplicateSC), Ingredient(I), IsUniform(IsUniform), - IsPredicated(IsPredicated) { - // Retain the previous behavior of predicateInstructions(), where an - // insert-element of a predicated instruction got hoisted into the - // predicated basic block iff it was its only user. This is achieved by - // having predicated instructions also pack their values into a vector by - // default unless they have a replicated user which uses their scalar value. - AlsoPack = IsPredicated && !I->use_empty(); - } - - ~VPReplicateRecipe() {} - - /// Method to support type inquiry through isa, cast, and dyn_cast. - static inline bool classof(const VPRecipeBase *V) { - return V->getVPRecipeID() == VPRecipeBase::VPReplicateSC; - } - - /// Generate replicas of the desired Ingredient. Replicas will be generated - /// for all parts and lanes unless a specific part and lane are specified in - /// the \p State. - void execute(VPTransformState &State) override; - - void setAlsoPack(bool Pack) { AlsoPack = Pack; } - - /// Print the recipe. - void print(raw_ostream &O, const Twine &Indent) const override { - O << " +\n" - << Indent << "\"" << (IsUniform ? "CLONE " : "REPLICATE ") - << VPlanIngredient(Ingredient); - if (AlsoPack) - O << " (S->V)"; - O << "\\l\""; - } -}; - -/// A recipe for generating conditional branches on the bits of a mask. -class VPBranchOnMaskRecipe : public VPRecipeBase { -private: - /// The input IR basic block used to obtain the mask providing the condition - /// bits for the branch. - BasicBlock *MaskedBasicBlock; - -public: - VPBranchOnMaskRecipe(BasicBlock *BB) - : VPRecipeBase(VPBranchOnMaskSC), MaskedBasicBlock(BB) {} - - /// Method to support type inquiry through isa, cast, and dyn_cast. - static inline bool classof(const VPRecipeBase *V) { - return V->getVPRecipeID() == VPRecipeBase::VPBranchOnMaskSC; - } - - /// Generate the extraction of the appropriate bit from the block mask and the - /// conditional branch. - void execute(VPTransformState &State) override; - - /// Print the recipe. - void print(raw_ostream &O, const Twine &Indent) const override { - O << " +\n" - << Indent << "\"BRANCH-ON-MASK-OF " << MaskedBasicBlock->getName() - << "\\l\""; - } -}; - -/// VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when -/// control converges back from a Branch-on-Mask. The phi nodes are needed in -/// order to merge values that are set under such a branch and feed their uses. -/// The phi nodes can be scalar or vector depending on the users of the value. -/// This recipe works in concert with VPBranchOnMaskRecipe. -class VPPredInstPHIRecipe : public VPRecipeBase { -private: - Instruction *PredInst; - -public: - /// Construct a VPPredInstPHIRecipe given \p PredInst whose value needs a phi - /// nodes after merging back from a Branch-on-Mask. - VPPredInstPHIRecipe(Instruction *PredInst) - : VPRecipeBase(VPPredInstPHISC), PredInst(PredInst) {} - - ~VPPredInstPHIRecipe() {} - - /// Method to support type inquiry through isa, cast, and dyn_cast. - static inline bool classof(const VPRecipeBase *V) { - return V->getVPRecipeID() == VPRecipeBase::VPPredInstPHISC; - } - - /// Generates phi nodes for live-outs as needed to retain SSA form. - void execute(VPTransformState &State) override; - - /// Print the recipe. - void print(raw_ostream &O, const Twine &Indent) const override { - O << " +\n" - << Indent << "\"PHI-PREDICATED-INSTRUCTION " << VPlanIngredient(PredInst) - << "\\l\""; - } -}; -} // end anonymous namespace - -bool LoopVectorizationPlanner::getDecisionAndClampRange( - const std::function<bool(unsigned)> &Predicate, VFRange &Range) { - assert(Range.End > Range.Start && "Trying to test an empty VF range."); - bool PredicateAtRangeStart = Predicate(Range.Start); - - for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2) - if (Predicate(TmpVF) != PredicateAtRangeStart) { - Range.End = TmpVF; - break; - } - - return PredicateAtRangeStart; -} - -/// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, -/// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range -/// of VF's starting at a given VF and extending it as much as possible. Each -/// vectorization decision can potentially shorten this sub-range during -/// buildVPlan(). -void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) { - for (unsigned VF = MinVF; VF < MaxVF + 1;) { - VFRange SubRange = {VF, MaxVF + 1}; - VPlan *Plan = buildVPlan(SubRange); - VPlans.push_back(Plan); - VF = SubRange.End; - } -} - -VPInterleaveRecipe * -LoopVectorizationPlanner::tryToInterleaveMemory(Instruction *I, - VFRange &Range) { - const InterleaveGroup *IG = Legal->getInterleavedAccessGroup(I); - if (!IG) - return nullptr; - - // Now check if IG is relevant for VF's in the given range. - auto isIGMember = [&](Instruction *I) -> std::function<bool(unsigned)> { - return [=](unsigned VF) -> bool { - return (VF >= 2 && // Query is illegal for VF == 1 - CM.getWideningDecision(I, VF) == - LoopVectorizationCostModel::CM_Interleave); - }; - }; - if (!getDecisionAndClampRange(isIGMember(I), Range)) - return nullptr; - - // I is a member of an InterleaveGroup for VF's in the (possibly trimmed) - // range. If it's the primary member of the IG construct a VPInterleaveRecipe. - // Otherwise, it's an adjunct member of the IG, do not construct any Recipe. - assert(I == IG->getInsertPos() && - "Generating a recipe for an adjunct member of an interleave group"); - - return new VPInterleaveRecipe(IG); -} - -VPWidenIntOrFpInductionRecipe * -LoopVectorizationPlanner::tryToOptimizeInduction(Instruction *I, - VFRange &Range) { - if (PHINode *Phi = dyn_cast<PHINode>(I)) { - // Check if this is an integer or fp induction. If so, build the recipe that - // produces its scalar and vector values. - InductionDescriptor II = Legal->getInductionVars()->lookup(Phi); - if (II.getKind() == InductionDescriptor::IK_IntInduction || - II.getKind() == InductionDescriptor::IK_FpInduction) - return new VPWidenIntOrFpInductionRecipe(Phi); - - return nullptr; - } - - // Optimize the special case where the source is a constant integer - // induction variable. Notice that we can only optimize the 'trunc' case - // because (a) FP conversions lose precision, (b) sext/zext may wrap, and - // (c) other casts depend on pointer size. - - // Determine whether \p K is a truncation based on an induction variable that - // can be optimized. - auto isOptimizableIVTruncate = - [&](Instruction *K) -> std::function<bool(unsigned)> { - return - [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); }; - }; - - if (isa<TruncInst>(I) && - getDecisionAndClampRange(isOptimizableIVTruncate(I), Range)) - return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)), - cast<TruncInst>(I)); - return nullptr; -} - -VPWidenRecipe *LoopVectorizationPlanner::tryToWiden( - Instruction *I, VPWidenRecipe *LastWidenRecipe, VFRange &Range) { - - if (Legal->isScalarWithPredication(I)) - return nullptr; - - static DenseSet<unsigned> VectorizableOpcodes = { - Instruction::Br, Instruction::PHI, Instruction::GetElementPtr, - Instruction::UDiv, Instruction::SDiv, Instruction::SRem, - Instruction::URem, Instruction::Add, Instruction::FAdd, - Instruction::Sub, Instruction::FSub, Instruction::Mul, - Instruction::FMul, Instruction::FDiv, Instruction::FRem, - Instruction::Shl, Instruction::LShr, Instruction::AShr, - Instruction::And, Instruction::Or, Instruction::Xor, - Instruction::Select, Instruction::ICmp, Instruction::FCmp, - Instruction::Store, Instruction::Load, Instruction::ZExt, - Instruction::SExt, Instruction::FPToUI, Instruction::FPToSI, - Instruction::FPExt, Instruction::PtrToInt, Instruction::IntToPtr, - Instruction::SIToFP, Instruction::UIToFP, Instruction::Trunc, - Instruction::FPTrunc, Instruction::BitCast, Instruction::Call}; - - if (!VectorizableOpcodes.count(I->getOpcode())) - return nullptr; - - if (CallInst *CI = dyn_cast<CallInst>(I)) { - Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); - if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || - ID == Intrinsic::lifetime_start)) - return nullptr; - } - - auto willWiden = [&](unsigned VF) -> bool { - if (!isa<PHINode>(I) && (CM.isScalarAfterVectorization(I, VF) || - CM.isProfitableToScalarize(I, VF))) - return false; - if (CallInst *CI = dyn_cast<CallInst>(I)) { - Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); - // The following case may be scalarized depending on the VF. - // The flag shows whether we use Intrinsic or a usual Call for vectorized - // version of the instruction. - // Is it beneficial to perform intrinsic call compared to lib call? - bool NeedToScalarize; - unsigned CallCost = getVectorCallCost(CI, VF, *TTI, TLI, NeedToScalarize); - bool UseVectorIntrinsic = - ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost; - return UseVectorIntrinsic || !NeedToScalarize; - } - if (isa<LoadInst>(I) || isa<StoreInst>(I)) { - LoopVectorizationCostModel::InstWidening Decision = - CM.getWideningDecision(I, VF); - assert(Decision != LoopVectorizationCostModel::CM_Unknown && - "CM decision should be taken at this point."); - assert(Decision != LoopVectorizationCostModel::CM_Interleave && - "Interleave memory opportunity should be caught earlier."); - return Decision != LoopVectorizationCostModel::CM_Scalarize; - } - return true; - }; - - if (!getDecisionAndClampRange(willWiden, Range)) - return nullptr; - - // Success: widen this instruction. We optimize the common case where - // consecutive instructions can be represented by a single recipe. - if (LastWidenRecipe && LastWidenRecipe->appendInstruction(I)) - return LastWidenRecipe; - return new VPWidenRecipe(I); -} - -VPBasicBlock *LoopVectorizationPlanner::handleReplication( - Instruction *I, VFRange &Range, VPBasicBlock *VPBB, - DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe) { - - bool IsUniform = getDecisionAndClampRange( - [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); }, - Range); - - bool IsPredicated = Legal->isScalarWithPredication(I); - auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated); - - // Find if I uses a predicated instruction. If so, it will use its scalar - // value. Avoid hoisting the insert-element which packs the scalar value into - // a vector value, as that happens iff all users use the vector value. - for (auto &Op : I->operands()) - if (auto *PredInst = dyn_cast<Instruction>(Op)) - if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end()) - PredInst2Recipe[PredInst]->setAlsoPack(false); - - // Finalize the recipe for Instr, first if it is not predicated. - if (!IsPredicated) { - DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n"); - VPBB->appendRecipe(Recipe); - return VPBB; - } - DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); - assert(VPBB->getSuccessors().empty() && - "VPBB has successors when handling predicated replication."); - // Record predicated instructions for above packing optimizations. - PredInst2Recipe[I] = Recipe; - VPBlockBase *Region = VPBB->setOneSuccessor(createReplicateRegion(I, Recipe)); - return cast<VPBasicBlock>(Region->setOneSuccessor(new VPBasicBlock())); -} - -VPRegionBlock * -LoopVectorizationPlanner::createReplicateRegion(Instruction *Instr, - VPRecipeBase *PredRecipe) { - // Instructions marked for predication are replicated and placed under an - // if-then construct to prevent side-effects. - - // Build the triangular if-then region. - std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str(); - assert(Instr->getParent() && "Predicated instruction not in any basic block"); - auto *BOMRecipe = new VPBranchOnMaskRecipe(Instr->getParent()); - auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe); - auto *PHIRecipe = - Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr); - auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe); - auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe); - VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true); - - // Note: first set Entry as region entry and then connect successors starting - // from it in order, to propagate the "parent" of each VPBasicBlock. - Entry->setTwoSuccessors(Pred, Exit); - Pred->setOneSuccessor(Exit); - - return Region; -} - -VPlan *LoopVectorizationPlanner::buildVPlan(VFRange &Range) { - - DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter(); - DenseMap<Instruction *, Instruction *> SinkAfterInverse; - - // Collect instructions from the original loop that will become trivially dead - // in the vectorized loop. We don't need to vectorize these instructions. For - // example, original induction update instructions can become dead because we - // separately emit induction "steps" when generating code for the new loop. - // Similarly, we create a new latch condition when setting up the structure - // of the new loop, so the old one can become dead. - SmallPtrSet<Instruction *, 4> DeadInstructions; - collectTriviallyDeadInstructions(DeadInstructions); - - // Hold a mapping from predicated instructions to their recipes, in order to - // fix their AlsoPack behavior if a user is determined to replicate and use a - // scalar instead of vector value. - DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe; - - // Create a dummy pre-entry VPBasicBlock to start building the VPlan. - VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry"); - VPlan *Plan = new VPlan(VPBB); - - // Scan the body of the loop in a topological order to visit each basic block - // after having visited its predecessor basic blocks. - LoopBlocksDFS DFS(OrigLoop); - DFS.perform(LI); - - for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { - // Relevant instructions from basic block BB will be grouped into VPRecipe - // ingredients and fill a new VPBasicBlock. - unsigned VPBBsForBB = 0; - auto *FirstVPBBForBB = new VPBasicBlock(BB->getName()); - VPBB->setOneSuccessor(FirstVPBBForBB); - VPBB = FirstVPBBForBB; - VPWidenRecipe *LastWidenRecipe = nullptr; - - std::vector<Instruction *> Ingredients; - - // Organize the ingredients to vectorize from current basic block in the - // right order. - for (Instruction &I : *BB) { - Instruction *Instr = &I; - - // First filter out irrelevant instructions, to ensure no recipes are - // built for them. - if (isa<BranchInst>(Instr) || isa<DbgInfoIntrinsic>(Instr) || - DeadInstructions.count(Instr)) - continue; - - // I is a member of an InterleaveGroup for Range.Start. If it's an adjunct - // member of the IG, do not construct any Recipe for it. - const InterleaveGroup *IG = Legal->getInterleavedAccessGroup(Instr); - if (IG && Instr != IG->getInsertPos() && - Range.Start >= 2 && // Query is illegal for VF == 1 - CM.getWideningDecision(Instr, Range.Start) == - LoopVectorizationCostModel::CM_Interleave) - continue; - - // Move instructions to handle first-order recurrences, step 1: avoid - // handling this instruction until after we've handled the instruction it - // should follow. - auto SAIt = SinkAfter.find(Instr); - if (SAIt != SinkAfter.end()) { - DEBUG(dbgs() << "Sinking" << *SAIt->first << " after" << *SAIt->second - << " to vectorize a 1st order recurrence.\n"); - SinkAfterInverse[SAIt->second] = Instr; - continue; - } - - Ingredients.push_back(Instr); - - // Move instructions to handle first-order recurrences, step 2: push the - // instruction to be sunk at its insertion point. - auto SAInvIt = SinkAfterInverse.find(Instr); - if (SAInvIt != SinkAfterInverse.end()) - Ingredients.push_back(SAInvIt->second); - } - - // Introduce each ingredient into VPlan. - for (Instruction *Instr : Ingredients) { - VPRecipeBase *Recipe = nullptr; - - // Check if Instr should belong to an interleave memory recipe, or already - // does. In the latter case Instr is irrelevant. - if ((Recipe = tryToInterleaveMemory(Instr, Range))) { - VPBB->appendRecipe(Recipe); - continue; - } - - // Check if Instr should form some PHI recipe. - if ((Recipe = tryToOptimizeInduction(Instr, Range))) { - VPBB->appendRecipe(Recipe); - continue; - } - if (PHINode *Phi = dyn_cast<PHINode>(Instr)) { - VPBB->appendRecipe(new VPWidenPHIRecipe(Phi)); - continue; - } - - // Check if Instr is to be widened by a general VPWidenRecipe, after - // having first checked for specific widening recipes that deal with - // Interleave Groups, Inductions and Phi nodes. - if ((Recipe = tryToWiden(Instr, LastWidenRecipe, Range))) { - if (Recipe != LastWidenRecipe) - VPBB->appendRecipe(Recipe); - LastWidenRecipe = cast<VPWidenRecipe>(Recipe); - continue; - } - - // Otherwise, if all widening options failed, Instruction is to be - // replicated. This may create a successor for VPBB. - VPBasicBlock *NextVPBB = - handleReplication(Instr, Range, VPBB, PredInst2Recipe); - if (NextVPBB != VPBB) { - VPBB = NextVPBB; - VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++) - : ""); - } - } - } - - // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks - // may also be empty, such as the last one VPBB, reflecting original - // basic-blocks with no recipes. - VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry()); - assert(PreEntry->empty() && "Expecting empty pre-entry block."); - VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor()); - PreEntry->disconnectSuccessor(Entry); - delete PreEntry; - - std::string PlanName; - raw_string_ostream RSO(PlanName); - unsigned VF = Range.Start; - Plan->addVF(VF); - RSO << "Initial VPlan for VF={" << VF; - for (VF *= 2; VF < Range.End; VF *= 2) { - Plan->addVF(VF); - RSO << "," << VF; - } - RSO << "},UF>=1"; - RSO.flush(); - Plan->setName(PlanName); - - return Plan; -} - -void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent) const { - O << " +\n" - << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; - IG->getInsertPos()->printAsOperand(O, false); - O << "\\l\""; - for (unsigned i = 0; i < IG->getFactor(); ++i) - if (Instruction *I = IG->getMember(i)) - O << " +\n" - << Indent << "\" " << VPlanIngredient(I) << " " << i << "\\l\""; -} - -void VPReplicateRecipe::execute(VPTransformState &State) { - - if (State.Instance) { // Generate a single instance. - State.ILV->scalarizeInstruction(Ingredient, *State.Instance, IsPredicated); - if (AlsoPack) { // Insert scalar instance packing it into a vector. - // If we're constructing lane 0, initialize to start from undef. - if (State.Instance->Lane == 0) { - Value *Undef = - UndefValue::get(VectorType::get(Ingredient->getType(), State.VF)); - State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef); - } - State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance); - } - return; - } - - // Generate scalar instances for all VF lanes of all UF parts, unless the - // instruction is uniform inwhich case generate only the first lane for each - // of the UF parts. - unsigned EndLane = IsUniform ? 1 : State.VF; - for (unsigned Part = 0; Part < State.UF; ++Part) - for (unsigned Lane = 0; Lane < EndLane; ++Lane) - State.ILV->scalarizeInstruction(Ingredient, {Part, Lane}, IsPredicated); - - // Broadcast or group together all instances into a vector. - if (AlsoPack) - for (unsigned Part = 0; Part < State.UF; ++Part) - State.ILV->getOrCreateVectorValue(Ingredient, Part); -} - -void VPBranchOnMaskRecipe::execute(VPTransformState &State) { - assert(State.Instance && "Branch on Mask works only on single instance."); - - unsigned Part = State.Instance->Part; - unsigned Lane = State.Instance->Lane; - - auto Cond = State.ILV->createBlockInMask(MaskedBasicBlock); - - Value *ConditionBit = Cond[Part]; - if (!ConditionBit) // Block in mask is all-one. - ConditionBit = State.Builder.getTrue(); - else if (ConditionBit->getType()->isVectorTy()) - ConditionBit = State.Builder.CreateExtractElement( - ConditionBit, State.Builder.getInt32(Lane)); - - // Replace the temporary unreachable terminator with a new conditional branch, - // whose two destinations will be set later when they are created. - auto *CurrentTerminator = State.CFG.PrevBB->getTerminator(); - assert(isa<UnreachableInst>(CurrentTerminator) && - "Expected to replace unreachable terminator with conditional branch."); - auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit); - CondBr->setSuccessor(0, nullptr); - ReplaceInstWithInst(CurrentTerminator, CondBr); - - DEBUG(dbgs() << "\nLV: vectorizing BranchOnMask recipe " - << MaskedBasicBlock->getName()); -} - -void VPPredInstPHIRecipe::execute(VPTransformState &State) { - assert(State.Instance && "Predicated instruction PHI works per instance."); - Instruction *ScalarPredInst = cast<Instruction>( - State.ValueMap.getScalarValue(PredInst, *State.Instance)); - BasicBlock *PredicatedBB = ScalarPredInst->getParent(); - BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor(); - assert(PredicatingBB && "Predicated block has no single predecessor."); - - // By current pack/unpack logic we need to generate only a single phi node: if - // a vector value for the predicated instruction exists at this point it means - // the instruction has vector users only, and a phi for the vector value is - // needed. In this case the recipe of the predicated instruction is marked to - // also do that packing, thereby "hoisting" the insert-element sequence. - // Otherwise, a phi node for the scalar value is needed. - unsigned Part = State.Instance->Part; - if (State.ValueMap.hasVectorValue(PredInst, Part)) { - Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part); - InsertElementInst *IEI = cast<InsertElementInst>(VectorValue); - PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2); - VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector. - VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element. - State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache. - } else { - Type *PredInstType = PredInst->getType(); - PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2); - Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB); - Phi->addIncoming(ScalarPredInst, PredicatedBB); - State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi); - } -} - bool LoopVectorizePass::processLoop(Loop *L) { assert(L->empty() && "Only process inner loops."); @@ -8437,7 +7902,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { CM.collectValuesToIgnore(); // Use the planner for vectorization. - LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM); + LoopVectorizationPlanner LVP(L, LI, &LVL, CM); // Get user vectorization factor. unsigned UserVF = Hints.getWidth(); @@ -8524,8 +7989,6 @@ bool LoopVectorizePass::processLoop(Loop *L) { DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); } - LVP.setBestPlan(VF.Width, IC); - using namespace ore; if (!VectorizeLoop) { assert(IC > 1 && "interleave count should not be 1 or 0"); @@ -8533,7 +7996,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { // interleave it. InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM); - LVP.executePlan(Unroller, DT); + LVP.executePlan(Unroller); ORE->emit(OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), L->getHeader()) @@ -8543,7 +8006,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { // If we decided that it is *legal* to vectorize the loop, then do it. InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, &LVL, &CM); - LVP.executePlan(LB, DT); + LVP.executePlan(LB); ++LoopsVectorized; // Add metadata to disable runtime unrolling a scalar loop when there are |