From a88bbf0e225973dcc282e70acb01a883f172ffec Mon Sep 17 00:00:00 2001 From: Wei Mi Date: Fri, 21 Apr 2017 15:50:16 +0000 Subject: [PATCH] [ConstHoisting] Add BFI in constanthoisting pass and select the best insertion places based on it. Existing constant hoisting pass will merge a group of contants in a small range and hoist the const materialization code to the common dominator of their uses. However, if the uses are all in cold pathes, existing implementation may hoist the materialization code from cold pathes to a hot place. This may hurt performance. The patch introduces BFI to the pass and selects the best insertion places based on it. The change is controlled by an option consthoist-with-block-frequency which is off by default for now. Differential Revision: https://reviews.llvm.org/D28962 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@300989 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../llvm/Transforms/Scalar/ConstantHoisting.h | 8 +- lib/Transforms/Scalar/ConstantHoisting.cpp | 212 +++++++++++++++--- test/CodeGen/X86/constant-hoisting-bfi.ll | 115 ++++++++++ test/CodeGen/X86/fold-tied-op.ll | 1 - test/Transforms/ConstantHoisting/X86/ehpad.ll | 11 + 5 files changed, 313 insertions(+), 34 deletions(-) create mode 100644 test/CodeGen/X86/constant-hoisting-bfi.ll diff --git a/include/llvm/Transforms/Scalar/ConstantHoisting.h b/include/llvm/Transforms/Scalar/ConstantHoisting.h index 3e2b3327a9f..edc91add7a7 100644 --- a/include/llvm/Transforms/Scalar/ConstantHoisting.h +++ b/include/llvm/Transforms/Scalar/ConstantHoisting.h @@ -36,6 +36,7 @@ #ifndef LLVM_TRANSFORMS_SCALAR_CONSTANTHOISTING_H #define LLVM_TRANSFORMS_SCALAR_CONSTANTHOISTING_H +#include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/PassManager.h" @@ -98,7 +99,7 @@ public: // Glue for old PM. bool runImpl(Function &F, TargetTransformInfo &TTI, DominatorTree &DT, - BasicBlock &Entry); + BlockFrequencyInfo *BFI, BasicBlock &Entry); void releaseMemory() { ConstantVec.clear(); @@ -112,6 +113,7 @@ private: const TargetTransformInfo *TTI; DominatorTree *DT; + BlockFrequencyInfo *BFI; BasicBlock *Entry; /// Keeps track of constant candidates found in the function. @@ -124,8 +126,8 @@ private: SmallVector ConstantVec; Instruction *findMatInsertPt(Instruction *Inst, unsigned Idx = ~0U) const; - Instruction *findConstantInsertionPoint( - const consthoist::ConstantInfo &ConstInfo) const; + SmallPtrSet + findConstantInsertionPoint(const consthoist::ConstantInfo &ConstInfo) const; void collectConstantCandidates(ConstCandMapType &ConstCandMap, Instruction *Inst, unsigned Idx, ConstantInt *ConstInt); diff --git a/lib/Transforms/Scalar/ConstantHoisting.cpp b/lib/Transforms/Scalar/ConstantHoisting.cpp index ee6333e8871..f62e111460c 100644 --- a/lib/Transforms/Scalar/ConstantHoisting.cpp +++ b/lib/Transforms/Scalar/ConstantHoisting.cpp @@ -53,6 +53,12 @@ using namespace consthoist; STATISTIC(NumConstantsHoisted, "Number of constants hoisted"); STATISTIC(NumConstantsRebased, "Number of constants rebased"); +static cl::opt ConstHoistWithBlockFrequency( + "consthoist-with-block-frequency", cl::init(false), cl::Hidden, + cl::desc("Enable the use of the block frequency analysis to reduce the " + "chance to execute const materialization more frequently than " + "without hoisting.")); + namespace { /// \brief The constant hoisting pass. class ConstantHoistingLegacyPass : public FunctionPass { @@ -68,6 +74,8 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); + if (ConstHoistWithBlockFrequency) + AU.addRequired(); AU.addRequired(); AU.addRequired(); } @@ -82,6 +90,7 @@ private: char ConstantHoistingLegacyPass::ID = 0; INITIALIZE_PASS_BEGIN(ConstantHoistingLegacyPass, "consthoist", "Constant Hoisting", false, false) +INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_END(ConstantHoistingLegacyPass, "consthoist", @@ -99,9 +108,13 @@ bool ConstantHoistingLegacyPass::runOnFunction(Function &Fn) { DEBUG(dbgs() << "********** Begin Constant Hoisting **********\n"); DEBUG(dbgs() << "********** Function: " << Fn.getName() << '\n'); - bool MadeChange = Impl.runImpl( - Fn, getAnalysis().getTTI(Fn), - getAnalysis().getDomTree(), Fn.getEntryBlock()); + bool MadeChange = + Impl.runImpl(Fn, getAnalysis().getTTI(Fn), + getAnalysis().getDomTree(), + ConstHoistWithBlockFrequency + ? &getAnalysis().getBFI() + : nullptr, + Fn.getEntryBlock()); if (MadeChange) { DEBUG(dbgs() << "********** Function after Constant Hoisting: " @@ -148,33 +161,142 @@ Instruction *ConstantHoistingPass::findMatInsertPt(Instruction *Inst, return IDom->getBlock()->getTerminator(); } +/// \brief Given \p BBs as input, find another set of BBs which collectively +/// dominates \p BBs and have the minimal sum of frequencies. Return the BB +/// set found in \p BBs. +void findBestInsertionSet(DominatorTree &DT, BlockFrequencyInfo &BFI, + BasicBlock *Entry, + SmallPtrSet &BBs) { + assert(!BBs.count(Entry) && "Assume Entry is not in BBs"); + // Nodes on the current path to the root. + SmallPtrSet Path; + // Candidates includes any block 'BB' in set 'BBs' that is not strictly + // dominated by any other blocks in set 'BBs', and all nodes in the path + // in the dominator tree from Entry to 'BB'. + SmallPtrSet Candidates; + for (auto BB : BBs) { + Path.clear(); + // Walk up the dominator tree until Entry or another BB in BBs + // is reached. Insert the nodes on the way to the Path. + BasicBlock *Node = BB; + // The "Path" is a candidate path to be added into Candidates set. + bool isCandidate = false; + do { + Path.insert(Node); + if (Node == Entry || Candidates.count(Node)) { + isCandidate = true; + break; + } + assert(DT.getNode(Node)->getIDom() && + "Entry doens't dominate current Node"); + Node = DT.getNode(Node)->getIDom()->getBlock(); + } while (!BBs.count(Node)); + + // If isCandidate is false, Node is another Block in BBs dominating + // current 'BB'. Drop the nodes on the Path. + if (!isCandidate) + continue; + + // Add nodes on the Path into Candidates. + Candidates.insert(Path.begin(), Path.end()); + } + + // Sort the nodes in Candidates in top-down order and save the nodes + // in Orders. + unsigned Idx = 0; + SmallVector Orders; + Orders.push_back(Entry); + while (Idx != Orders.size()) { + BasicBlock *Node = Orders[Idx++]; + for (auto ChildDomNode : DT.getNode(Node)->getChildren()) { + if (Candidates.count(ChildDomNode->getBlock())) + Orders.push_back(ChildDomNode->getBlock()); + } + } + + // Visit Orders in bottom-up order. + typedef std::pair, BlockFrequency> + InsertPtsCostPair; + // InsertPtsMap is a map from a BB to the best insertion points for the + // subtree of BB (subtree not including the BB itself). + DenseMap InsertPtsMap; + InsertPtsMap.reserve(Orders.size() + 1); + for (auto RIt = Orders.rbegin(); RIt != Orders.rend(); RIt++) { + BasicBlock *Node = *RIt; + bool NodeInBBs = BBs.count(Node); + SmallPtrSet &InsertPts = InsertPtsMap[Node].first; + BlockFrequency &InsertPtsFreq = InsertPtsMap[Node].second; + + // Return the optimal insert points in BBs. + if (Node == Entry) { + BBs.clear(); + if (InsertPtsFreq > BFI.getBlockFreq(Node)) + BBs.insert(Entry); + else + BBs.insert(InsertPts.begin(), InsertPts.end()); + break; + } + + BasicBlock *Parent = DT.getNode(Node)->getIDom()->getBlock(); + // Initially, ParentInsertPts is empty and ParentPtsFreq is 0. Every child + // will update its parent's ParentInsertPts and ParentPtsFreq. + SmallPtrSet &ParentInsertPts = InsertPtsMap[Parent].first; + BlockFrequency &ParentPtsFreq = InsertPtsMap[Parent].second; + // Choose to insert in Node or in subtree of Node. + if (InsertPtsFreq > BFI.getBlockFreq(Node) || NodeInBBs) { + ParentInsertPts.insert(Node); + ParentPtsFreq += BFI.getBlockFreq(Node); + } else { + ParentInsertPts.insert(InsertPts.begin(), InsertPts.end()); + ParentPtsFreq += InsertPtsFreq; + } + } +} + /// \brief Find an insertion point that dominates all uses. -Instruction *ConstantHoistingPass::findConstantInsertionPoint( +SmallPtrSet ConstantHoistingPass::findConstantInsertionPoint( const ConstantInfo &ConstInfo) const { assert(!ConstInfo.RebasedConstants.empty() && "Invalid constant info entry."); // Collect all basic blocks. SmallPtrSet BBs; + SmallPtrSet InsertPts; for (auto const &RCI : ConstInfo.RebasedConstants) for (auto const &U : RCI.Uses) BBs.insert(findMatInsertPt(U.Inst, U.OpndIdx)->getParent()); - if (BBs.count(Entry)) - return &Entry->front(); + if (BBs.count(Entry)) { + InsertPts.insert(&Entry->front()); + return InsertPts; + } + + if (BFI) { + findBestInsertionSet(*DT, *BFI, Entry, BBs); + for (auto BB : BBs) { + BasicBlock::iterator InsertPt = BB->begin(); + for (; isa(InsertPt) || InsertPt->isEHPad(); ++InsertPt) + ; + InsertPts.insert(&*InsertPt); + } + return InsertPts; + } while (BBs.size() >= 2) { BasicBlock *BB, *BB1, *BB2; BB1 = *BBs.begin(); BB2 = *std::next(BBs.begin()); BB = DT->findNearestCommonDominator(BB1, BB2); - if (BB == Entry) - return &Entry->front(); + if (BB == Entry) { + InsertPts.insert(&Entry->front()); + return InsertPts; + } BBs.erase(BB1); BBs.erase(BB2); BBs.insert(BB); } assert((BBs.size() == 1) && "Expected only one element."); Instruction &FirstInst = (*BBs.begin())->front(); - return findMatInsertPt(&FirstInst); + InsertPts.insert(findMatInsertPt(&FirstInst)); + return InsertPts; } @@ -557,29 +679,54 @@ bool ConstantHoistingPass::emitBaseConstants() { bool MadeChange = false; for (auto const &ConstInfo : ConstantVec) { // Hoist and hide the base constant behind a bitcast. - Instruction *IP = findConstantInsertionPoint(ConstInfo); - IntegerType *Ty = ConstInfo.BaseConstant->getType(); - Instruction *Base = - new BitCastInst(ConstInfo.BaseConstant, Ty, "const", IP); - DEBUG(dbgs() << "Hoist constant (" << *ConstInfo.BaseConstant << ") to BB " - << IP->getParent()->getName() << '\n' << *Base << '\n'); - NumConstantsHoisted++; + SmallPtrSet IPSet = findConstantInsertionPoint(ConstInfo); + assert(!IPSet.empty() && "IPSet is empty"); + + unsigned UsesNum = 0; + unsigned ReBasesNum = 0; + for (Instruction *IP : IPSet) { + IntegerType *Ty = ConstInfo.BaseConstant->getType(); + Instruction *Base = + new BitCastInst(ConstInfo.BaseConstant, Ty, "const", IP); + DEBUG(dbgs() << "Hoist constant (" << *ConstInfo.BaseConstant + << ") to BB " << IP->getParent()->getName() << '\n' + << *Base << '\n'); + + // Emit materialization code for all rebased constants. + unsigned Uses = 0; + for (auto const &RCI : ConstInfo.RebasedConstants) { + for (auto const &U : RCI.Uses) { + Uses++; + BasicBlock *OrigMatInsertBB = + findMatInsertPt(U.Inst, U.OpndIdx)->getParent(); + // If Base constant is to be inserted in multiple places, + // generate rebase for U using the Base dominating U. + if (IPSet.size() == 1 || + DT->dominates(Base->getParent(), OrigMatInsertBB)) { + emitBaseConstants(Base, RCI.Offset, U); + ReBasesNum++; + } + } + } + UsesNum = Uses; - // Emit materialization code for all rebased constants. - for (auto const &RCI : ConstInfo.RebasedConstants) { - NumConstantsRebased++; - for (auto const &U : RCI.Uses) - emitBaseConstants(Base, RCI.Offset, U); + // Use the same debug location as the last user of the constant. + assert(!Base->use_empty() && "The use list is empty!?"); + assert(isa(Base->user_back()) && + "All uses should be instructions."); + Base->setDebugLoc(cast(Base->user_back())->getDebugLoc()); } + (void)UsesNum; + (void)ReBasesNum; + // Expect all uses are rebased after rebase is done. + assert(UsesNum == ReBasesNum && "Not all uses are rebased"); + + NumConstantsHoisted++; - // Use the same debug location as the last user of the constant. - assert(!Base->use_empty() && "The use list is empty!?"); - assert(isa(Base->user_back()) && - "All uses should be instructions."); - Base->setDebugLoc(cast(Base->user_back())->getDebugLoc()); + // Base constant is also included in ConstInfo.RebasedConstants, so + // deduct 1 from ConstInfo.RebasedConstants.size(). + NumConstantsRebased = ConstInfo.RebasedConstants.size() - 1; - // Correct for base constant, which we counted above too. - NumConstantsRebased--; MadeChange = true; } return MadeChange; @@ -595,9 +742,11 @@ void ConstantHoistingPass::deleteDeadCastInst() const { /// \brief Optimize expensive integer constants in the given function. bool ConstantHoistingPass::runImpl(Function &Fn, TargetTransformInfo &TTI, - DominatorTree &DT, BasicBlock &Entry) { + DominatorTree &DT, BlockFrequencyInfo *BFI, + BasicBlock &Entry) { this->TTI = &TTI; this->DT = &DT; + this->BFI = BFI; this->Entry = &Entry; // Collect all constant candidates. collectConstantCandidates(Fn); @@ -628,7 +777,10 @@ PreservedAnalyses ConstantHoistingPass::run(Function &F, FunctionAnalysisManager &AM) { auto &DT = AM.getResult(F); auto &TTI = AM.getResult(F); - if (!runImpl(F, TTI, DT, F.getEntryBlock())) + auto BFI = ConstHoistWithBlockFrequency + ? &AM.getResult(F) + : nullptr; + if (!runImpl(F, TTI, DT, BFI, F.getEntryBlock())) return PreservedAnalyses::all(); PreservedAnalyses PA; diff --git a/test/CodeGen/X86/constant-hoisting-bfi.ll b/test/CodeGen/X86/constant-hoisting-bfi.ll new file mode 100644 index 00000000000..83589b7706f --- /dev/null +++ b/test/CodeGen/X86/constant-hoisting-bfi.ll @@ -0,0 +1,115 @@ +; RUN: opt -consthoist -mtriple=x86_64-unknown-linux-gnu -consthoist-with-block-frequency=true -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +; Check when BFI is enabled for constant hoisting, constant 214748364701 +; will not be hoisted to the func entry. +; CHECK-LABEL: @foo( +; CHECK: entry: +; CHECK-NOT: bitcast i64 214748364701 to i64 +; CHECK: if.then: + +; Function Attrs: norecurse nounwind uwtable +define i64 @foo(i64* nocapture %a) { +entry: + %arrayidx = getelementptr inbounds i64, i64* %a, i64 9 + %t0 = load i64, i64* %arrayidx, align 8 + %cmp = icmp slt i64 %t0, 564 + br i1 %cmp, label %if.then, label %if.else5 + +if.then: ; preds = %entry + %arrayidx1 = getelementptr inbounds i64, i64* %a, i64 5 + %t1 = load i64, i64* %arrayidx1, align 8 + %cmp2 = icmp slt i64 %t1, 1009 + br i1 %cmp2, label %if.then3, label %return + +if.then3: ; preds = %if.then + %arrayidx4 = getelementptr inbounds i64, i64* %a, i64 6 + %t2 = load i64, i64* %arrayidx4, align 8 + %inc = add nsw i64 %t2, 1 + store i64 %inc, i64* %arrayidx4, align 8 + br label %return + +if.else5: ; preds = %entry + %arrayidx6 = getelementptr inbounds i64, i64* %a, i64 6 + %t3 = load i64, i64* %arrayidx6, align 8 + %cmp7 = icmp slt i64 %t3, 3512 + br i1 %cmp7, label %if.then8, label %return + +if.then8: ; preds = %if.else5 + %arrayidx9 = getelementptr inbounds i64, i64* %a, i64 7 + %t4 = load i64, i64* %arrayidx9, align 8 + %inc10 = add nsw i64 %t4, 1 + store i64 %inc10, i64* %arrayidx9, align 8 + br label %return + +return: ; preds = %if.else5, %if.then, %if.then8, %if.then3 + %retval.0 = phi i64 [ 214748364701, %if.then3 ], [ 214748364701, %if.then8 ], [ 250148364702, %if.then ], [ 256148364704, %if.else5 ] + ret i64 %retval.0 +} + +; Check when BFI is enabled for constant hoisting, constant 214748364701 +; in while.body will be hoisted to while.body.preheader. 214748364701 in +; if.then16 and if.else10 will be merged and hoisted to the beginning of +; if.else10 because if.else10 dominates if.then16. +; CHECK-LABEL: @goo( +; CHECK: entry: +; CHECK-NOT: bitcast i64 214748364701 to i64 +; CHECK: while.body.preheader: +; CHECK-NEXT: bitcast i64 214748364701 to i64 +; CHECK-NOT: bitcast i64 214748364701 to i64 +; CHECK: if.else10: +; CHECK-NEXT: bitcast i64 214748364701 to i64 +; CHECK-NOT: bitcast i64 214748364701 to i64 +define i64 @goo(i64* nocapture %a) { +entry: + %arrayidx = getelementptr inbounds i64, i64* %a, i64 9 + %t0 = load i64, i64* %arrayidx, align 8 + %cmp = icmp ult i64 %t0, 56 + br i1 %cmp, label %if.then, label %if.else10, !prof !0 + +if.then: ; preds = %entry + %arrayidx1 = getelementptr inbounds i64, i64* %a, i64 5 + %t1 = load i64, i64* %arrayidx1, align 8 + %cmp2 = icmp ult i64 %t1, 10 + br i1 %cmp2, label %while.cond.preheader, label %return, !prof !0 + +while.cond.preheader: ; preds = %if.then + %arrayidx7 = getelementptr inbounds i64, i64* %a, i64 6 + %t2 = load i64, i64* %arrayidx7, align 8 + %cmp823 = icmp ugt i64 %t2, 10000 + br i1 %cmp823, label %while.body.preheader, label %return + +while.body.preheader: ; preds = %while.cond.preheader + br label %while.body + +while.body: ; preds = %while.body.preheader, %while.body + %t3 = phi i64 [ %add, %while.body ], [ %t2, %while.body.preheader ] + %add = add i64 %t3, 214748364701 + %cmp8 = icmp ugt i64 %add, 10000 + br i1 %cmp8, label %while.body, label %while.cond.return.loopexit_crit_edge + +if.else10: ; preds = %entry + %arrayidx11 = getelementptr inbounds i64, i64* %a, i64 6 + %t4 = load i64, i64* %arrayidx11, align 8 + %add2 = add i64 %t4, 214748364701 + %cmp12 = icmp ult i64 %add2, 35 + br i1 %cmp12, label %if.then16, label %return, !prof !0 + +if.then16: ; preds = %if.else10 + %arrayidx17 = getelementptr inbounds i64, i64* %a, i64 7 + %t5 = load i64, i64* %arrayidx17, align 8 + %inc = add i64 %t5, 1 + store i64 %inc, i64* %arrayidx17, align 8 + br label %return + +while.cond.return.loopexit_crit_edge: ; preds = %while.body + store i64 %add, i64* %arrayidx7, align 8 + br label %return + +return: ; preds = %while.cond.preheader, %while.cond.return.loopexit_crit_edge, %if.else10, %if.then, %if.then16 + %retval.0 = phi i64 [ 214748364701, %if.then16 ], [ 0, %if.then ], [ 0, %if.else10 ], [ 0, %while.cond.return.loopexit_crit_edge ], [ 0, %while.cond.preheader ] + ret i64 %retval.0 +} + +!0 = !{!"branch_weights", i32 1, i32 2000} diff --git a/test/CodeGen/X86/fold-tied-op.ll b/test/CodeGen/X86/fold-tied-op.ll index 5c481197c3b..d68236e9d25 100644 --- a/test/CodeGen/X86/fold-tied-op.ll +++ b/test/CodeGen/X86/fold-tied-op.ll @@ -7,7 +7,6 @@ target triple = "i386--netbsd" ; CHECK-LABEL: fn1 ; CHECK: addl {{.*#+}} 4-byte Folded Reload -; CHECK: addl {{.*#+}} 4-byte Folded Reload ; CHECK: imull {{.*#+}} 4-byte Folded Reload ; CHECK: orl {{.*#+}} 4-byte Folded Reload ; CHECK: retl diff --git a/test/Transforms/ConstantHoisting/X86/ehpad.ll b/test/Transforms/ConstantHoisting/X86/ehpad.ll index 3178e87f754..4f87572f344 100644 --- a/test/Transforms/ConstantHoisting/X86/ehpad.ll +++ b/test/Transforms/ConstantHoisting/X86/ehpad.ll @@ -1,4 +1,5 @@ ; RUN: opt -S -consthoist < %s | FileCheck %s +; RUN: opt -S -consthoist -consthoist-with-block-frequency=true < %s | FileCheck --check-prefix=BFIHOIST %s ; FIXME: The catchpad doesn't even use the constant, so a better fix would be to ; insert the bitcast in the catchpad block. @@ -11,6 +12,16 @@ target triple = "x86_64-pc-windows-msvc" ; CHECK-NEXT: bitcast i64 9209618997431186100 to i64 ; CHECK-NEXT: br i1 %tobool +; BFIHOIST-LABEL: define i32 @main +; BFIHOIST: then: +; BFIHOIST: %[[CONST1:.*]] = bitcast i64 9209618997431186100 to i64 +; BFIHOIST: %add = add i64 %call4, %[[CONST1]] +; BFIHOIST: br label %endif +; BFIHOIST: else: +; BFIHOIST: %[[CONST2:.*]] = bitcast i64 9209618997431186100 to i64 +; BFIHOIST: %add6 = add i64 %call5, %[[CONST2]] +; BFIHOIST: br label %endif + ; Function Attrs: norecurse define i32 @main(i32 %argc, i8** nocapture readnone %argv) local_unnamed_addr #0 personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*) { %call = tail call i64 @fn(i64 0) -- 2.50.1