From: Hiroshi Yamauchi Date: Thu, 2 Nov 2017 22:26:51 +0000 (+0000) Subject: Irreducible loop metadata for more accurate block frequency under PGO. X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=dd33e177dd838793692d7a291dc5552e30642842;p=llvm Irreducible loop metadata for more accurate block frequency under PGO. Summary: Currently the block frequency analysis is an approximation for irreducible loops. The new irreducible loop metadata is used to annotate the irreducible loop headers with their header weights based on the PGO profile (currently this is approximated to be evenly weighted) and to help improve the accuracy of the block frequency analysis for irreducible loops. This patch is a basic support for this. Reviewers: davidxl Reviewed By: davidxl Subscribers: mehdi_amini, llvm-commits, eraman Differential Revision: https://reviews.llvm.org/D39028 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317278 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/docs/LangRef.rst b/docs/LangRef.rst index 9d910568bd5..6823fe5fcd7 100644 --- a/docs/LangRef.rst +++ b/docs/LangRef.rst @@ -5194,6 +5194,29 @@ the loop identifier metadata node directly: !1 = !{!1} ; an identifier for the inner loop !2 = !{!2} ; an identifier for the outer loop +'``irr_loop``' Metadata +^^^^^^^^^^^^^^^^^^^^^^^ + +``irr_loop`` metadata may be attached to the terminator instruction of a basic +block that's an irreducible loop header (note that an irreducible loop has more +than once header basic blocks.) If ``irr_loop`` metadata is attached to the +terminator instruction of a basic block that is not really an irreducible loop +header, the behavior is undefined. The intent of this metadata is to improve the +accuracy of the block frequency propagation. For example, in the code below, the +block ``header0`` may have a loop header weight (relative to the other headers of +the irreducible loop) of 100: + +.. code-block:: llvm + + header0: + ... + br i1 %cmp, label %t1, label %t2, !irr_loop !0 + + ... + !0 = !{"loop_header_weight", i64 100} + +Irreducible loop header weights are typically based on profile data. + '``invariant.group``' Metadata ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/include/llvm/Analysis/BlockFrequencyInfo.h b/include/llvm/Analysis/BlockFrequencyInfo.h index d663b09d5cf..89370cbeeea 100644 --- a/include/llvm/Analysis/BlockFrequencyInfo.h +++ b/include/llvm/Analysis/BlockFrequencyInfo.h @@ -75,6 +75,10 @@ public: /// the enclosing function's count (if available) and returns the value. Optional getProfileCountFromFreq(uint64_t Freq) const; + /// \brief Returns true if \p BB is an irreducible loop header + /// block. Otherwise false. + bool isIrrLoopHeader(const BasicBlock *BB); + // Set the frequency of the given basic block. void setBlockFreq(const BasicBlock *BB, uint64_t Freq); diff --git a/include/llvm/Analysis/BlockFrequencyInfoImpl.h b/include/llvm/Analysis/BlockFrequencyInfoImpl.h index 7f166f4a646..7b916e3653b 100644 --- a/include/llvm/Analysis/BlockFrequencyInfoImpl.h +++ b/include/llvm/Analysis/BlockFrequencyInfoImpl.h @@ -20,6 +20,7 @@ #include "llvm/ADT/Optional.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/SparseBitVector.h" #include "llvm/ADT/Twine.h" #include "llvm/ADT/iterator_range.h" #include "llvm/IR/BasicBlock.h" @@ -414,6 +415,10 @@ public: /// \brief Data about each block. This is used downstream. std::vector Freqs; + /// \brief Whether each block is an irreducible loop header. + /// This is used downstream. + SparseBitVector<> IsIrrLoopHeader; + /// \brief Loop data: see initializeLoops(). std::vector Working; @@ -492,6 +497,8 @@ public: /// the backedges going into each of the loop headers. void adjustLoopHeaderMass(LoopData &Loop); + void distributeIrrLoopHeaderMass(Distribution &Dist); + /// \brief Package up a loop. void packageLoop(LoopData &Loop); @@ -520,6 +527,7 @@ public: const BlockNode &Node) const; Optional getProfileCountFromFreq(const Function &F, uint64_t Freq) const; + bool isIrrLoopHeader(const BlockNode &Node); void setBlockFreq(const BlockNode &Node, uint64_t Freq); @@ -973,6 +981,10 @@ public: return BlockFrequencyInfoImplBase::getProfileCountFromFreq(F, Freq); } + bool isIrrLoopHeader(const BlockT *BB) { + return BlockFrequencyInfoImplBase::isIrrLoopHeader(getNode(BB)); + } + void setBlockFreq(const BlockT *BB, uint64_t Freq); Scaled64 getFloatingBlockFreq(const BlockT *BB) const { @@ -1140,17 +1152,39 @@ bool BlockFrequencyInfoImpl::computeMassInLoop(LoopData &Loop) { DEBUG(dbgs() << "compute-mass-in-loop: " << getLoopName(Loop) << "\n"); if (Loop.isIrreducible()) { - BlockMass Remaining = BlockMass::getFull(); + DEBUG(dbgs() << "isIrreducible = true\n"); + Distribution Dist; + unsigned NumHeadersWithWeight = 0; for (uint32_t H = 0; H < Loop.NumHeaders; ++H) { - auto &Mass = Working[Loop.Nodes[H].Index].getMass(); - Mass = Remaining * BranchProbability(1, Loop.NumHeaders - H); - Remaining -= Mass; + auto &HeaderNode = Loop.Nodes[H]; + const BlockT *Block = getBlock(HeaderNode); + IsIrrLoopHeader.set(Loop.Nodes[H].Index); + Optional HeaderWeight = Block->getIrrLoopHeaderWeight(); + if (!HeaderWeight) + continue; + DEBUG(dbgs() << getBlockName(HeaderNode) + << " has irr loop header weight " << HeaderWeight.getValue() + << "\n"); + NumHeadersWithWeight++; + uint64_t HeaderWeightValue = HeaderWeight.getValue(); + if (HeaderWeightValue) + Dist.addLocal(HeaderNode, HeaderWeightValue); } + if (NumHeadersWithWeight != Loop.NumHeaders) { + // Not all headers have a weight metadata. Distribute weight evenly. + Dist = Distribution(); + for (uint32_t H = 0; H < Loop.NumHeaders; ++H) { + auto &HeaderNode = Loop.Nodes[H]; + Dist.addLocal(HeaderNode, 1); + } + } + distributeIrrLoopHeaderMass(Dist); for (const BlockNode &M : Loop.Nodes) if (!propagateMassToSuccessors(&Loop, M)) llvm_unreachable("unhandled irreducible control flow"); - - adjustLoopHeaderMass(Loop); + if (NumHeadersWithWeight != Loop.NumHeaders) + // Not all headers have a weight metadata. Adjust header mass. + adjustLoopHeaderMass(Loop); } else { Working[Loop.getHeader().Index].getMass() = BlockMass::getFull(); if (!propagateMassToSuccessors(&Loop, Loop.getHeader())) @@ -1285,6 +1319,9 @@ raw_ostream &BlockFrequencyInfoImpl::print(raw_ostream &OS) const { BlockFrequencyInfoImplBase::getBlockProfileCount( *F->getFunction(), getNode(&BB))) OS << ", count = " << ProfileCount.getValue(); + if (Optional IrrLoopHeaderWeight = + BB.getIrrLoopHeaderWeight()) + OS << ", irr_loop_header_weight = " << IrrLoopHeaderWeight.getValue(); OS << "\n"; } diff --git a/include/llvm/CodeGen/MachineBasicBlock.h b/include/llvm/CodeGen/MachineBasicBlock.h index 51a0d96deda..0f5b04d9045 100644 --- a/include/llvm/CodeGen/MachineBasicBlock.h +++ b/include/llvm/CodeGen/MachineBasicBlock.h @@ -97,6 +97,8 @@ private: using const_probability_iterator = std::vector::const_iterator; + Optional IrrLoopHeaderWeight; + /// Keep track of the physical registers that are livein of the basicblock. using LiveInVector = std::vector; LiveInVector LiveIns; @@ -729,6 +731,14 @@ public: /// Return the MCSymbol for this basic block. MCSymbol *getSymbol() const; + Optional getIrrLoopHeaderWeight() const { + return IrrLoopHeaderWeight; + } + + void setIrrLoopHeaderWeight(uint64_t Weight) { + IrrLoopHeaderWeight = Weight; + } + private: /// Return probability iterator corresponding to the I successor iterator. probability_iterator getProbabilityIterator(succ_iterator I); diff --git a/include/llvm/CodeGen/MachineBlockFrequencyInfo.h b/include/llvm/CodeGen/MachineBlockFrequencyInfo.h index cba79c818a7..5b4b99ca0a5 100644 --- a/include/llvm/CodeGen/MachineBlockFrequencyInfo.h +++ b/include/llvm/CodeGen/MachineBlockFrequencyInfo.h @@ -62,6 +62,8 @@ public: Optional getBlockProfileCount(const MachineBasicBlock *MBB) const; Optional getProfileCountFromFreq(uint64_t Freq) const; + bool isIrrLoopHeader(const MachineBasicBlock *MBB); + const MachineFunction *getFunction() const; const MachineBranchProbabilityInfo *getMBPI() const; void view(const Twine &Name, bool isSimple = true) const; diff --git a/include/llvm/IR/BasicBlock.h b/include/llvm/IR/BasicBlock.h index 6714f2c9747..77cfc9776df 100644 --- a/include/llvm/IR/BasicBlock.h +++ b/include/llvm/IR/BasicBlock.h @@ -398,6 +398,8 @@ public: /// \brief Return true if it is legal to hoist instructions into this block. bool isLegalToHoistInto() const; + Optional getIrrLoopHeaderWeight() const; + private: /// \brief Increment the internal refcount of the number of BlockAddresses /// referencing this BasicBlock by \p Amt. diff --git a/include/llvm/IR/LLVMContext.h b/include/llvm/IR/LLVMContext.h index 9e935823c77..a95634d32c2 100644 --- a/include/llvm/IR/LLVMContext.h +++ b/include/llvm/IR/LLVMContext.h @@ -101,6 +101,7 @@ public: MD_absolute_symbol = 21, // "absolute_symbol" MD_associated = 22, // "associated" MD_callees = 23, // "callees" + MD_irr_loop = 24, // "irr_loop" }; /// Known operand bundle tag IDs, which always have the same value. All diff --git a/include/llvm/IR/MDBuilder.h b/include/llvm/IR/MDBuilder.h index d679cef95b6..15c1b9cb60e 100644 --- a/include/llvm/IR/MDBuilder.h +++ b/include/llvm/IR/MDBuilder.h @@ -173,6 +173,9 @@ public: /// base type, access type and offset relative to the base type. MDNode *createTBAAStructTagNode(MDNode *BaseType, MDNode *AccessType, uint64_t Offset, bool IsConstant = false); + + /// \brief Return metadata containing an irreducible loop header weight. + MDNode *createIrrLoopHeaderWeight(uint64_t Weight); }; } // end namespace llvm diff --git a/include/llvm/Transforms/PGOInstrumentation.h b/include/llvm/Transforms/PGOInstrumentation.h index fa7a68624ec..c2cc76c422d 100644 --- a/include/llvm/Transforms/PGOInstrumentation.h +++ b/include/llvm/Transforms/PGOInstrumentation.h @@ -68,6 +68,8 @@ public: void setProfMetadata(Module *M, Instruction *TI, ArrayRef EdgeCounts, uint64_t MaxCount); +void setIrrLoopHeaderMetadata(Module *M, Instruction *TI, uint64_t Count); + } // end namespace llvm #endif // LLVM_TRANSFORMS_PGOINSTRUMENTATION_H diff --git a/lib/Analysis/BlockFrequencyInfo.cpp b/lib/Analysis/BlockFrequencyInfo.cpp index 5d2170dcf15..41c29589521 100644 --- a/lib/Analysis/BlockFrequencyInfo.cpp +++ b/lib/Analysis/BlockFrequencyInfo.cpp @@ -218,6 +218,11 @@ BlockFrequencyInfo::getProfileCountFromFreq(uint64_t Freq) const { return BFI->getProfileCountFromFreq(*getFunction(), Freq); } +bool BlockFrequencyInfo::isIrrLoopHeader(const BasicBlock *BB) { + assert(BFI && "Expected analysis to be available"); + return BFI->isIrrLoopHeader(BB); +} + void BlockFrequencyInfo::setBlockFreq(const BasicBlock *BB, uint64_t Freq) { assert(BFI && "Expected analysis to be available"); BFI->setBlockFreq(BB, Freq); diff --git a/lib/Analysis/BlockFrequencyInfoImpl.cpp b/lib/Analysis/BlockFrequencyInfoImpl.cpp index 1030407b766..7e323022d9c 100644 --- a/lib/Analysis/BlockFrequencyInfoImpl.cpp +++ b/lib/Analysis/BlockFrequencyInfoImpl.cpp @@ -271,6 +271,7 @@ void BlockFrequencyInfoImplBase::clear() { // Swap with a default-constructed std::vector, since std::vector<>::clear() // does not actually clear heap storage. std::vector().swap(Freqs); + IsIrrLoopHeader.clear(); std::vector().swap(Working); Loops.clear(); } @@ -280,8 +281,10 @@ void BlockFrequencyInfoImplBase::clear() { /// Releases all memory not used downstream. In particular, saves Freqs. static void cleanup(BlockFrequencyInfoImplBase &BFI) { std::vector SavedFreqs(std::move(BFI.Freqs)); + SparseBitVector<> SavedIsIrrLoopHeader(std::move(BFI.IsIrrLoopHeader)); BFI.clear(); BFI.Freqs = std::move(SavedFreqs); + BFI.IsIrrLoopHeader = std::move(SavedIsIrrLoopHeader); } bool BlockFrequencyInfoImplBase::addToDist(Distribution &Dist, @@ -572,6 +575,13 @@ BlockFrequencyInfoImplBase::getProfileCountFromFreq(const Function &F, return BlockCount.getLimitedValue(); } +bool +BlockFrequencyInfoImplBase::isIrrLoopHeader(const BlockNode &Node) { + if (!Node.isValid()) + return false; + return IsIrrLoopHeader.test(Node.Index); +} + Scaled64 BlockFrequencyInfoImplBase::getFloatingBlockFreq(const BlockNode &Node) const { if (!Node.isValid()) @@ -819,3 +829,14 @@ void BlockFrequencyInfoImplBase::adjustLoopHeaderMass(LoopData &Loop) { DEBUG(debugAssign(*this, D, W.TargetNode, Taken, nullptr)); } } + +void BlockFrequencyInfoImplBase::distributeIrrLoopHeaderMass(Distribution &Dist) { + BlockMass LoopMass = BlockMass::getFull(); + DitheringDistributer D(Dist, LoopMass); + for (const Weight &W : Dist.Weights) { + BlockMass Taken = D.takeMass(W.Amount); + assert(W.Type == Weight::Local && "all weights should be local"); + Working[W.TargetNode.Index].getMass() = Taken; + DEBUG(debugAssign(*this, D, W.TargetNode, Taken, nullptr)); + } +} diff --git a/lib/CodeGen/MachineBasicBlock.cpp b/lib/CodeGen/MachineBasicBlock.cpp index d5758da0464..d65916f4966 100644 --- a/lib/CodeGen/MachineBasicBlock.cpp +++ b/lib/CodeGen/MachineBasicBlock.cpp @@ -42,6 +42,8 @@ using namespace llvm; MachineBasicBlock::MachineBasicBlock(MachineFunction &MF, const BasicBlock *B) : BB(B), Number(-1), xParent(&MF) { Insts.Parent = this; + if (B) + IrrLoopHeaderWeight = B->getIrrLoopHeaderWeight(); } MachineBasicBlock::~MachineBasicBlock() { @@ -338,6 +340,12 @@ void MachineBasicBlock::print(raw_ostream &OS, ModuleSlotTracker &MST, } OS << '\n'; } + if (IrrLoopHeaderWeight) { + if (Indexes) OS << '\t'; + OS << " Irreducible loop header weight: " + << IrrLoopHeaderWeight.getValue(); + OS << '\n'; + } } void MachineBasicBlock::printAsOperand(raw_ostream &OS, diff --git a/lib/CodeGen/MachineBlockFrequencyInfo.cpp b/lib/CodeGen/MachineBlockFrequencyInfo.cpp index 14cd91206d8..2c336e45056 100644 --- a/lib/CodeGen/MachineBlockFrequencyInfo.cpp +++ b/lib/CodeGen/MachineBlockFrequencyInfo.cpp @@ -234,6 +234,12 @@ MachineBlockFrequencyInfo::getProfileCountFromFreq(uint64_t Freq) const { return MBFI ? MBFI->getProfileCountFromFreq(*F, Freq) : None; } +bool +MachineBlockFrequencyInfo::isIrrLoopHeader(const MachineBasicBlock *MBB) { + assert(MBFI && "Expected analysis to be available"); + return MBFI->isIrrLoopHeader(MBB); +} + const MachineFunction *MachineBlockFrequencyInfo::getFunction() const { return MBFI ? MBFI->getFunction() : nullptr; } diff --git a/lib/IR/BasicBlock.cpp b/lib/IR/BasicBlock.cpp index 2b780adf6c6..22513924a96 100644 --- a/lib/IR/BasicBlock.cpp +++ b/lib/IR/BasicBlock.cpp @@ -447,3 +447,16 @@ bool BasicBlock::isLandingPad() const { const LandingPadInst *BasicBlock::getLandingPadInst() const { return dyn_cast(getFirstNonPHI()); } + +Optional BasicBlock::getIrrLoopHeaderWeight() const { + const TerminatorInst *TI = getTerminator(); + if (MDNode *MDIrrLoopHeader = + TI->getMetadata(LLVMContext::MD_irr_loop)) { + MDString *MDName = cast(MDIrrLoopHeader->getOperand(0)); + if (MDName->getString().equals("loop_header_weight")) { + auto *CI = mdconst::extract(MDIrrLoopHeader->getOperand(1)); + return Optional(CI->getValue().getZExtValue()); + } + } + return Optional(); +} diff --git a/lib/IR/LLVMContext.cpp b/lib/IR/LLVMContext.cpp index a94da5452b8..c8b7c10a9a4 100644 --- a/lib/IR/LLVMContext.cpp +++ b/lib/IR/LLVMContext.cpp @@ -60,6 +60,7 @@ LLVMContext::LLVMContext() : pImpl(new LLVMContextImpl(*this)) { {MD_absolute_symbol, "absolute_symbol"}, {MD_associated, "associated"}, {MD_callees, "callees"}, + {MD_irr_loop, "irr_loop"}, }; for (auto &MDKind : MDKinds) { diff --git a/lib/IR/MDBuilder.cpp b/lib/IR/MDBuilder.cpp index 54783e884e9..d8e64db7c5d 100644 --- a/lib/IR/MDBuilder.cpp +++ b/lib/IR/MDBuilder.cpp @@ -197,3 +197,10 @@ MDNode *MDBuilder::createTBAAStructTagNode(MDNode *BaseType, MDNode *AccessType, } return MDNode::get(Context, {BaseType, AccessType, createConstant(Off)}); } + +MDNode *MDBuilder::createIrrLoopHeaderWeight(uint64_t Weight) { + SmallVector Vals(2); + Vals[0] = createString("loop_header_weight"); + Vals[1] = createConstant(ConstantInt::get(Type::getInt64Ty(Context), Weight)); + return MDNode::get(Context, Vals); +} diff --git a/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/lib/Transforms/Instrumentation/PGOInstrumentation.cpp index 11a43e803a9..c92d48396c8 100644 --- a/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +++ b/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -844,8 +844,9 @@ public: PGOUseFunc(Function &Func, Module *Modu, std::unordered_multimap &ComdatMembers, BranchProbabilityInfo *BPI = nullptr, - BlockFrequencyInfo *BFI = nullptr) - : F(Func), M(Modu), FuncInfo(Func, ComdatMembers, false, BPI, BFI), + BlockFrequencyInfo *BFIin = nullptr) + : F(Func), M(Modu), BFI(BFIin), + FuncInfo(Func, ComdatMembers, false, BPI, BFIin), FreqAttr(FFA_Normal) {} // Read counts for the instrumented BB from profile. @@ -863,6 +864,9 @@ public: // Annotate the value profile call sites for one value kind. void annotateValueSites(uint32_t Kind); + // Annotate the irreducible loop header weights. + void annotateIrrLoopHeaderWeights(); + // The hotness of the function from the profile count. enum FuncFreqAttr { FFA_Normal, FFA_Cold, FFA_Hot }; @@ -894,6 +898,7 @@ public: private: Function &F; Module *M; + BlockFrequencyInfo *BFI; // This member stores the shared information with class PGOGenFunc. FuncPGOInstrumentation FuncInfo; @@ -1183,6 +1188,18 @@ void PGOUseFunc::setBranchWeights() { } } +void PGOUseFunc::annotateIrrLoopHeaderWeights() { + DEBUG(dbgs() << "\nAnnotating irreducible loop header weights.\n"); + // Find irr loop headers + for (auto &BB : F) { + if (BFI->isIrrLoopHeader(&BB)) { + TerminatorInst *TI = BB.getTerminator(); + const UseBBInfo &BBCountInfo = getBBInfo(&BB); + setIrrLoopHeaderMetadata(M, TI, BBCountInfo.CountValue); + } + } +} + void SelectInstVisitor::instrumentOneSelectInst(SelectInst &SI) { Module *M = F.getParent(); IRBuilder<> Builder(&SI); @@ -1441,6 +1458,7 @@ static bool annotateAllFunctions( Func.populateCounters(); Func.setBranchWeights(); Func.annotateValueSites(); + Func.annotateIrrLoopHeaderWeights(); PGOUseFunc::FuncFreqAttr FreqAttr = Func.getFuncFreqAttr(); if (FreqAttr == PGOUseFunc::FFA_Cold) ColdFunctions.push_back(&F); @@ -1582,6 +1600,12 @@ void llvm::setProfMetadata(Module *M, Instruction *TI, namespace llvm { +void setIrrLoopHeaderMetadata(Module *M, Instruction *TI, uint64_t Count) { + MDBuilder MDB(M->getContext()); + TI->setMetadata(llvm::LLVMContext::MD_irr_loop, + MDB.createIrrLoopHeaderWeight(Count)); +} + template <> struct GraphTraits { using NodeRef = const BasicBlock *; using ChildIteratorType = succ_const_iterator; diff --git a/test/Analysis/BlockFrequencyInfo/irreducible_pgo.ll b/test/Analysis/BlockFrequencyInfo/irreducible_pgo.ll new file mode 100644 index 00000000000..0a580276d95 --- /dev/null +++ b/test/Analysis/BlockFrequencyInfo/irreducible_pgo.ll @@ -0,0 +1,208 @@ +; RUN: opt < %s -analyze -block-freq | FileCheck %s +; RUN: opt < %s -passes='print' -disable-output 2>&1 | FileCheck %s + +; Function Attrs: noinline norecurse nounwind readnone uwtable +define i32 @_Z11irreducibleii(i32 %iter_outer, i32 %iter_inner) local_unnamed_addr !prof !27 { +entry: + %cmp24 = icmp sgt i32 %iter_outer, 0 + br i1 %cmp24, label %for.body, label %entry.for.cond.cleanup_crit_edge, !prof !28 + +entry.for.cond.cleanup_crit_edge: ; preds = %entry + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.end, %entry.for.cond.cleanup_crit_edge + %sum.0.lcssa = phi i32 [ 0, %entry.for.cond.cleanup_crit_edge ], [ %sum.1, %for.end ] + ret i32 %sum.0.lcssa + +for.body: ; preds = %for.end, %entry + %k.026 = phi i32 [ %inc12, %for.end ], [ 0, %entry ] + %sum.025 = phi i32 [ %sum.1, %for.end ], [ 0, %entry ] + %rem23 = and i32 %k.026, 1 + %cmp1 = icmp eq i32 %rem23, 0 + br i1 %cmp1, label %entry8, label %for.cond2, !prof !29 + +for.cond2: ; preds = %if.end9, %for.body + %sum.1 = phi i32 [ %add10, %if.end9 ], [ %sum.025, %for.body ] + %i.0 = phi i32 [ %inc, %if.end9 ], [ 0, %for.body ] + %cmp3 = icmp slt i32 %i.0, %iter_inner + br i1 %cmp3, label %for.body4, label %for.end, !prof !30, !irr_loop !31 + +for.body4: ; preds = %for.cond2 + %rem5 = srem i32 %k.026, 3 + %cmp6 = icmp eq i32 %rem5, 0 + br i1 %cmp6, label %entry8, label %if.end9, !prof !32 + +entry8: ; preds = %for.body4, %for.body + %sum.2 = phi i32 [ %sum.025, %for.body ], [ %sum.1, %for.body4 ] + %i.1 = phi i32 [ 0, %for.body ], [ %i.0, %for.body4 ] + %add = add nsw i32 %sum.2, 4 + br label %if.end9, !irr_loop !33 + +if.end9: ; preds = %entry8, %for.body4 + %sum.3 = phi i32 [ %add, %entry8 ], [ %sum.1, %for.body4 ] + %i.2 = phi i32 [ %i.1, %entry8 ], [ %i.0, %for.body4 ] + %add10 = add nsw i32 %sum.3, 1 + %inc = add nsw i32 %i.2, 1 + br label %for.cond2, !irr_loop !34 + +for.end: ; preds = %for.cond2 + %inc12 = add nuw nsw i32 %k.026, 1 + %exitcond = icmp eq i32 %inc12, %iter_outer + br i1 %exitcond, label %for.cond.cleanup, label %for.body, !prof !35 +} + +!27 = !{!"function_entry_count", i64 1} +!28 = !{!"branch_weights", i32 1, i32 0} +!29 = !{!"branch_weights", i32 50, i32 50} +!30 = !{!"branch_weights", i32 950, i32 100} +!31 = !{!"loop_header_weight", i64 1050} +!32 = !{!"branch_weights", i32 323, i32 627} +!33 = !{!"loop_header_weight", i64 373} +!34 = !{!"loop_header_weight", i64 1000} +!35 = !{!"branch_weights", i32 1, i32 99} + +; CHECK-LABEL: Printing analysis {{.*}} for function '_Z11irreducibleii': +; CHECK-NEXT: block-frequency-info: _Z11irreducibleii +; CHECK-NEXT: - entry: {{.*}} count = 1 +; CHECK-NEXT: - entry.for.cond.cleanup_crit_edge: {{.*}} count = 0 +; CHECK-NEXT: - for.cond.cleanup: {{.*}} count = 1 +; CHECK-NEXT: - for.body: {{.*}} count = 100 +; CHECK-NEXT: - for.cond2: {{.*}} count = 1050, irr_loop_header_weight = 1050 +; CHECK-NEXT: - for.body4: {{.*}} count = 950 +; CHECK-NEXT: - entry8: {{.*}} count = 373, irr_loop_header_weight = 373 +; CHECK-NEXT: - if.end9: {{.*}} count = 1000, irr_loop_header_weight = 1000 +; CHECK-NEXT: - for.end: {{.*}} count = 100 + +@targets = local_unnamed_addr global [256 x i8*] zeroinitializer, align 16 +@tracing = local_unnamed_addr global i32 0, align 4 + +; Function Attrs: noinline norecurse nounwind uwtable +define i32 @_Z11irreduciblePh(i8* nocapture readonly %p) !prof !27 { +entry: + store <2 x i8*> , <2 x i8*>* bitcast ([256 x i8*]* @targets to <2 x i8*>*), align 16 + store i8* blockaddress(@_Z11irreduciblePh, %TARGET_2), i8** getelementptr inbounds ([256 x i8*], [256 x i8*]* @targets, i64 0, i64 2), align 16 + %0 = load i32, i32* @tracing, align 4 + %tobool = icmp eq i32 %0, 0 + br label %for.cond1 + +for.cond1: ; preds = %sw.default, %entry + %p.addr.0 = phi i8* [ %p, %entry ], [ %p.addr.4, %sw.default ] + %sum.0 = phi i32 [ 0, %entry ], [ %add25, %sw.default ] + %incdec.ptr = getelementptr inbounds i8, i8* %p.addr.0, i64 1 + %1 = load i8, i8* %p.addr.0, align 1 + %incdec.ptr2 = getelementptr inbounds i8, i8* %p.addr.0, i64 2 + %2 = load i8, i8* %incdec.ptr, align 1 + %conv3 = zext i8 %2 to i32 + br label %dispatch_op + +dispatch_op: ; preds = %sw.bb6, %for.cond1 + %p.addr.1 = phi i8* [ %incdec.ptr2, %for.cond1 ], [ %p.addr.2, %sw.bb6 ] + %op.0 = phi i8 [ %1, %for.cond1 ], [ 1, %sw.bb6 ] + %oparg.0 = phi i32 [ %conv3, %for.cond1 ], [ %oparg.2, %sw.bb6 ] + %sum.1 = phi i32 [ %sum.0, %for.cond1 ], [ %add7, %sw.bb6 ] + switch i8 %op.0, label %sw.default [ + i8 0, label %sw.bb + i8 1, label %dispatch_op.sw.bb6_crit_edge + i8 2, label %sw.bb15 + ], !prof !36 + +dispatch_op.sw.bb6_crit_edge: ; preds = %dispatch_op + br label %sw.bb6 + +sw.bb: ; preds = %indirectgoto, %dispatch_op + %oparg.1 = phi i32 [ %oparg.0, %dispatch_op ], [ 0, %indirectgoto ] + %sum.2 = phi i32 [ %sum.1, %dispatch_op ], [ %sum.7, %indirectgoto ] + %add.neg = sub i32 -5, %oparg.1 + %sub = add i32 %add.neg, %sum.2 + br label %exit + +TARGET_1: ; preds = %indirectgoto + %incdec.ptr4 = getelementptr inbounds i8, i8* %add.ptr.pn, i64 2 + %3 = load i8, i8* %p.addr.5, align 1 + %conv5 = zext i8 %3 to i32 + br label %sw.bb6 + +sw.bb6: ; preds = %TARGET_1, %dispatch_op.sw.bb6_crit_edge + %p.addr.2 = phi i8* [ %incdec.ptr4, %TARGET_1 ], [ %p.addr.1, %dispatch_op.sw.bb6_crit_edge ] + %oparg.2 = phi i32 [ %conv5, %TARGET_1 ], [ %oparg.0, %dispatch_op.sw.bb6_crit_edge ] + %sum.3 = phi i32 [ %sum.7, %TARGET_1 ], [ %sum.1, %dispatch_op.sw.bb6_crit_edge ] + %mul = mul nsw i32 %oparg.2, 7 + %add7 = add nsw i32 %sum.3, %mul + %rem46 = and i32 %add7, 1 + %cmp8 = icmp eq i32 %rem46, 0 + br i1 %cmp8, label %dispatch_op, label %if.then, !prof !37, !irr_loop !38 + +if.then: ; preds = %sw.bb6 + %mul9 = mul nsw i32 %add7, 9 + br label %indirectgoto + +TARGET_2: ; preds = %indirectgoto + %incdec.ptr13 = getelementptr inbounds i8, i8* %add.ptr.pn, i64 2 + %4 = load i8, i8* %p.addr.5, align 1 + %conv14 = zext i8 %4 to i32 + br label %sw.bb15 + +sw.bb15: ; preds = %TARGET_2, %dispatch_op + %p.addr.3 = phi i8* [ %p.addr.1, %dispatch_op ], [ %incdec.ptr13, %TARGET_2 ] + %oparg.3 = phi i32 [ %oparg.0, %dispatch_op ], [ %conv14, %TARGET_2 ] + %sum.4 = phi i32 [ %sum.1, %dispatch_op ], [ %sum.7, %TARGET_2 ] + %add16 = add nsw i32 %oparg.3, 3 + %add17 = add nsw i32 %add16, %sum.4 + br i1 %tobool, label %if.then18, label %exit, !prof !39, !irr_loop !40 + +if.then18: ; preds = %sw.bb15 + %idx.ext = sext i32 %oparg.3 to i64 + %add.ptr = getelementptr inbounds i8, i8* %p.addr.3, i64 %idx.ext + %mul19 = mul nsw i32 %add17, 17 + br label %indirectgoto + +unknown_op: ; preds = %indirectgoto + %sub24 = add nsw i32 %sum.7, -4 + br label %sw.default + +sw.default: ; preds = %unknown_op, %dispatch_op + %p.addr.4 = phi i8* [ %p.addr.5, %unknown_op ], [ %p.addr.1, %dispatch_op ] + %sum.5 = phi i32 [ %sub24, %unknown_op ], [ %sum.1, %dispatch_op ] + %add25 = add nsw i32 %sum.5, 11 + br label %for.cond1 + +exit: ; preds = %sw.bb15, %sw.bb + %sum.6 = phi i32 [ %sub, %sw.bb ], [ %add17, %sw.bb15 ] + ret i32 %sum.6 + +indirectgoto: ; preds = %if.then18, %if.then + %add.ptr.pn = phi i8* [ %add.ptr, %if.then18 ], [ %p.addr.2, %if.then ] + %sum.7 = phi i32 [ %mul19, %if.then18 ], [ %mul9, %if.then ] + %p.addr.5 = getelementptr inbounds i8, i8* %add.ptr.pn, i64 1 + %5 = load i8, i8* %add.ptr.pn, align 1 + %idxprom21 = zext i8 %5 to i64 + %arrayidx22 = getelementptr inbounds [256 x i8*], [256 x i8*]* @targets, i64 0, i64 %idxprom21 + %6 = load i8*, i8** %arrayidx22, align 8 + indirectbr i8* %6, [label %unknown_op, label %sw.bb, label %TARGET_1, label %TARGET_2], !prof !41, !irr_loop !42 +} + +!36 = !{!"branch_weights", i32 0, i32 0, i32 201, i32 1} +!37 = !{!"branch_weights", i32 201, i32 300} +!38 = !{!"loop_header_weight", i64 501} +!39 = !{!"branch_weights", i32 100, i32 0} +!40 = !{!"loop_header_weight", i64 100} +!41 = !{!"branch_weights", i32 0, i32 1, i32 300, i32 99} +!42 = !{!"loop_header_weight", i64 400} + +; CHECK-LABEL: Printing analysis {{.*}} for function '_Z11irreduciblePh': +; CHECK-NEXT: block-frequency-info: _Z11irreduciblePh +; CHECK-NEXT: - entry: {{.*}} count = 1 +; CHECK-NEXT: - for.cond1: {{.*}} count = 1 +; CHECK-NEXT: - dispatch_op: {{.*}} count = 201 +; CHECK-NEXT: - dispatch_op.sw.bb6_crit_edge: {{.*}} count = 200 +; CHECK-NEXT: - sw.bb: {{.*}} count = 0 +; CHECK-NEXT: - TARGET_1: {{.*}} count = 299 +; CHECK-NEXT: - sw.bb6: {{.*}} count = 500, irr_loop_header_weight = 501 +; CHECK-NEXT: - if.then: {{.*}} count = 299 +; CHECK-NEXT: - TARGET_2: {{.*}} count = 98 +; CHECK-NEXT: - sw.bb15: {{.*}} count = 99, irr_loop_header_weight = 100 +; CHECK-NEXT: - if.then18: {{.*}} count = 99 +; CHECK-NEXT: - unknown_op: {{.*}} count = 0 +; CHECK-NEXT: - sw.default: {{.*}} count = 0 +; CHECK-NEXT: - exit: {{.*}} count = 1 +; CHECK-NEXT: - indirectgoto: {{.*}} count = 399, irr_loop_header_weight = 400 diff --git a/test/ThinLTO/X86/lazyload_metadata.ll b/test/ThinLTO/X86/lazyload_metadata.ll index a6d46e5586a..4680e462458 100644 --- a/test/ThinLTO/X86/lazyload_metadata.ll +++ b/test/ThinLTO/X86/lazyload_metadata.ll @@ -10,13 +10,13 @@ ; RUN: llvm-lto -thinlto-action=import %t2.bc -thinlto-index=%t3.bc \ ; RUN: -o /dev/null -stats \ ; RUN: 2>&1 | FileCheck %s -check-prefix=LAZY -; LAZY: 53 bitcode-reader - Number of Metadata records loaded +; LAZY: 55 bitcode-reader - Number of Metadata records loaded ; LAZY: 2 bitcode-reader - Number of MDStrings loaded ; RUN: llvm-lto -thinlto-action=import %t2.bc -thinlto-index=%t3.bc \ ; RUN: -o /dev/null -disable-ondemand-mds-loading -stats \ ; RUN: 2>&1 | FileCheck %s -check-prefix=NOTLAZY -; NOTLAZY: 62 bitcode-reader - Number of Metadata records loaded +; NOTLAZY: 64 bitcode-reader - Number of Metadata records loaded ; NOTLAZY: 7 bitcode-reader - Number of MDStrings loaded diff --git a/test/Transforms/PGOProfile/Inputs/irreducible.proftext b/test/Transforms/PGOProfile/Inputs/irreducible.proftext new file mode 100644 index 00000000000..9b0210d9a30 --- /dev/null +++ b/test/Transforms/PGOProfile/Inputs/irreducible.proftext @@ -0,0 +1,29 @@ +:ir +_Z11irreducibleii +# Func Hash: +64451410787 +# Num Counters: +6 +# Counter Values: +1000 +950 +100 +373 +1 +0 + +_Z11irreduciblePh +# Func Hash: +104649601521 +# Num Counters: +9 +# Counter Values: +100 +300 +99 +300 +201 +1 +1 +0 +0 diff --git a/test/Transforms/PGOProfile/irreducible.ll b/test/Transforms/PGOProfile/irreducible.ll new file mode 100644 index 00000000000..37f6e206ee9 --- /dev/null +++ b/test/Transforms/PGOProfile/irreducible.ll @@ -0,0 +1,184 @@ +; RUN: llvm-profdata merge %S/Inputs/irreducible.proftext -o %t.profdata +; RUN: opt < %s -pgo-instr-use -pgo-test-profile-file=%t.profdata -S | FileCheck %s --check-prefix=USE +; RUN: opt < %s -passes=pgo-instr-use -pgo-test-profile-file=%t.profdata -S | FileCheck %s --check-prefix=USE + +; GEN: $__llvm_profile_raw_version = comdat any + +; Function Attrs: noinline norecurse nounwind readnone uwtable +define i32 @_Z11irreducibleii(i32 %iter_outer, i32 %iter_inner) local_unnamed_addr #0 { +entry: + %cmp24 = icmp sgt i32 %iter_outer, 0 + br i1 %cmp24, label %for.body, label %entry.for.cond.cleanup_crit_edge + +entry.for.cond.cleanup_crit_edge: ; preds = %entry + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %entry.for.cond.cleanup_crit_edge, %for.end + %sum.0.lcssa = phi i32 [ 0, %entry.for.cond.cleanup_crit_edge ], [ %sum.1, %for.end ] + ret i32 %sum.0.lcssa + +for.body: ; preds = %entry, %for.end + %k.026 = phi i32 [ %inc12, %for.end ], [ 0, %entry ] + %sum.025 = phi i32 [ %sum.1, %for.end ], [ 0, %entry ] + %rem23 = and i32 %k.026, 1 + %cmp1 = icmp eq i32 %rem23, 0 + br i1 %cmp1, label %entry8, label %for.cond2 + +for.cond2: ; preds = %for.body, %if.end9 + %sum.1 = phi i32 [ %add10, %if.end9 ], [ %sum.025, %for.body ] + %i.0 = phi i32 [ %inc, %if.end9 ], [ 0, %for.body ] + %cmp3 = icmp slt i32 %i.0, %iter_inner + br i1 %cmp3, label %for.body4, label %for.end +; USE: br i1 %cmp3, label %for.body4, label %for.end, !prof !{{[0-9]+}}, +; USE-SAME: !irr_loop ![[FOR_COND2_IRR_LOOP:[0-9]+]] + +for.body4: ; preds = %for.cond2 + %rem5 = srem i32 %k.026, 3 + %cmp6 = icmp eq i32 %rem5, 0 + br i1 %cmp6, label %entry8, label %if.end9 + +entry8: ; preds = %for.body4, %for.body + %sum.2 = phi i32 [ %sum.025, %for.body ], [ %sum.1, %for.body4 ] + %i.1 = phi i32 [ 0, %for.body ], [ %i.0, %for.body4 ] + %add = add nsw i32 %sum.2, 4 + br label %if.end9 +; USE: br label %if.end9, +; USE-SAME: !irr_loop ![[ENTRY8_IRR_LOOP:[0-9]+]] + +if.end9: ; preds = %entry8, %for.body4 + %sum.3 = phi i32 [ %add, %entry8 ], [ %sum.1, %for.body4 ] + %i.2 = phi i32 [ %i.1, %entry8 ], [ %i.0, %for.body4 ] + %add10 = add nsw i32 %sum.3, 1 + %inc = add nsw i32 %i.2, 1 + br label %for.cond2 +; USE: br label %for.cond2, +; USE-SAME: !irr_loop ![[IF_END9_IRR_LOOP:[0-9]+]] + +for.end: ; preds = %for.cond2 + %inc12 = add nuw nsw i32 %k.026, 1 + %exitcond = icmp eq i32 %inc12, %iter_outer + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + + + +@targets = local_unnamed_addr global [256 x i8*] zeroinitializer, align 16 +@tracing = local_unnamed_addr global i32 0, align 4 + +; Function Attrs: noinline norecurse nounwind uwtable +define i32 @_Z11irreduciblePh(i8* nocapture readonly %p) { +entry: + store <2 x i8*> , <2 x i8*>* bitcast ([256 x i8*]* @targets to <2 x i8*>*), align 16 + store i8* blockaddress(@_Z11irreduciblePh, %TARGET_2), i8** getelementptr inbounds ([256 x i8*], [256 x i8*]* @targets, i64 0, i64 2), align 16 + %0 = load i32, i32* @tracing, align 4 + %tobool = icmp eq i32 %0, 0 + br label %for.cond1 + +for.cond1: ; preds = %sw.default, %entry + %p.addr.0 = phi i8* [ %p, %entry ], [ %p.addr.4, %sw.default ] + %sum.0 = phi i32 [ 0, %entry ], [ %add25, %sw.default ] + %incdec.ptr = getelementptr inbounds i8, i8* %p.addr.0, i64 1 + %1 = load i8, i8* %p.addr.0, align 1 + %incdec.ptr2 = getelementptr inbounds i8, i8* %p.addr.0, i64 2 + %2 = load i8, i8* %incdec.ptr, align 1 + %conv3 = zext i8 %2 to i32 + br label %dispatch_op + +dispatch_op: ; preds = %sw.bb6, %for.cond1 + %p.addr.1 = phi i8* [ %incdec.ptr2, %for.cond1 ], [ %p.addr.2, %sw.bb6 ] + %op.0 = phi i8 [ %1, %for.cond1 ], [ 1, %sw.bb6 ] + %oparg.0 = phi i32 [ %conv3, %for.cond1 ], [ %oparg.2, %sw.bb6 ] + %sum.1 = phi i32 [ %sum.0, %for.cond1 ], [ %add7, %sw.bb6 ] + switch i8 %op.0, label %sw.default [ + i8 0, label %sw.bb + i8 1, label %dispatch_op.sw.bb6_crit_edge + i8 2, label %sw.bb15 + ] + +dispatch_op.sw.bb6_crit_edge: ; preds = %dispatch_op + br label %sw.bb6 + +sw.bb: ; preds = %indirectgoto, %dispatch_op + %oparg.1 = phi i32 [ %oparg.0, %dispatch_op ], [ 0, %indirectgoto ] + %sum.2 = phi i32 [ %sum.1, %dispatch_op ], [ %sum.7, %indirectgoto ] + %add.neg = sub i32 -5, %oparg.1 + %sub = add i32 %add.neg, %sum.2 + br label %exit + +TARGET_1: ; preds = %indirectgoto + %incdec.ptr4 = getelementptr inbounds i8, i8* %add.ptr.pn, i64 2 + %3 = load i8, i8* %p.addr.5, align 1 + %conv5 = zext i8 %3 to i32 + br label %sw.bb6 + +sw.bb6: ; preds = %dispatch_op.sw.bb6_crit_edge, %TARGET_1 + %p.addr.2 = phi i8* [ %incdec.ptr4, %TARGET_1 ], [ %p.addr.1, %dispatch_op.sw.bb6_crit_edge ] + %oparg.2 = phi i32 [ %conv5, %TARGET_1 ], [ %oparg.0, %dispatch_op.sw.bb6_crit_edge ] + %sum.3 = phi i32 [ %sum.7, %TARGET_1 ], [ %sum.1, %dispatch_op.sw.bb6_crit_edge ] + %mul = mul nsw i32 %oparg.2, 7 + %add7 = add nsw i32 %sum.3, %mul + %rem46 = and i32 %add7, 1 + %cmp8 = icmp eq i32 %rem46, 0 + br i1 %cmp8, label %dispatch_op, label %if.then +; USE: br i1 %cmp8, label %dispatch_op, label %if.then, !prof !{{[0-9]+}}, +; USE-SAME: !irr_loop ![[SW_BB6_IRR_LOOP:[0-9]+]] + +if.then: ; preds = %sw.bb6 + %mul9 = mul nsw i32 %add7, 9 + br label %indirectgoto + +TARGET_2: ; preds = %indirectgoto + %incdec.ptr13 = getelementptr inbounds i8, i8* %add.ptr.pn, i64 2 + %4 = load i8, i8* %p.addr.5, align 1 + %conv14 = zext i8 %4 to i32 + br label %sw.bb15 + +sw.bb15: ; preds = %TARGET_2, %dispatch_op + %p.addr.3 = phi i8* [ %p.addr.1, %dispatch_op ], [ %incdec.ptr13, %TARGET_2 ] + %oparg.3 = phi i32 [ %oparg.0, %dispatch_op ], [ %conv14, %TARGET_2 ] + %sum.4 = phi i32 [ %sum.1, %dispatch_op ], [ %sum.7, %TARGET_2 ] + %add16 = add nsw i32 %oparg.3, 3 + %add17 = add nsw i32 %add16, %sum.4 + br i1 %tobool, label %if.then18, label %exit +; USE: br i1 %tobool, label %if.then18, label %exit, !prof !{{[0-9]+}}, +; USE-SAME: !irr_loop ![[SW_BB15_IRR_LOOP:[0-9]+]] + +if.then18: ; preds = %sw.bb15 + %idx.ext = sext i32 %oparg.3 to i64 + %add.ptr = getelementptr inbounds i8, i8* %p.addr.3, i64 %idx.ext + %mul19 = mul nsw i32 %add17, 17 + br label %indirectgoto + +unknown_op: ; preds = %indirectgoto + %sub24 = add nsw i32 %sum.7, -4 + br label %sw.default + +sw.default: ; preds = %unknown_op, %dispatch_op + %p.addr.4 = phi i8* [ %p.addr.5, %unknown_op ], [ %p.addr.1, %dispatch_op ] + %sum.5 = phi i32 [ %sub24, %unknown_op ], [ %sum.1, %dispatch_op ] + %add25 = add nsw i32 %sum.5, 11 + br label %for.cond1 + +exit: ; preds = %sw.bb15, %sw.bb + %sum.6 = phi i32 [ %sub, %sw.bb ], [ %add17, %sw.bb15 ] + ret i32 %sum.6 + +indirectgoto: ; preds = %if.then18, %if.then + %add.ptr.pn = phi i8* [ %add.ptr, %if.then18 ], [ %p.addr.2, %if.then ] + %sum.7 = phi i32 [ %mul19, %if.then18 ], [ %mul9, %if.then ] + %p.addr.5 = getelementptr inbounds i8, i8* %add.ptr.pn, i64 1 + %5 = load i8, i8* %add.ptr.pn, align 1 + %idxprom21 = zext i8 %5 to i64 + %arrayidx22 = getelementptr inbounds [256 x i8*], [256 x i8*]* @targets, i64 0, i64 %idxprom21 + %6 = load i8*, i8** %arrayidx22, align 8 + indirectbr i8* %6, [label %unknown_op, label %sw.bb, label %TARGET_1, label %TARGET_2] +; USE: indirectbr i8* %6, [label %unknown_op, label %sw.bb, label %TARGET_1, label %TARGET_2], !prof !{{[0-9]+}}, +; USE-SAME: !irr_loop ![[INDIRECTGOTO_IRR_LOOP:[0-9]+]] +} + +; USE: ![[FOR_COND2_IRR_LOOP]] = !{!"loop_header_weight", i64 1050} +; USE: ![[ENTRY8_IRR_LOOP]] = !{!"loop_header_weight", i64 373} +; USE: ![[IF_END9_IRR_LOOP]] = !{!"loop_header_weight", i64 1000} +; USE: ![[SW_BB6_IRR_LOOP]] = !{!"loop_header_weight", i64 501} +; USE: ![[SW_BB15_IRR_LOOP]] = !{!"loop_header_weight", i64 100} +; USE: ![[INDIRECTGOTO_IRR_LOOP]] = !{!"loop_header_weight", i64 400}