!1 = !{!1} ; an identifier for the inner loop
!2 = !{!2} ; an identifier for the outer loop
+'``irr_loop``' Metadata
+^^^^^^^^^^^^^^^^^^^^^^^
+
+``irr_loop`` metadata may be attached to the terminator instruction of a basic
+block that's an irreducible loop header (note that an irreducible loop has more
+than once header basic blocks.) If ``irr_loop`` metadata is attached to the
+terminator instruction of a basic block that is not really an irreducible loop
+header, the behavior is undefined. The intent of this metadata is to improve the
+accuracy of the block frequency propagation. For example, in the code below, the
+block ``header0`` may have a loop header weight (relative to the other headers of
+the irreducible loop) of 100:
+
+.. code-block:: llvm
+
+ header0:
+ ...
+ br i1 %cmp, label %t1, label %t2, !irr_loop !0
+
+ ...
+ !0 = !{"loop_header_weight", i64 100}
+
+Irreducible loop header weights are typically based on profile data.
+
'``invariant.group``' Metadata
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
/// the enclosing function's count (if available) and returns the value.
Optional<uint64_t> getProfileCountFromFreq(uint64_t Freq) const;
+ /// \brief Returns true if \p BB is an irreducible loop header
+ /// block. Otherwise false.
+ bool isIrrLoopHeader(const BasicBlock *BB);
+
// Set the frequency of the given basic block.
void setBlockFreq(const BasicBlock *BB, uint64_t Freq);
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/SparseBitVector.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/IR/BasicBlock.h"
/// \brief Data about each block. This is used downstream.
std::vector<FrequencyData> Freqs;
+ /// \brief Whether each block is an irreducible loop header.
+ /// This is used downstream.
+ SparseBitVector<> IsIrrLoopHeader;
+
/// \brief Loop data: see initializeLoops().
std::vector<WorkingData> Working;
/// the backedges going into each of the loop headers.
void adjustLoopHeaderMass(LoopData &Loop);
+ void distributeIrrLoopHeaderMass(Distribution &Dist);
+
/// \brief Package up a loop.
void packageLoop(LoopData &Loop);
const BlockNode &Node) const;
Optional<uint64_t> getProfileCountFromFreq(const Function &F,
uint64_t Freq) const;
+ bool isIrrLoopHeader(const BlockNode &Node);
void setBlockFreq(const BlockNode &Node, uint64_t Freq);
return BlockFrequencyInfoImplBase::getProfileCountFromFreq(F, Freq);
}
+ bool isIrrLoopHeader(const BlockT *BB) {
+ return BlockFrequencyInfoImplBase::isIrrLoopHeader(getNode(BB));
+ }
+
void setBlockFreq(const BlockT *BB, uint64_t Freq);
Scaled64 getFloatingBlockFreq(const BlockT *BB) const {
DEBUG(dbgs() << "compute-mass-in-loop: " << getLoopName(Loop) << "\n");
if (Loop.isIrreducible()) {
- BlockMass Remaining = BlockMass::getFull();
+ DEBUG(dbgs() << "isIrreducible = true\n");
+ Distribution Dist;
+ unsigned NumHeadersWithWeight = 0;
for (uint32_t H = 0; H < Loop.NumHeaders; ++H) {
- auto &Mass = Working[Loop.Nodes[H].Index].getMass();
- Mass = Remaining * BranchProbability(1, Loop.NumHeaders - H);
- Remaining -= Mass;
+ auto &HeaderNode = Loop.Nodes[H];
+ const BlockT *Block = getBlock(HeaderNode);
+ IsIrrLoopHeader.set(Loop.Nodes[H].Index);
+ Optional<uint64_t> HeaderWeight = Block->getIrrLoopHeaderWeight();
+ if (!HeaderWeight)
+ continue;
+ DEBUG(dbgs() << getBlockName(HeaderNode)
+ << " has irr loop header weight " << HeaderWeight.getValue()
+ << "\n");
+ NumHeadersWithWeight++;
+ uint64_t HeaderWeightValue = HeaderWeight.getValue();
+ if (HeaderWeightValue)
+ Dist.addLocal(HeaderNode, HeaderWeightValue);
}
+ if (NumHeadersWithWeight != Loop.NumHeaders) {
+ // Not all headers have a weight metadata. Distribute weight evenly.
+ Dist = Distribution();
+ for (uint32_t H = 0; H < Loop.NumHeaders; ++H) {
+ auto &HeaderNode = Loop.Nodes[H];
+ Dist.addLocal(HeaderNode, 1);
+ }
+ }
+ distributeIrrLoopHeaderMass(Dist);
for (const BlockNode &M : Loop.Nodes)
if (!propagateMassToSuccessors(&Loop, M))
llvm_unreachable("unhandled irreducible control flow");
-
- adjustLoopHeaderMass(Loop);
+ if (NumHeadersWithWeight != Loop.NumHeaders)
+ // Not all headers have a weight metadata. Adjust header mass.
+ adjustLoopHeaderMass(Loop);
} else {
Working[Loop.getHeader().Index].getMass() = BlockMass::getFull();
if (!propagateMassToSuccessors(&Loop, Loop.getHeader()))
BlockFrequencyInfoImplBase::getBlockProfileCount(
*F->getFunction(), getNode(&BB)))
OS << ", count = " << ProfileCount.getValue();
+ if (Optional<uint64_t> IrrLoopHeaderWeight =
+ BB.getIrrLoopHeaderWeight())
+ OS << ", irr_loop_header_weight = " << IrrLoopHeaderWeight.getValue();
OS << "\n";
}
using const_probability_iterator =
std::vector<BranchProbability>::const_iterator;
+ Optional<uint64_t> IrrLoopHeaderWeight;
+
/// Keep track of the physical registers that are livein of the basicblock.
using LiveInVector = std::vector<RegisterMaskPair>;
LiveInVector LiveIns;
/// Return the MCSymbol for this basic block.
MCSymbol *getSymbol() const;
+ Optional<uint64_t> getIrrLoopHeaderWeight() const {
+ return IrrLoopHeaderWeight;
+ }
+
+ void setIrrLoopHeaderWeight(uint64_t Weight) {
+ IrrLoopHeaderWeight = Weight;
+ }
+
private:
/// Return probability iterator corresponding to the I successor iterator.
probability_iterator getProbabilityIterator(succ_iterator I);
Optional<uint64_t> getBlockProfileCount(const MachineBasicBlock *MBB) const;
Optional<uint64_t> getProfileCountFromFreq(uint64_t Freq) const;
+ bool isIrrLoopHeader(const MachineBasicBlock *MBB);
+
const MachineFunction *getFunction() const;
const MachineBranchProbabilityInfo *getMBPI() const;
void view(const Twine &Name, bool isSimple = true) const;
/// \brief Return true if it is legal to hoist instructions into this block.
bool isLegalToHoistInto() const;
+ Optional<uint64_t> getIrrLoopHeaderWeight() const;
+
private:
/// \brief Increment the internal refcount of the number of BlockAddresses
/// referencing this BasicBlock by \p Amt.
MD_absolute_symbol = 21, // "absolute_symbol"
MD_associated = 22, // "associated"
MD_callees = 23, // "callees"
+ MD_irr_loop = 24, // "irr_loop"
};
/// Known operand bundle tag IDs, which always have the same value. All
/// base type, access type and offset relative to the base type.
MDNode *createTBAAStructTagNode(MDNode *BaseType, MDNode *AccessType,
uint64_t Offset, bool IsConstant = false);
+
+ /// \brief Return metadata containing an irreducible loop header weight.
+ MDNode *createIrrLoopHeaderWeight(uint64_t Weight);
};
} // end namespace llvm
void setProfMetadata(Module *M, Instruction *TI, ArrayRef<uint64_t> EdgeCounts,
uint64_t MaxCount);
+void setIrrLoopHeaderMetadata(Module *M, Instruction *TI, uint64_t Count);
+
} // end namespace llvm
#endif // LLVM_TRANSFORMS_PGOINSTRUMENTATION_H
return BFI->getProfileCountFromFreq(*getFunction(), Freq);
}
+bool BlockFrequencyInfo::isIrrLoopHeader(const BasicBlock *BB) {
+ assert(BFI && "Expected analysis to be available");
+ return BFI->isIrrLoopHeader(BB);
+}
+
void BlockFrequencyInfo::setBlockFreq(const BasicBlock *BB, uint64_t Freq) {
assert(BFI && "Expected analysis to be available");
BFI->setBlockFreq(BB, Freq);
// Swap with a default-constructed std::vector, since std::vector<>::clear()
// does not actually clear heap storage.
std::vector<FrequencyData>().swap(Freqs);
+ IsIrrLoopHeader.clear();
std::vector<WorkingData>().swap(Working);
Loops.clear();
}
/// Releases all memory not used downstream. In particular, saves Freqs.
static void cleanup(BlockFrequencyInfoImplBase &BFI) {
std::vector<FrequencyData> SavedFreqs(std::move(BFI.Freqs));
+ SparseBitVector<> SavedIsIrrLoopHeader(std::move(BFI.IsIrrLoopHeader));
BFI.clear();
BFI.Freqs = std::move(SavedFreqs);
+ BFI.IsIrrLoopHeader = std::move(SavedIsIrrLoopHeader);
}
bool BlockFrequencyInfoImplBase::addToDist(Distribution &Dist,
return BlockCount.getLimitedValue();
}
+bool
+BlockFrequencyInfoImplBase::isIrrLoopHeader(const BlockNode &Node) {
+ if (!Node.isValid())
+ return false;
+ return IsIrrLoopHeader.test(Node.Index);
+}
+
Scaled64
BlockFrequencyInfoImplBase::getFloatingBlockFreq(const BlockNode &Node) const {
if (!Node.isValid())
DEBUG(debugAssign(*this, D, W.TargetNode, Taken, nullptr));
}
}
+
+void BlockFrequencyInfoImplBase::distributeIrrLoopHeaderMass(Distribution &Dist) {
+ BlockMass LoopMass = BlockMass::getFull();
+ DitheringDistributer D(Dist, LoopMass);
+ for (const Weight &W : Dist.Weights) {
+ BlockMass Taken = D.takeMass(W.Amount);
+ assert(W.Type == Weight::Local && "all weights should be local");
+ Working[W.TargetNode.Index].getMass() = Taken;
+ DEBUG(debugAssign(*this, D, W.TargetNode, Taken, nullptr));
+ }
+}
MachineBasicBlock::MachineBasicBlock(MachineFunction &MF, const BasicBlock *B)
: BB(B), Number(-1), xParent(&MF) {
Insts.Parent = this;
+ if (B)
+ IrrLoopHeaderWeight = B->getIrrLoopHeaderWeight();
}
MachineBasicBlock::~MachineBasicBlock() {
}
OS << '\n';
}
+ if (IrrLoopHeaderWeight) {
+ if (Indexes) OS << '\t';
+ OS << " Irreducible loop header weight: "
+ << IrrLoopHeaderWeight.getValue();
+ OS << '\n';
+ }
}
void MachineBasicBlock::printAsOperand(raw_ostream &OS,
return MBFI ? MBFI->getProfileCountFromFreq(*F, Freq) : None;
}
+bool
+MachineBlockFrequencyInfo::isIrrLoopHeader(const MachineBasicBlock *MBB) {
+ assert(MBFI && "Expected analysis to be available");
+ return MBFI->isIrrLoopHeader(MBB);
+}
+
const MachineFunction *MachineBlockFrequencyInfo::getFunction() const {
return MBFI ? MBFI->getFunction() : nullptr;
}
const LandingPadInst *BasicBlock::getLandingPadInst() const {
return dyn_cast<LandingPadInst>(getFirstNonPHI());
}
+
+Optional<uint64_t> BasicBlock::getIrrLoopHeaderWeight() const {
+ const TerminatorInst *TI = getTerminator();
+ if (MDNode *MDIrrLoopHeader =
+ TI->getMetadata(LLVMContext::MD_irr_loop)) {
+ MDString *MDName = cast<MDString>(MDIrrLoopHeader->getOperand(0));
+ if (MDName->getString().equals("loop_header_weight")) {
+ auto *CI = mdconst::extract<ConstantInt>(MDIrrLoopHeader->getOperand(1));
+ return Optional<uint64_t>(CI->getValue().getZExtValue());
+ }
+ }
+ return Optional<uint64_t>();
+}
{MD_absolute_symbol, "absolute_symbol"},
{MD_associated, "associated"},
{MD_callees, "callees"},
+ {MD_irr_loop, "irr_loop"},
};
for (auto &MDKind : MDKinds) {
}
return MDNode::get(Context, {BaseType, AccessType, createConstant(Off)});
}
+
+MDNode *MDBuilder::createIrrLoopHeaderWeight(uint64_t Weight) {
+ SmallVector<Metadata *, 2> Vals(2);
+ Vals[0] = createString("loop_header_weight");
+ Vals[1] = createConstant(ConstantInt::get(Type::getInt64Ty(Context), Weight));
+ return MDNode::get(Context, Vals);
+}
PGOUseFunc(Function &Func, Module *Modu,
std::unordered_multimap<Comdat *, GlobalValue *> &ComdatMembers,
BranchProbabilityInfo *BPI = nullptr,
- BlockFrequencyInfo *BFI = nullptr)
- : F(Func), M(Modu), FuncInfo(Func, ComdatMembers, false, BPI, BFI),
+ BlockFrequencyInfo *BFIin = nullptr)
+ : F(Func), M(Modu), BFI(BFIin),
+ FuncInfo(Func, ComdatMembers, false, BPI, BFIin),
FreqAttr(FFA_Normal) {}
// Read counts for the instrumented BB from profile.
// Annotate the value profile call sites for one value kind.
void annotateValueSites(uint32_t Kind);
+ // Annotate the irreducible loop header weights.
+ void annotateIrrLoopHeaderWeights();
+
// The hotness of the function from the profile count.
enum FuncFreqAttr { FFA_Normal, FFA_Cold, FFA_Hot };
private:
Function &F;
Module *M;
+ BlockFrequencyInfo *BFI;
// This member stores the shared information with class PGOGenFunc.
FuncPGOInstrumentation<PGOUseEdge, UseBBInfo> FuncInfo;
}
}
+void PGOUseFunc::annotateIrrLoopHeaderWeights() {
+ DEBUG(dbgs() << "\nAnnotating irreducible loop header weights.\n");
+ // Find irr loop headers
+ for (auto &BB : F) {
+ if (BFI->isIrrLoopHeader(&BB)) {
+ TerminatorInst *TI = BB.getTerminator();
+ const UseBBInfo &BBCountInfo = getBBInfo(&BB);
+ setIrrLoopHeaderMetadata(M, TI, BBCountInfo.CountValue);
+ }
+ }
+}
+
void SelectInstVisitor::instrumentOneSelectInst(SelectInst &SI) {
Module *M = F.getParent();
IRBuilder<> Builder(&SI);
Func.populateCounters();
Func.setBranchWeights();
Func.annotateValueSites();
+ Func.annotateIrrLoopHeaderWeights();
PGOUseFunc::FuncFreqAttr FreqAttr = Func.getFuncFreqAttr();
if (FreqAttr == PGOUseFunc::FFA_Cold)
ColdFunctions.push_back(&F);
namespace llvm {
+void setIrrLoopHeaderMetadata(Module *M, Instruction *TI, uint64_t Count) {
+ MDBuilder MDB(M->getContext());
+ TI->setMetadata(llvm::LLVMContext::MD_irr_loop,
+ MDB.createIrrLoopHeaderWeight(Count));
+}
+
template <> struct GraphTraits<PGOUseFunc *> {
using NodeRef = const BasicBlock *;
using ChildIteratorType = succ_const_iterator;
--- /dev/null
+; RUN: opt < %s -analyze -block-freq | FileCheck %s
+; RUN: opt < %s -passes='print<block-freq>' -disable-output 2>&1 | FileCheck %s
+
+; Function Attrs: noinline norecurse nounwind readnone uwtable
+define i32 @_Z11irreducibleii(i32 %iter_outer, i32 %iter_inner) local_unnamed_addr !prof !27 {
+entry:
+ %cmp24 = icmp sgt i32 %iter_outer, 0
+ br i1 %cmp24, label %for.body, label %entry.for.cond.cleanup_crit_edge, !prof !28
+
+entry.for.cond.cleanup_crit_edge: ; preds = %entry
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.end, %entry.for.cond.cleanup_crit_edge
+ %sum.0.lcssa = phi i32 [ 0, %entry.for.cond.cleanup_crit_edge ], [ %sum.1, %for.end ]
+ ret i32 %sum.0.lcssa
+
+for.body: ; preds = %for.end, %entry
+ %k.026 = phi i32 [ %inc12, %for.end ], [ 0, %entry ]
+ %sum.025 = phi i32 [ %sum.1, %for.end ], [ 0, %entry ]
+ %rem23 = and i32 %k.026, 1
+ %cmp1 = icmp eq i32 %rem23, 0
+ br i1 %cmp1, label %entry8, label %for.cond2, !prof !29
+
+for.cond2: ; preds = %if.end9, %for.body
+ %sum.1 = phi i32 [ %add10, %if.end9 ], [ %sum.025, %for.body ]
+ %i.0 = phi i32 [ %inc, %if.end9 ], [ 0, %for.body ]
+ %cmp3 = icmp slt i32 %i.0, %iter_inner
+ br i1 %cmp3, label %for.body4, label %for.end, !prof !30, !irr_loop !31
+
+for.body4: ; preds = %for.cond2
+ %rem5 = srem i32 %k.026, 3
+ %cmp6 = icmp eq i32 %rem5, 0
+ br i1 %cmp6, label %entry8, label %if.end9, !prof !32
+
+entry8: ; preds = %for.body4, %for.body
+ %sum.2 = phi i32 [ %sum.025, %for.body ], [ %sum.1, %for.body4 ]
+ %i.1 = phi i32 [ 0, %for.body ], [ %i.0, %for.body4 ]
+ %add = add nsw i32 %sum.2, 4
+ br label %if.end9, !irr_loop !33
+
+if.end9: ; preds = %entry8, %for.body4
+ %sum.3 = phi i32 [ %add, %entry8 ], [ %sum.1, %for.body4 ]
+ %i.2 = phi i32 [ %i.1, %entry8 ], [ %i.0, %for.body4 ]
+ %add10 = add nsw i32 %sum.3, 1
+ %inc = add nsw i32 %i.2, 1
+ br label %for.cond2, !irr_loop !34
+
+for.end: ; preds = %for.cond2
+ %inc12 = add nuw nsw i32 %k.026, 1
+ %exitcond = icmp eq i32 %inc12, %iter_outer
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body, !prof !35
+}
+
+!27 = !{!"function_entry_count", i64 1}
+!28 = !{!"branch_weights", i32 1, i32 0}
+!29 = !{!"branch_weights", i32 50, i32 50}
+!30 = !{!"branch_weights", i32 950, i32 100}
+!31 = !{!"loop_header_weight", i64 1050}
+!32 = !{!"branch_weights", i32 323, i32 627}
+!33 = !{!"loop_header_weight", i64 373}
+!34 = !{!"loop_header_weight", i64 1000}
+!35 = !{!"branch_weights", i32 1, i32 99}
+
+; CHECK-LABEL: Printing analysis {{.*}} for function '_Z11irreducibleii':
+; CHECK-NEXT: block-frequency-info: _Z11irreducibleii
+; CHECK-NEXT: - entry: {{.*}} count = 1
+; CHECK-NEXT: - entry.for.cond.cleanup_crit_edge: {{.*}} count = 0
+; CHECK-NEXT: - for.cond.cleanup: {{.*}} count = 1
+; CHECK-NEXT: - for.body: {{.*}} count = 100
+; CHECK-NEXT: - for.cond2: {{.*}} count = 1050, irr_loop_header_weight = 1050
+; CHECK-NEXT: - for.body4: {{.*}} count = 950
+; CHECK-NEXT: - entry8: {{.*}} count = 373, irr_loop_header_weight = 373
+; CHECK-NEXT: - if.end9: {{.*}} count = 1000, irr_loop_header_weight = 1000
+; CHECK-NEXT: - for.end: {{.*}} count = 100
+
+@targets = local_unnamed_addr global [256 x i8*] zeroinitializer, align 16
+@tracing = local_unnamed_addr global i32 0, align 4
+
+; Function Attrs: noinline norecurse nounwind uwtable
+define i32 @_Z11irreduciblePh(i8* nocapture readonly %p) !prof !27 {
+entry:
+ store <2 x i8*> <i8* blockaddress(@_Z11irreduciblePh, %sw.bb), i8* blockaddress(@_Z11irreduciblePh, %TARGET_1)>, <2 x i8*>* bitcast ([256 x i8*]* @targets to <2 x i8*>*), align 16
+ store i8* blockaddress(@_Z11irreduciblePh, %TARGET_2), i8** getelementptr inbounds ([256 x i8*], [256 x i8*]* @targets, i64 0, i64 2), align 16
+ %0 = load i32, i32* @tracing, align 4
+ %tobool = icmp eq i32 %0, 0
+ br label %for.cond1
+
+for.cond1: ; preds = %sw.default, %entry
+ %p.addr.0 = phi i8* [ %p, %entry ], [ %p.addr.4, %sw.default ]
+ %sum.0 = phi i32 [ 0, %entry ], [ %add25, %sw.default ]
+ %incdec.ptr = getelementptr inbounds i8, i8* %p.addr.0, i64 1
+ %1 = load i8, i8* %p.addr.0, align 1
+ %incdec.ptr2 = getelementptr inbounds i8, i8* %p.addr.0, i64 2
+ %2 = load i8, i8* %incdec.ptr, align 1
+ %conv3 = zext i8 %2 to i32
+ br label %dispatch_op
+
+dispatch_op: ; preds = %sw.bb6, %for.cond1
+ %p.addr.1 = phi i8* [ %incdec.ptr2, %for.cond1 ], [ %p.addr.2, %sw.bb6 ]
+ %op.0 = phi i8 [ %1, %for.cond1 ], [ 1, %sw.bb6 ]
+ %oparg.0 = phi i32 [ %conv3, %for.cond1 ], [ %oparg.2, %sw.bb6 ]
+ %sum.1 = phi i32 [ %sum.0, %for.cond1 ], [ %add7, %sw.bb6 ]
+ switch i8 %op.0, label %sw.default [
+ i8 0, label %sw.bb
+ i8 1, label %dispatch_op.sw.bb6_crit_edge
+ i8 2, label %sw.bb15
+ ], !prof !36
+
+dispatch_op.sw.bb6_crit_edge: ; preds = %dispatch_op
+ br label %sw.bb6
+
+sw.bb: ; preds = %indirectgoto, %dispatch_op
+ %oparg.1 = phi i32 [ %oparg.0, %dispatch_op ], [ 0, %indirectgoto ]
+ %sum.2 = phi i32 [ %sum.1, %dispatch_op ], [ %sum.7, %indirectgoto ]
+ %add.neg = sub i32 -5, %oparg.1
+ %sub = add i32 %add.neg, %sum.2
+ br label %exit
+
+TARGET_1: ; preds = %indirectgoto
+ %incdec.ptr4 = getelementptr inbounds i8, i8* %add.ptr.pn, i64 2
+ %3 = load i8, i8* %p.addr.5, align 1
+ %conv5 = zext i8 %3 to i32
+ br label %sw.bb6
+
+sw.bb6: ; preds = %TARGET_1, %dispatch_op.sw.bb6_crit_edge
+ %p.addr.2 = phi i8* [ %incdec.ptr4, %TARGET_1 ], [ %p.addr.1, %dispatch_op.sw.bb6_crit_edge ]
+ %oparg.2 = phi i32 [ %conv5, %TARGET_1 ], [ %oparg.0, %dispatch_op.sw.bb6_crit_edge ]
+ %sum.3 = phi i32 [ %sum.7, %TARGET_1 ], [ %sum.1, %dispatch_op.sw.bb6_crit_edge ]
+ %mul = mul nsw i32 %oparg.2, 7
+ %add7 = add nsw i32 %sum.3, %mul
+ %rem46 = and i32 %add7, 1
+ %cmp8 = icmp eq i32 %rem46, 0
+ br i1 %cmp8, label %dispatch_op, label %if.then, !prof !37, !irr_loop !38
+
+if.then: ; preds = %sw.bb6
+ %mul9 = mul nsw i32 %add7, 9
+ br label %indirectgoto
+
+TARGET_2: ; preds = %indirectgoto
+ %incdec.ptr13 = getelementptr inbounds i8, i8* %add.ptr.pn, i64 2
+ %4 = load i8, i8* %p.addr.5, align 1
+ %conv14 = zext i8 %4 to i32
+ br label %sw.bb15
+
+sw.bb15: ; preds = %TARGET_2, %dispatch_op
+ %p.addr.3 = phi i8* [ %p.addr.1, %dispatch_op ], [ %incdec.ptr13, %TARGET_2 ]
+ %oparg.3 = phi i32 [ %oparg.0, %dispatch_op ], [ %conv14, %TARGET_2 ]
+ %sum.4 = phi i32 [ %sum.1, %dispatch_op ], [ %sum.7, %TARGET_2 ]
+ %add16 = add nsw i32 %oparg.3, 3
+ %add17 = add nsw i32 %add16, %sum.4
+ br i1 %tobool, label %if.then18, label %exit, !prof !39, !irr_loop !40
+
+if.then18: ; preds = %sw.bb15
+ %idx.ext = sext i32 %oparg.3 to i64
+ %add.ptr = getelementptr inbounds i8, i8* %p.addr.3, i64 %idx.ext
+ %mul19 = mul nsw i32 %add17, 17
+ br label %indirectgoto
+
+unknown_op: ; preds = %indirectgoto
+ %sub24 = add nsw i32 %sum.7, -4
+ br label %sw.default
+
+sw.default: ; preds = %unknown_op, %dispatch_op
+ %p.addr.4 = phi i8* [ %p.addr.5, %unknown_op ], [ %p.addr.1, %dispatch_op ]
+ %sum.5 = phi i32 [ %sub24, %unknown_op ], [ %sum.1, %dispatch_op ]
+ %add25 = add nsw i32 %sum.5, 11
+ br label %for.cond1
+
+exit: ; preds = %sw.bb15, %sw.bb
+ %sum.6 = phi i32 [ %sub, %sw.bb ], [ %add17, %sw.bb15 ]
+ ret i32 %sum.6
+
+indirectgoto: ; preds = %if.then18, %if.then
+ %add.ptr.pn = phi i8* [ %add.ptr, %if.then18 ], [ %p.addr.2, %if.then ]
+ %sum.7 = phi i32 [ %mul19, %if.then18 ], [ %mul9, %if.then ]
+ %p.addr.5 = getelementptr inbounds i8, i8* %add.ptr.pn, i64 1
+ %5 = load i8, i8* %add.ptr.pn, align 1
+ %idxprom21 = zext i8 %5 to i64
+ %arrayidx22 = getelementptr inbounds [256 x i8*], [256 x i8*]* @targets, i64 0, i64 %idxprom21
+ %6 = load i8*, i8** %arrayidx22, align 8
+ indirectbr i8* %6, [label %unknown_op, label %sw.bb, label %TARGET_1, label %TARGET_2], !prof !41, !irr_loop !42
+}
+
+!36 = !{!"branch_weights", i32 0, i32 0, i32 201, i32 1}
+!37 = !{!"branch_weights", i32 201, i32 300}
+!38 = !{!"loop_header_weight", i64 501}
+!39 = !{!"branch_weights", i32 100, i32 0}
+!40 = !{!"loop_header_weight", i64 100}
+!41 = !{!"branch_weights", i32 0, i32 1, i32 300, i32 99}
+!42 = !{!"loop_header_weight", i64 400}
+
+; CHECK-LABEL: Printing analysis {{.*}} for function '_Z11irreduciblePh':
+; CHECK-NEXT: block-frequency-info: _Z11irreduciblePh
+; CHECK-NEXT: - entry: {{.*}} count = 1
+; CHECK-NEXT: - for.cond1: {{.*}} count = 1
+; CHECK-NEXT: - dispatch_op: {{.*}} count = 201
+; CHECK-NEXT: - dispatch_op.sw.bb6_crit_edge: {{.*}} count = 200
+; CHECK-NEXT: - sw.bb: {{.*}} count = 0
+; CHECK-NEXT: - TARGET_1: {{.*}} count = 299
+; CHECK-NEXT: - sw.bb6: {{.*}} count = 500, irr_loop_header_weight = 501
+; CHECK-NEXT: - if.then: {{.*}} count = 299
+; CHECK-NEXT: - TARGET_2: {{.*}} count = 98
+; CHECK-NEXT: - sw.bb15: {{.*}} count = 99, irr_loop_header_weight = 100
+; CHECK-NEXT: - if.then18: {{.*}} count = 99
+; CHECK-NEXT: - unknown_op: {{.*}} count = 0
+; CHECK-NEXT: - sw.default: {{.*}} count = 0
+; CHECK-NEXT: - exit: {{.*}} count = 1
+; CHECK-NEXT: - indirectgoto: {{.*}} count = 399, irr_loop_header_weight = 400
; RUN: llvm-lto -thinlto-action=import %t2.bc -thinlto-index=%t3.bc \
; RUN: -o /dev/null -stats \
; RUN: 2>&1 | FileCheck %s -check-prefix=LAZY
-; LAZY: 53 bitcode-reader - Number of Metadata records loaded
+; LAZY: 55 bitcode-reader - Number of Metadata records loaded
; LAZY: 2 bitcode-reader - Number of MDStrings loaded
; RUN: llvm-lto -thinlto-action=import %t2.bc -thinlto-index=%t3.bc \
; RUN: -o /dev/null -disable-ondemand-mds-loading -stats \
; RUN: 2>&1 | FileCheck %s -check-prefix=NOTLAZY
-; NOTLAZY: 62 bitcode-reader - Number of Metadata records loaded
+; NOTLAZY: 64 bitcode-reader - Number of Metadata records loaded
; NOTLAZY: 7 bitcode-reader - Number of MDStrings loaded
--- /dev/null
+:ir
+_Z11irreducibleii
+# Func Hash:
+64451410787
+# Num Counters:
+6
+# Counter Values:
+1000
+950
+100
+373
+1
+0
+
+_Z11irreduciblePh
+# Func Hash:
+104649601521
+# Num Counters:
+9
+# Counter Values:
+100
+300
+99
+300
+201
+1
+1
+0
+0
--- /dev/null
+; RUN: llvm-profdata merge %S/Inputs/irreducible.proftext -o %t.profdata
+; RUN: opt < %s -pgo-instr-use -pgo-test-profile-file=%t.profdata -S | FileCheck %s --check-prefix=USE
+; RUN: opt < %s -passes=pgo-instr-use -pgo-test-profile-file=%t.profdata -S | FileCheck %s --check-prefix=USE
+
+; GEN: $__llvm_profile_raw_version = comdat any
+
+; Function Attrs: noinline norecurse nounwind readnone uwtable
+define i32 @_Z11irreducibleii(i32 %iter_outer, i32 %iter_inner) local_unnamed_addr #0 {
+entry:
+ %cmp24 = icmp sgt i32 %iter_outer, 0
+ br i1 %cmp24, label %for.body, label %entry.for.cond.cleanup_crit_edge
+
+entry.for.cond.cleanup_crit_edge: ; preds = %entry
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %entry.for.cond.cleanup_crit_edge, %for.end
+ %sum.0.lcssa = phi i32 [ 0, %entry.for.cond.cleanup_crit_edge ], [ %sum.1, %for.end ]
+ ret i32 %sum.0.lcssa
+
+for.body: ; preds = %entry, %for.end
+ %k.026 = phi i32 [ %inc12, %for.end ], [ 0, %entry ]
+ %sum.025 = phi i32 [ %sum.1, %for.end ], [ 0, %entry ]
+ %rem23 = and i32 %k.026, 1
+ %cmp1 = icmp eq i32 %rem23, 0
+ br i1 %cmp1, label %entry8, label %for.cond2
+
+for.cond2: ; preds = %for.body, %if.end9
+ %sum.1 = phi i32 [ %add10, %if.end9 ], [ %sum.025, %for.body ]
+ %i.0 = phi i32 [ %inc, %if.end9 ], [ 0, %for.body ]
+ %cmp3 = icmp slt i32 %i.0, %iter_inner
+ br i1 %cmp3, label %for.body4, label %for.end
+; USE: br i1 %cmp3, label %for.body4, label %for.end, !prof !{{[0-9]+}},
+; USE-SAME: !irr_loop ![[FOR_COND2_IRR_LOOP:[0-9]+]]
+
+for.body4: ; preds = %for.cond2
+ %rem5 = srem i32 %k.026, 3
+ %cmp6 = icmp eq i32 %rem5, 0
+ br i1 %cmp6, label %entry8, label %if.end9
+
+entry8: ; preds = %for.body4, %for.body
+ %sum.2 = phi i32 [ %sum.025, %for.body ], [ %sum.1, %for.body4 ]
+ %i.1 = phi i32 [ 0, %for.body ], [ %i.0, %for.body4 ]
+ %add = add nsw i32 %sum.2, 4
+ br label %if.end9
+; USE: br label %if.end9,
+; USE-SAME: !irr_loop ![[ENTRY8_IRR_LOOP:[0-9]+]]
+
+if.end9: ; preds = %entry8, %for.body4
+ %sum.3 = phi i32 [ %add, %entry8 ], [ %sum.1, %for.body4 ]
+ %i.2 = phi i32 [ %i.1, %entry8 ], [ %i.0, %for.body4 ]
+ %add10 = add nsw i32 %sum.3, 1
+ %inc = add nsw i32 %i.2, 1
+ br label %for.cond2
+; USE: br label %for.cond2,
+; USE-SAME: !irr_loop ![[IF_END9_IRR_LOOP:[0-9]+]]
+
+for.end: ; preds = %for.cond2
+ %inc12 = add nuw nsw i32 %k.026, 1
+ %exitcond = icmp eq i32 %inc12, %iter_outer
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+
+
+@targets = local_unnamed_addr global [256 x i8*] zeroinitializer, align 16
+@tracing = local_unnamed_addr global i32 0, align 4
+
+; Function Attrs: noinline norecurse nounwind uwtable
+define i32 @_Z11irreduciblePh(i8* nocapture readonly %p) {
+entry:
+ store <2 x i8*> <i8* blockaddress(@_Z11irreduciblePh, %sw.bb), i8* blockaddress(@_Z11irreduciblePh, %TARGET_1)>, <2 x i8*>* bitcast ([256 x i8*]* @targets to <2 x i8*>*), align 16
+ store i8* blockaddress(@_Z11irreduciblePh, %TARGET_2), i8** getelementptr inbounds ([256 x i8*], [256 x i8*]* @targets, i64 0, i64 2), align 16
+ %0 = load i32, i32* @tracing, align 4
+ %tobool = icmp eq i32 %0, 0
+ br label %for.cond1
+
+for.cond1: ; preds = %sw.default, %entry
+ %p.addr.0 = phi i8* [ %p, %entry ], [ %p.addr.4, %sw.default ]
+ %sum.0 = phi i32 [ 0, %entry ], [ %add25, %sw.default ]
+ %incdec.ptr = getelementptr inbounds i8, i8* %p.addr.0, i64 1
+ %1 = load i8, i8* %p.addr.0, align 1
+ %incdec.ptr2 = getelementptr inbounds i8, i8* %p.addr.0, i64 2
+ %2 = load i8, i8* %incdec.ptr, align 1
+ %conv3 = zext i8 %2 to i32
+ br label %dispatch_op
+
+dispatch_op: ; preds = %sw.bb6, %for.cond1
+ %p.addr.1 = phi i8* [ %incdec.ptr2, %for.cond1 ], [ %p.addr.2, %sw.bb6 ]
+ %op.0 = phi i8 [ %1, %for.cond1 ], [ 1, %sw.bb6 ]
+ %oparg.0 = phi i32 [ %conv3, %for.cond1 ], [ %oparg.2, %sw.bb6 ]
+ %sum.1 = phi i32 [ %sum.0, %for.cond1 ], [ %add7, %sw.bb6 ]
+ switch i8 %op.0, label %sw.default [
+ i8 0, label %sw.bb
+ i8 1, label %dispatch_op.sw.bb6_crit_edge
+ i8 2, label %sw.bb15
+ ]
+
+dispatch_op.sw.bb6_crit_edge: ; preds = %dispatch_op
+ br label %sw.bb6
+
+sw.bb: ; preds = %indirectgoto, %dispatch_op
+ %oparg.1 = phi i32 [ %oparg.0, %dispatch_op ], [ 0, %indirectgoto ]
+ %sum.2 = phi i32 [ %sum.1, %dispatch_op ], [ %sum.7, %indirectgoto ]
+ %add.neg = sub i32 -5, %oparg.1
+ %sub = add i32 %add.neg, %sum.2
+ br label %exit
+
+TARGET_1: ; preds = %indirectgoto
+ %incdec.ptr4 = getelementptr inbounds i8, i8* %add.ptr.pn, i64 2
+ %3 = load i8, i8* %p.addr.5, align 1
+ %conv5 = zext i8 %3 to i32
+ br label %sw.bb6
+
+sw.bb6: ; preds = %dispatch_op.sw.bb6_crit_edge, %TARGET_1
+ %p.addr.2 = phi i8* [ %incdec.ptr4, %TARGET_1 ], [ %p.addr.1, %dispatch_op.sw.bb6_crit_edge ]
+ %oparg.2 = phi i32 [ %conv5, %TARGET_1 ], [ %oparg.0, %dispatch_op.sw.bb6_crit_edge ]
+ %sum.3 = phi i32 [ %sum.7, %TARGET_1 ], [ %sum.1, %dispatch_op.sw.bb6_crit_edge ]
+ %mul = mul nsw i32 %oparg.2, 7
+ %add7 = add nsw i32 %sum.3, %mul
+ %rem46 = and i32 %add7, 1
+ %cmp8 = icmp eq i32 %rem46, 0
+ br i1 %cmp8, label %dispatch_op, label %if.then
+; USE: br i1 %cmp8, label %dispatch_op, label %if.then, !prof !{{[0-9]+}},
+; USE-SAME: !irr_loop ![[SW_BB6_IRR_LOOP:[0-9]+]]
+
+if.then: ; preds = %sw.bb6
+ %mul9 = mul nsw i32 %add7, 9
+ br label %indirectgoto
+
+TARGET_2: ; preds = %indirectgoto
+ %incdec.ptr13 = getelementptr inbounds i8, i8* %add.ptr.pn, i64 2
+ %4 = load i8, i8* %p.addr.5, align 1
+ %conv14 = zext i8 %4 to i32
+ br label %sw.bb15
+
+sw.bb15: ; preds = %TARGET_2, %dispatch_op
+ %p.addr.3 = phi i8* [ %p.addr.1, %dispatch_op ], [ %incdec.ptr13, %TARGET_2 ]
+ %oparg.3 = phi i32 [ %oparg.0, %dispatch_op ], [ %conv14, %TARGET_2 ]
+ %sum.4 = phi i32 [ %sum.1, %dispatch_op ], [ %sum.7, %TARGET_2 ]
+ %add16 = add nsw i32 %oparg.3, 3
+ %add17 = add nsw i32 %add16, %sum.4
+ br i1 %tobool, label %if.then18, label %exit
+; USE: br i1 %tobool, label %if.then18, label %exit, !prof !{{[0-9]+}},
+; USE-SAME: !irr_loop ![[SW_BB15_IRR_LOOP:[0-9]+]]
+
+if.then18: ; preds = %sw.bb15
+ %idx.ext = sext i32 %oparg.3 to i64
+ %add.ptr = getelementptr inbounds i8, i8* %p.addr.3, i64 %idx.ext
+ %mul19 = mul nsw i32 %add17, 17
+ br label %indirectgoto
+
+unknown_op: ; preds = %indirectgoto
+ %sub24 = add nsw i32 %sum.7, -4
+ br label %sw.default
+
+sw.default: ; preds = %unknown_op, %dispatch_op
+ %p.addr.4 = phi i8* [ %p.addr.5, %unknown_op ], [ %p.addr.1, %dispatch_op ]
+ %sum.5 = phi i32 [ %sub24, %unknown_op ], [ %sum.1, %dispatch_op ]
+ %add25 = add nsw i32 %sum.5, 11
+ br label %for.cond1
+
+exit: ; preds = %sw.bb15, %sw.bb
+ %sum.6 = phi i32 [ %sub, %sw.bb ], [ %add17, %sw.bb15 ]
+ ret i32 %sum.6
+
+indirectgoto: ; preds = %if.then18, %if.then
+ %add.ptr.pn = phi i8* [ %add.ptr, %if.then18 ], [ %p.addr.2, %if.then ]
+ %sum.7 = phi i32 [ %mul19, %if.then18 ], [ %mul9, %if.then ]
+ %p.addr.5 = getelementptr inbounds i8, i8* %add.ptr.pn, i64 1
+ %5 = load i8, i8* %add.ptr.pn, align 1
+ %idxprom21 = zext i8 %5 to i64
+ %arrayidx22 = getelementptr inbounds [256 x i8*], [256 x i8*]* @targets, i64 0, i64 %idxprom21
+ %6 = load i8*, i8** %arrayidx22, align 8
+ indirectbr i8* %6, [label %unknown_op, label %sw.bb, label %TARGET_1, label %TARGET_2]
+; USE: indirectbr i8* %6, [label %unknown_op, label %sw.bb, label %TARGET_1, label %TARGET_2], !prof !{{[0-9]+}},
+; USE-SAME: !irr_loop ![[INDIRECTGOTO_IRR_LOOP:[0-9]+]]
+}
+
+; USE: ![[FOR_COND2_IRR_LOOP]] = !{!"loop_header_weight", i64 1050}
+; USE: ![[ENTRY8_IRR_LOOP]] = !{!"loop_header_weight", i64 373}
+; USE: ![[IF_END9_IRR_LOOP]] = !{!"loop_header_weight", i64 1000}
+; USE: ![[SW_BB6_IRR_LOOP]] = !{!"loop_header_weight", i64 501}
+; USE: ![[SW_BB15_IRR_LOOP]] = !{!"loop_header_weight", i64 100}
+; USE: ![[INDIRECTGOTO_IRR_LOOP]] = !{!"loop_header_weight", i64 400}