From: Florian Hahn Date: Thu, 7 Sep 2017 12:49:39 +0000 (+0000) Subject: [MachineCombiner] Update instruction depths incrementally for large BBs. X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=651af0243784137d375a40c97325ffb383845493;p=llvm [MachineCombiner] Update instruction depths incrementally for large BBs. Summary: For large basic blocks with lots of combinable instructions, the MachineTraceMetrics computations in MachineCombiner can dominate the compile time, as computing the trace information is quadratic in the number of instructions in a BB and it's relevant successors/predecessors. In most cases, knowing the instruction depth should be enough to make combination decisions. As we already iterate over all instructions in a basic block, the instruction depth can be computed incrementally. This reduces the cost of machine-combine drastically in cases where lots of instructions are combined. The major drawback is that AFAIK, computing the critical path length cannot be done incrementally. Therefore we only compute instruction depths incrementally, for basic blocks with more instructions than inc_threshold. The -machine-combiner-inc-threshold option can be used to set the threshold and allows for easier experimenting and checking if using incremental updates for all basic blocks has any impact on the performance. Reviewers: sanjoy, Gerolf, MatzeB, efriedma, fhahn Reviewed By: fhahn Subscribers: kiranchandramohan, javed.absar, efriedma, llvm-commits Differential Revision: https://reviews.llvm.org/D36619 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@312719 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/include/llvm/CodeGen/MachineTraceMetrics.h b/include/llvm/CodeGen/MachineTraceMetrics.h index a8ac54715ba..28f22f4d359 100644 --- a/include/llvm/CodeGen/MachineTraceMetrics.h +++ b/include/llvm/CodeGen/MachineTraceMetrics.h @@ -366,6 +366,12 @@ public: SparseSet &RegUnits); void updateDepth(const MachineBasicBlock *, const MachineInstr&, SparseSet &RegUnits); + + /// Updates the depth of the instructions from Start to End. + void updateDepths(MachineBasicBlock::iterator Start, + MachineBasicBlock::iterator End, + SparseSet &RegUnits); + }; /// Strategies for selecting traces. diff --git a/lib/CodeGen/MachineCombiner.cpp b/lib/CodeGen/MachineCombiner.cpp index e6f80dbb863..8586cf9679f 100644 --- a/lib/CodeGen/MachineCombiner.cpp +++ b/lib/CodeGen/MachineCombiner.cpp @@ -22,6 +22,7 @@ #include "llvm/CodeGen/MachineTraceMetrics.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetSchedule.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" @@ -34,6 +35,11 @@ using namespace llvm; STATISTIC(NumInstCombined, "Number of machineinst combined"); +static cl::opt +inc_threshold("machine-combiner-inc-threshold", cl::Hidden, + cl::desc("Incremental depth computation will be used for basic " + "blocks with more instructions."), cl::init(500)); + namespace { class MachineCombiner : public MachineFunctionPass { const TargetInstrInfo *TII; @@ -73,7 +79,7 @@ private: SmallVectorImpl &InsInstrs, SmallVectorImpl &DelInstrs, DenseMap &InstrIdxForVirtReg, - MachineCombinerPattern Pattern); + MachineCombinerPattern Pattern, bool SlackIsAccurate); bool preservesResourceLen(MachineBasicBlock *MBB, MachineTraceMetrics::Trace BlockTrace, SmallVectorImpl &InsInstrs, @@ -247,7 +253,8 @@ bool MachineCombiner::improvesCriticalPathLen( SmallVectorImpl &InsInstrs, SmallVectorImpl &DelInstrs, DenseMap &InstrIdxForVirtReg, - MachineCombinerPattern Pattern) { + MachineCombinerPattern Pattern, + bool SlackIsAccurate) { assert(TSchedModel.hasInstrSchedModelOrItineraries() && "Missing machine model\n"); // NewRoot is the last instruction in the \p InsInstrs vector. @@ -258,7 +265,7 @@ bool MachineCombiner::improvesCriticalPathLen( unsigned NewRootDepth = getDepth(InsInstrs, InstrIdxForVirtReg, BlockTrace); unsigned RootDepth = BlockTrace.getInstrCycles(*Root).Depth; - DEBUG(dbgs() << "DEPENDENCE DATA FOR " << Root << "\n"; + DEBUG(dbgs() << "DEPENDENCE DATA FOR " << *Root << "\n"; dbgs() << " NewRootDepth: " << NewRootDepth << "\n"; dbgs() << " RootDepth: " << RootDepth << "\n"); @@ -281,17 +288,18 @@ bool MachineCombiner::improvesCriticalPathLen( RootLatency += TSchedModel.computeInstrLatency(I); unsigned RootSlack = BlockTrace.getInstrSlack(*Root); - + unsigned NewCycleCount = NewRootDepth + NewRootLatency; + unsigned OldCycleCount = RootDepth + RootLatency + + (SlackIsAccurate ? RootSlack : 0); DEBUG(dbgs() << " NewRootLatency: " << NewRootLatency << "\n"; dbgs() << " RootLatency: " << RootLatency << "\n"; - dbgs() << " RootSlack: " << RootSlack << "\n"; + dbgs() << " RootSlack: " << RootSlack << " SlackIsAccurate=" + << SlackIsAccurate << "\n"; dbgs() << " NewRootDepth + NewRootLatency = " - << NewRootDepth + NewRootLatency << "\n"; + << NewCycleCount << "\n"; dbgs() << " RootDepth + RootLatency + RootSlack = " - << RootDepth + RootLatency + RootSlack << "\n";); - - unsigned NewCycleCount = NewRootDepth + NewRootLatency; - unsigned OldCycleCount = RootDepth + RootLatency + RootSlack; + << OldCycleCount << "\n"; + ); return NewCycleCount <= OldCycleCount; } @@ -354,17 +362,44 @@ bool MachineCombiner::doSubstitute(unsigned NewSize, unsigned OldSize) { return false; } +/// Inserts InsInstrs and deletes DelInstrs. Incrementally updates instruction +/// depths if requested. +/// +/// \param MBB basic block to insert instructions in +/// \param MI current machine instruction +/// \param InsInstrs new instructions to insert in \p MBB +/// \param DelInstrs instruction to delete from \p MBB +/// \param MinInstr is a pointer to the machine trace information +/// \param RegUnits set of live registers, needed to compute instruction depths +/// \param IncrementalUpdate if true, compute instruction depths incrementally, +/// otherwise invalidate the trace static void insertDeleteInstructions(MachineBasicBlock *MBB, MachineInstr &MI, SmallVector InsInstrs, SmallVector DelInstrs, - MachineTraceMetrics *Traces) { + MachineTraceMetrics::Ensemble *MinInstr, + SparseSet &RegUnits, + bool IncrementalUpdate) { for (auto *InstrPtr : InsInstrs) MBB->insert((MachineBasicBlock::iterator)&MI, InstrPtr); - for (auto *InstrPtr : DelInstrs) + + for (auto *InstrPtr : DelInstrs) { InstrPtr->eraseFromParentAndMarkDBGValuesForRemoval(); - ++NumInstCombined; - Traces->invalidate(MBB); - Traces->verifyAnalysis(); + // Erase all LiveRegs defined by the removed instruction + for (auto I = RegUnits.begin(); I != RegUnits.end(); ) { + if (I->MI == InstrPtr) + I = RegUnits.erase(I); + else + I++; + } + } + + if (IncrementalUpdate) + for (auto *InstrPtr : InsInstrs) + MinInstr->updateDepth(MBB, *InstrPtr, RegUnits); + else + MinInstr->invalidate(MBB); + + NumInstCombined++; } /// Substitute a slow code sequence with a faster one by @@ -378,9 +413,16 @@ bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) { bool Changed = false; DEBUG(dbgs() << "Combining MBB " << MBB->getName() << "\n"); + bool IncrementalUpdate = false; auto BlockIter = MBB->begin(); + auto LastUpdate = BlockIter; // Check if the block is in a loop. const MachineLoop *ML = MLI->getLoopFor(MBB); + if (!MinInstr) + MinInstr = Traces->getEnsemble(MachineTraceMetrics::TS_MinInstrCount); + + SparseSet RegUnits; + RegUnits.setUniverse(TRI->getNumRegUnits()); while (BlockIter != MBB->end()) { auto &MI = *BlockIter++; @@ -419,9 +461,6 @@ bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) { SmallVector InsInstrs; SmallVector DelInstrs; DenseMap InstrIdxForVirtReg; - if (!MinInstr) - MinInstr = Traces->getEnsemble(MachineTraceMetrics::TS_MinInstrCount); - Traces->verifyAnalysis(); TII->genAlternativeCodeSequence(MI, P, InsInstrs, DelInstrs, InstrIdxForVirtReg); unsigned NewInstCount = InsInstrs.size(); @@ -436,23 +475,41 @@ bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) { if (ML && TII->isThroughputPattern(P)) SubstituteAlways = true; + if (IncrementalUpdate) { + // Update depths since the last incremental update. + MinInstr->updateDepths(LastUpdate, std::next(BlockIter), RegUnits); + LastUpdate = BlockIter; + } + // Substitute when we optimize for codesize and the new sequence has // fewer instructions OR // the new sequence neither lengthens the critical path nor increases // resource pressure. if (SubstituteAlways || doSubstitute(NewInstCount, OldInstCount)) { - insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, Traces); + insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, MinInstr, + RegUnits, IncrementalUpdate); // Eagerly stop after the first pattern fires. Changed = true; break; } else { - // Calculating the trace metrics may be expensive, - // so only do this when necessary. + // For big basic blocks, we only compute the full trace the first time + // we hit this. We do not invalidate the trace, but instead update the + // instruction depths incrementally. + // NOTE: Only the instruction depths up to MI are accurate. All other + // trace information is not updated. MachineTraceMetrics::Trace BlockTrace = MinInstr->getTrace(MBB); + Traces->verifyAnalysis(); if (improvesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs, DelInstrs, - InstrIdxForVirtReg, P) && + InstrIdxForVirtReg, P, + !IncrementalUpdate) && preservesResourceLen(MBB, BlockTrace, InsInstrs, DelInstrs)) { - insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, Traces); + if (MBB->size() > inc_threshold) + // Use incremental depth updates for basic blocks above treshold + IncrementalUpdate = true; + + insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, MinInstr, + RegUnits, IncrementalUpdate); + // Eagerly stop after the first pattern fires. Changed = true; break; @@ -467,6 +524,8 @@ bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) { } } + if (Changed && IncrementalUpdate) + Traces->invalidate(MBB); return Changed; } diff --git a/lib/CodeGen/MachineTraceMetrics.cpp b/lib/CodeGen/MachineTraceMetrics.cpp index d8b33147819..534733edc7b 100644 --- a/lib/CodeGen/MachineTraceMetrics.cpp +++ b/lib/CodeGen/MachineTraceMetrics.cpp @@ -823,6 +823,14 @@ updateDepth(const MachineBasicBlock *MBB, const MachineInstr &UseMI, updateDepth(BlockInfo[MBB->getNumber()], UseMI, RegUnits); } +void MachineTraceMetrics::Ensemble:: +updateDepths(MachineBasicBlock::iterator Start, + MachineBasicBlock::iterator End, + SparseSet &RegUnits) { + for (; Start != End; Start++) + updateDepth(Start->getParent(), *Start, RegUnits); +} + /// Compute instruction depths for all instructions above or in MBB in its /// trace. This assumes that the trace through MBB has already been computed. void MachineTraceMetrics::Ensemble:: diff --git a/test/CodeGen/AArch64/machine-combiner.ll b/test/CodeGen/AArch64/machine-combiner.ll index 0bd416ad172..358315d088d 100644 --- a/test/CodeGen/AArch64/machine-combiner.ll +++ b/test/CodeGen/AArch64/machine-combiner.ll @@ -1,5 +1,10 @@ ; RUN: llc -mtriple=aarch64-gnu-linux -mcpu=cortex-a57 -enable-unsafe-fp-math -disable-post-ra < %s | FileCheck %s +; Incremental updates of the instruction depths should be enough for this test +; case. +; RUN: llc -mtriple=aarch64-gnu-linux -mcpu=cortex-a57 -enable-unsafe-fp-math \ +; RUN: -disable-post-ra -machine-combiner-inc-threshold=0 < %s | FileCheck %s + ; Verify that the first two adds are independent regardless of how the inputs are ; commuted. The destination registers are used as source registers for the third add. diff --git a/test/CodeGen/X86/machine-combiner.ll b/test/CodeGen/X86/machine-combiner.ll index 3fbb233696c..048d30b6b24 100644 --- a/test/CodeGen/X86/machine-combiner.ll +++ b/test/CodeGen/X86/machine-combiner.ll @@ -1,6 +1,11 @@ ; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=sse -enable-unsafe-fp-math < %s | FileCheck %s --check-prefix=SSE ; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=avx -enable-unsafe-fp-math < %s | FileCheck %s --check-prefix=AVX +; Incremental updates of the instruction depths should be enough for this test +; case. +; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=sse -enable-unsafe-fp-math -machine-combiner-inc-threshold=0 < %s | FileCheck %s --check-prefix=SSE +; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=avx -enable-unsafe-fp-math -machine-combiner-inc-threshold=0 < %s | FileCheck %s --check-prefix=AVX + ; Verify that the first two adds are independent regardless of how the inputs are ; commuted. The destination registers are used as source registers for the third add. diff --git a/test/CodeGen/X86/mul-constant-result.ll b/test/CodeGen/X86/mul-constant-result.ll index 83a9dbe4b24..417b4385588 100644 --- a/test/CodeGen/X86/mul-constant-result.ll +++ b/test/CodeGen/X86/mul-constant-result.ll @@ -1,6 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=i686-unknown | FileCheck %s --check-prefix=X86 -; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=haswell| FileCheck %s --check-prefix=X64-HSW + +; Incremental updates of the instruction depths should be enough for this test +; case. +; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=haswell -machine-combiner-inc-threshold=0| FileCheck %s --check-prefix=X64-HSW ; Function Attrs: norecurse nounwind readnone uwtable define i32 @mult(i32, i32) local_unnamed_addr #0 {