#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
"profile-guided-section-prefix", cl::Hidden, cl::init(true),
cl::desc("Use profile info to add section prefix for hot/cold functions"));
+static cl::opt<unsigned> FreqRatioToSkipMerge(
+ "cgp-freq-ratio-to-skip-merge", cl::Hidden, cl::init(2),
+ cl::desc("Skip merging empty blocks if (frequency of empty block) / "
+ "(frequency of destination block) is greater than this ratio"));
+
namespace {
typedef SmallPtrSet<Instruction *, 16> SetOfInstrs;
typedef PointerIntPair<Type *, 1, bool> TypeIsSExt;
const TargetTransformInfo *TTI;
const TargetLibraryInfo *TLInfo;
const LoopInfo *LI;
+ std::unique_ptr<BlockFrequencyInfo> BFI;
+ std::unique_ptr<BranchProbabilityInfo> BPI;
/// As we scan instructions optimizing them, this is the next instruction
/// to optimize. Transforms that can invalidate this should update it.
private:
bool eliminateFallThrough(Function &F);
bool eliminateMostlyEmptyBlocks(Function &F);
+ BasicBlock *findDestBlockOfMergeableEmptyBlock(BasicBlock *BB);
bool canMergeBlocks(const BasicBlock *BB, const BasicBlock *DestBB) const;
void eliminateMostlyEmptyBlock(BasicBlock *BB);
+ bool isMergingEmptyBlockProfitable(BasicBlock *BB, BasicBlock *DestBB,
+ bool isPreheader);
bool optimizeBlock(BasicBlock &BB, bool& ModifiedDT);
bool optimizeInst(Instruction *I, bool& ModifiedDT);
bool optimizeMemoryInst(Instruction *I, Value *Addr,
// Clear per function information.
InsertedInsts.clear();
PromotedInsts.clear();
+ BFI.reset();
+ BPI.reset();
ModifiedDT = false;
if (TM)
return Changed;
}
+/// Find a destination block from BB if BB is mergeable empty block.
+BasicBlock *CodeGenPrepare::findDestBlockOfMergeableEmptyBlock(BasicBlock *BB) {
+ // If this block doesn't end with an uncond branch, ignore it.
+ BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator());
+ if (!BI || !BI->isUnconditional())
+ return nullptr;
+
+ // If the instruction before the branch (skipping debug info) isn't a phi
+ // node, then other stuff is happening here.
+ BasicBlock::iterator BBI = BI->getIterator();
+ if (BBI != BB->begin()) {
+ --BBI;
+ while (isa<DbgInfoIntrinsic>(BBI)) {
+ if (BBI == BB->begin())
+ break;
+ --BBI;
+ }
+ if (!isa<DbgInfoIntrinsic>(BBI) && !isa<PHINode>(BBI))
+ return nullptr;
+ }
+
+ // Do not break infinite loops.
+ BasicBlock *DestBB = BI->getSuccessor(0);
+ if (DestBB == BB)
+ return nullptr;
+
+ if (!canMergeBlocks(BB, DestBB))
+ DestBB = nullptr;
+
+ return DestBB;
+}
+
/// Eliminate blocks that contain only PHI nodes, debug info directives, and an
/// unconditional branch. Passes before isel (e.g. LSR/loopsimplify) often split
/// edges in ways that are non-optimal for isel. Start by eliminating these
// Note that this intentionally skips the entry block.
for (Function::iterator I = std::next(F.begin()), E = F.end(); I != E;) {
BasicBlock *BB = &*I++;
+ BasicBlock *DestBB = findDestBlockOfMergeableEmptyBlock(BB);
+ if (!DestBB ||
+ !isMergingEmptyBlockProfitable(BB, DestBB, Preheaders.count(BB)))
+ continue;
+
+ eliminateMostlyEmptyBlock(BB);
+ MadeChange = true;
+ }
+ return MadeChange;
+}
+
+bool CodeGenPrepare::isMergingEmptyBlockProfitable(BasicBlock *BB,
+ BasicBlock *DestBB,
+ bool isPreheader) {
+ // Do not delete loop preheaders if doing so would create a critical edge.
+ // Loop preheaders can be good locations to spill registers. If the
+ // preheader is deleted and we create a critical edge, registers may be
+ // spilled in the loop body instead.
+ if (!DisablePreheaderProtect && isPreheader &&
+ !(BB->getSinglePredecessor() &&
+ BB->getSinglePredecessor()->getSingleSuccessor()))
+ return false;
+
+ // Try to skip merging if the unique predecessor of BB is terminated by a
+ // switch or indirect branch instruction, and BB is used as an incoming block
+ // of PHIs in DestBB. In such case, merging BB and DestBB would cause ISel to
+ // add COPY instructions in the predecessor of BB instead of BB (if it is not
+ // merged). Note that the critical edge created by merging such blocks wont be
+ // split in MachineSink because the jump table is not analyzable. By keeping
+ // such empty block (BB), ISel will place COPY instructions in BB, not in the
+ // predecessor of BB.
+ BasicBlock *Pred = BB->getUniquePredecessor();
+ if (!Pred ||
+ !(isa<SwitchInst>(Pred->getTerminator()) ||
+ isa<IndirectBrInst>(Pred->getTerminator())))
+ return true;
- // If this block doesn't end with an uncond branch, ignore it.
- BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator());
- if (!BI || !BI->isUnconditional())
+ if (BB->getTerminator() != BB->getFirstNonPHI())
+ return true;
+
+ // We use a simple cost heuristic which determine skipping merging is
+ // profitable if the cost of skipping merging is less than the cost of
+ // merging : Cost(skipping merging) < Cost(merging BB), where the
+ // Cost(skipping merging) is Freq(BB) * (Cost(Copy) + Cost(Branch)), and
+ // the Cost(merging BB) is Freq(Pred) * Cost(Copy).
+ // Assuming Cost(Copy) == Cost(Branch), we could simplify it to :
+ // Freq(Pred) / Freq(BB) > 2.
+ // Note that if there are multiple empty blocks sharing the same incoming
+ // value for the PHIs in the DestBB, we consider them together. In such
+ // case, Cost(merging BB) will be the sum of their frequencies.
+
+ if (!isa<PHINode>(DestBB->begin()))
+ return true;
+
+ if (!BFI) {
+ BPI.reset(new BranchProbabilityInfo(*BB->getParent(), *LI));
+ BFI.reset(new BlockFrequencyInfo(*BB->getParent(), *BPI, *LI));
+ }
+
+ BlockFrequency PredFreq = BFI->getBlockFreq(Pred);
+ BlockFrequency BBFreq = BFI->getBlockFreq(BB);
+ SmallPtrSet<BasicBlock *, 16> SameIncomingValueBBs;
+
+ // Find all other incoming blocks from which incoming values of all PHIs in
+ // DestBB are the same as the ones from BB.
+ for (pred_iterator PI = pred_begin(DestBB), E = pred_end(DestBB); PI != E;
+ ++PI) {
+ BasicBlock *DestBBPred = *PI;
+ if (DestBBPred == BB)
continue;
- // If the instruction before the branch (skipping debug info) isn't a phi
- // node, then other stuff is happening here.
- BasicBlock::iterator BBI = BI->getIterator();
- if (BBI != BB->begin()) {
- --BBI;
- while (isa<DbgInfoIntrinsic>(BBI)) {
- if (BBI == BB->begin())
- break;
- --BBI;
+ bool HasAllSameValue = true;
+ BasicBlock::const_iterator DestBBI = DestBB->begin();
+ while (const PHINode *DestPN = dyn_cast<PHINode>(DestBBI++)) {
+ if (DestPN->getIncomingValueForBlock(BB) !=
+ DestPN->getIncomingValueForBlock(DestBBPred)) {
+ HasAllSameValue = false;
+ break;
}
- if (!isa<DbgInfoIntrinsic>(BBI) && !isa<PHINode>(BBI))
- continue;
}
+ if (HasAllSameValue)
+ SameIncomingValueBBs.insert(DestBBPred);
+ }
- // Do not break infinite loops.
- BasicBlock *DestBB = BI->getSuccessor(0);
- if (DestBB == BB)
- continue;
-
- if (!canMergeBlocks(BB, DestBB))
- continue;
+ // See if all BB's incoming values are same as the value from Pred. In this
+ // case, no reason to skip merging because COPYs are expected to be place in
+ // Pred already.
+ if (SameIncomingValueBBs.count(Pred))
+ return true;
- // Do not delete loop preheaders if doing so would create a critical edge.
- // Loop preheaders can be good locations to spill registers. If the
- // preheader is deleted and we create a critical edge, registers may be
- // spilled in the loop body instead.
- if (!DisablePreheaderProtect && Preheaders.count(BB) &&
- !(BB->getSinglePredecessor() && BB->getSinglePredecessor()->getSingleSuccessor()))
- continue;
+ for (auto SameValueBB : SameIncomingValueBBs)
+ if (SameValueBB->getUniquePredecessor() == Pred &&
+ DestBB == findDestBlockOfMergeableEmptyBlock(SameValueBB))
+ BBFreq += BFI->getBlockFreq(SameValueBB);
- eliminateMostlyEmptyBlock(BB);
- MadeChange = true;
- }
- return MadeChange;
+ return PredFreq.getFrequency() <=
+ BBFreq.getFrequency() * FreqRatioToSkipMerge;
}
/// Return true if we can merge BB into DestBB if there is a single
; REQUIRES: asserts
-; RUN: llc < %s -disable-preheader-prot=true -march=x86 -stats 2>&1 | grep "Number of blocks eliminated" | grep 6
+; RUN: llc < %s -disable-preheader-prot=true -march=x86 -stats 2>&1 | grep "Number of blocks eliminated" | grep 3
+; RUN: llc < %s -disable-preheader-prot=true -march=x86 -stats -cgp-freq-ratio-to-skip-merge=10 2>&1 | grep "Number of blocks eliminated" | grep 6
; RUN: llc < %s -disable-preheader-prot=false -march=x86 -stats 2>&1 | grep "Number of blocks eliminated" | grep 3
; PR1296
br label %for.cond357
sw.bb474:
+ ; CHECK: sw.bb474
+ ; spill is hoisted here. Although loop depth1 is even hotter than loop depth2, sw.bb474 is still cold.
+ ; CHECK: movq %r{{.*}}, {{[0-9]+}}(%rsp)
+ ; CHECK: land.rhs485
%cmp476 = icmp eq i8 undef, 0
br i1 %cmp476, label %if.end517, label %do.body479.preheader
do.body479.preheader:
- ; CHECK: do.body479.preheader
- ; spill is hoisted here. Although loop depth1 is even hotter than loop depth2, do.body479.preheader is cold.
- ; CHECK: movq %r{{.*}}, {{[0-9]+}}(%rsp)
- ; CHECK: land.rhs485
%cmp4833314 = icmp eq i8 undef, 0
br i1 %cmp4833314, label %if.end517, label %land.rhs485
; ARM64-LABEL: @widen_switch_i16(
; ARM64: %0 = zext i16 %trunc to i32
; ARM64-NEXT: switch i32 %0, label %sw.default [
-; ARM64-NEXT: i32 1, label %return
+; ARM64-NEXT: i32 1, label %sw.bb0
; ARM64-NEXT: i32 65535, label %sw.bb1
}
; ARM64-LABEL: @widen_switch_i17(
; ARM64: %0 = zext i17 %trunc to i32
; ARM64-NEXT: switch i32 %0, label %sw.default [
-; ARM64-NEXT: i32 10, label %return
+; ARM64-NEXT: i32 10, label %sw.bb0
; ARM64-NEXT: i32 131071, label %sw.bb1
}
; ARM64-LABEL: @widen_switch_i16_sext(
; ARM64: %0 = sext i2 %a to i32
; ARM64-NEXT: switch i32 %0, label %sw.default [
-; ARM64-NEXT: i32 1, label %return
+; ARM64-NEXT: i32 1, label %sw.bb0
; ARM64-NEXT: i32 -1, label %sw.bb1
}
; X86-LABEL: @widen_switch_i16(
; X86: %trunc = trunc i32 %a to i16
; X86-NEXT: switch i16 %trunc, label %sw.default [
-; X86-NEXT: i16 1, label %return
+; X86-NEXT: i16 1, label %sw.bb0
; X86-NEXT: i16 -1, label %sw.bb1
}
; X86-LABEL: @widen_switch_i17(
; X86: %0 = zext i17 %trunc to i32
; X86-NEXT: switch i32 %0, label %sw.default [
-; X86-NEXT: i32 10, label %return
+; X86-NEXT: i32 10, label %sw.bb0
; X86-NEXT: i32 131071, label %sw.bb1
}
; X86-LABEL: @widen_switch_i16_sext(
; X86: %0 = sext i2 %a to i8
; X86-NEXT: switch i8 %0, label %sw.default [
-; X86-NEXT: i8 1, label %return
+; X86-NEXT: i8 1, label %sw.bb0
; X86-NEXT: i8 -1, label %sw.bb1
}
--- /dev/null
+; RUN: opt -codegenprepare < %s -mtriple=aarch64-none-linux-gnu -S | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+; Expect to skip merging two empty blocks (sw.bb and sw.bb2) into sw.epilog
+; as both of them are unlikely executed.
+define i32 @f_switch(i32 %c) {
+; CHECK-LABEL: @f_switch
+; CHECK-LABEL: entry:
+; CHECK: i32 10, label %sw.bb
+; CHECK: i32 20, label %sw.bb2
+entry:
+ switch i32 %c, label %sw.default [
+ i32 10, label %sw.bb
+ i32 20, label %sw.bb2
+ i32 30, label %sw.bb3
+ i32 40, label %sw.bb4
+ ], !prof !0
+
+sw.bb: ; preds = %entry
+ br label %sw.epilog
+
+sw.bb2: ; preds = %entry
+ br label %sw.epilog
+
+sw.bb3: ; preds = %entry
+ call void bitcast (void (...)* @callcase3 to void ()*)()
+ br label %sw.epilog
+
+sw.bb4: ; preds = %entry
+ call void bitcast (void (...)* @callcase4 to void ()*)()
+ br label %sw.epilog
+
+sw.default: ; preds = %entry
+ call void bitcast (void (...)* @calldefault to void ()*)()
+ br label %sw.epilog
+
+; CHECK-LABEL: sw.epilog:
+; CHECK: %fp.0 = phi void (...)* [ @FD, %sw.default ], [ @F4, %sw.bb4 ], [ @F3, %sw.bb3 ], [ @F2, %sw.bb2 ], [ @F1, %sw.bb ]
+sw.epilog: ; preds = %sw.default, %sw.bb3, %sw.bb2, %sw.bb
+ %fp.0 = phi void (...)* [ @FD, %sw.default ], [ @F4, %sw.bb4 ], [ @F3, %sw.bb3 ], [ @F2, %sw.bb2 ], [ @F1, %sw.bb ]
+ %callee.knr.cast = bitcast void (...)* %fp.0 to void ()*
+ call void %callee.knr.cast()
+ ret i32 0
+}
+
+; Expect not to merge sw.bb2 because of the conflict in the incoming value from
+; sw.bb which is already merged.
+define i32 @f_switch2(i32 %c) {
+; CHECK-LABEL: @f_switch2
+; CHECK-LABEL: entry:
+; CHECK: i32 10, label %sw.epilog
+; CHECK: i32 20, label %sw.bb2
+entry:
+ switch i32 %c, label %sw.default [
+ i32 10, label %sw.bb
+ i32 20, label %sw.bb2
+ i32 30, label %sw.bb3
+ i32 40, label %sw.bb4
+ ], !prof !1
+
+sw.bb: ; preds = %entry
+ br label %sw.epilog
+
+sw.bb2: ; preds = %entry
+ br label %sw.epilog
+
+sw.bb3: ; preds = %entry
+ call void bitcast (void (...)* @callcase3 to void ()*)()
+ br label %sw.epilog
+
+sw.bb4: ; preds = %entry
+ call void bitcast (void (...)* @callcase4 to void ()*)()
+ br label %sw.epilog
+
+sw.default: ; preds = %entry
+ call void bitcast (void (...)* @calldefault to void ()*)()
+ br label %sw.epilog
+
+; CHECK-LABEL: sw.epilog:
+; CHECK: %fp.0 = phi void (...)* [ @FD, %sw.default ], [ @F4, %sw.bb4 ], [ @F3, %sw.bb3 ], [ @F2, %sw.bb2 ], [ @F1, %entry ]
+sw.epilog: ; preds = %sw.default, %sw.bb3, %sw.bb2, %sw.bb
+ %fp.0 = phi void (...)* [ @FD, %sw.default ], [ @F4, %sw.bb4 ], [ @F3, %sw.bb3 ], [ @F2, %sw.bb2 ], [ @F1, %sw.bb ]
+ %callee.knr.cast = bitcast void (...)* %fp.0 to void ()*
+ call void %callee.knr.cast()
+ ret i32 0
+}
+
+; Multiple empty blocks should be considered together if all incoming values
+; from them are same. We expect to merge both empty blocks (sw.bb and sw.bb2)
+; because the sum of frequencies are higer than the threshold.
+define i32 @f_switch3(i32 %c) {
+; CHECK-LABEL: @f_switch3
+; CHECK-LABEL: entry:
+; CHECK: i32 10, label %sw.epilog
+; CHECK: i32 20, label %sw.epilog
+entry:
+ switch i32 %c, label %sw.default [
+ i32 10, label %sw.bb
+ i32 20, label %sw.bb2
+ i32 30, label %sw.bb3
+ i32 40, label %sw.bb4
+ ], !prof !2
+
+sw.bb: ; preds = %entry
+ br label %sw.epilog
+
+sw.bb2: ; preds = %entry
+ br label %sw.epilog
+
+sw.bb3: ; preds = %entry
+ call void bitcast (void (...)* @callcase3 to void ()*)()
+ br label %sw.epilog
+
+sw.bb4: ; preds = %entry
+ call void bitcast (void (...)* @callcase4 to void ()*)()
+ br label %sw.epilog
+
+sw.default: ; preds = %entry
+ call void bitcast (void (...)* @calldefault to void ()*)()
+ br label %sw.epilog
+
+; CHECK-LABEL: sw.epilog:
+; CHECK: %fp.0 = phi void (...)* [ @FD, %sw.default ], [ @F4, %sw.bb4 ], [ @F3, %sw.bb3 ], [ @F1, %entry ], [ @F1, %entry ]
+sw.epilog: ; preds = %sw.default, %sw.bb3, %sw.bb2, %sw.bb
+ %fp.0 = phi void (...)* [ @FD, %sw.default ], [ @F4, %sw.bb4 ], [ @F3, %sw.bb3 ], [ @F1, %sw.bb2 ], [ @F1, %sw.bb ]
+ %callee.knr.cast = bitcast void (...)* %fp.0 to void ()*
+ call void %callee.knr.cast()
+ ret i32 0
+}
+
+declare void @F1(...) local_unnamed_addr
+declare void @F2(...) local_unnamed_addr
+declare void @F3(...) local_unnamed_addr
+declare void @F4(...) local_unnamed_addr
+declare void @FD(...) local_unnamed_addr
+declare void @callcase3(...) local_unnamed_addr
+declare void @callcase4(...) local_unnamed_addr
+declare void @calldefault(...) local_unnamed_addr
+
+!0 = !{!"branch_weights", i32 5, i32 1, i32 1,i32 5, i32 5}
+!1 = !{!"branch_weights", i32 1 , i32 5, i32 1,i32 1, i32 1}
+!2 = !{!"branch_weights", i32 1 , i32 4, i32 1,i32 1, i32 1}