/// Pair struct containing basic block and taildup profitability
struct BlockAndTailDupResult {
- MachineBasicBlock * BB;
+ MachineBasicBlock *BB;
bool ShouldTailDup;
};
+ /// Triple struct containing an edge's weight, source, and destination.
+ struct WeightedEdge {
+ BlockFrequency Weight;
+ MachineBasicBlock *Src;
+ MachineBasicBlock *Dest;
+ };
+
/// \brief work lists of blocks that are ready to be laid out
SmallVector<MachineBasicBlock *, 16> BlockWorkList;
SmallVector<MachineBasicBlock *, 16> EHPadWorkList;
+ /// Edges that have already been computed as optimal by the trellis code.
+ DenseMap<const MachineBasicBlock *, MachineBasicBlock *> ComputedTrellisEdges;
+
/// \brief Machine Function
MachineFunction *F;
void buildCFGChains();
void optimizeBranches();
void alignBlocks();
+ /// Returns true if a block should be tail-duplicated to increase fallthrough
+ /// opportunities.
bool shouldTailDuplicate(MachineBasicBlock *BB);
/// Check the edge frequencies to see if tail duplication will increase
/// fallthroughs.
const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
BranchProbability AdjustedSumProb,
const BlockChain &Chain, const BlockFilterSet *BlockFilter);
+ /// Check for a trellis layout.
+ bool isTrellis(const MachineBasicBlock *BB,
+ const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
+ const BlockChain &Chain, const BlockFilterSet *BlockFilter);
+ /// Get the best successor given a trellis layout.
+ BlockAndTailDupResult getBestTrellisSuccessor(
+ const MachineBasicBlock *BB,
+ const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
+ BranchProbability AdjustedSumProb, const BlockChain &Chain,
+ const BlockFilterSet *BlockFilter);
+ /// Get the best pair of non-conflicting edges.
+ static std::pair<WeightedEdge, WeightedEdge> getBestNonConflictingEdges(
+ const MachineBasicBlock *BB,
+ SmallVector<SmallVector<WeightedEdge, 8>, 2> &Edges);
/// Returns true if a block can tail duplicate into all unplaced
/// predecessors. Filters based on loop.
bool canTailDuplicateUnplacedPreds(
return SuccProb;
}
-/// Check if a block should be tail duplicated.
+/// Check if \p BB has exactly the successors in \p Successors.
+static bool
+hasSameSuccessors(MachineBasicBlock &BB,
+ SmallPtrSetImpl<const MachineBasicBlock *> &Successors) {
+ if (BB.succ_size() != Successors.size())
+ return false;
+ // We don't want to count self-loops
+ if (Successors.count(&BB))
+ return false;
+ for (MachineBasicBlock *Succ : BB.successors())
+ if (!Successors.count(Succ))
+ return false;
+ return true;
+}
+
+/// Check if a block should be tail duplicated to increase fallthrough
+/// opportunities.
/// \p BB Block to check.
bool MachineBlockPlacement::shouldTailDuplicate(MachineBasicBlock *BB) {
// Blocks with single successors don't create additional fallthrough
// | / | C' (+Succ)
// Succ Succ /|
// / \ | \/ |
- // U/ =V = /= =
+ // U/ =V | == |
// / \ | / \|
// D E D E
// '=' : Branch taken for that CFG edge
// Cost in the first case is: P + V
// For this calculation, we always assume P > Qout. If Qout > P
// The result of this function will be ignored at the caller.
- // Cost in the second case is: Qout + Qin * V + P * U + P * V
- // TODO(iteratee): If we lay out D after Succ, the P * U term
- // goes away. This logic is coming in D28522.
+ // Cost in the second case is: Qout + Qin * U + P * V
if (PDom == nullptr || !Succ->isSuccessor(PDom)) {
BranchProbability UProb = BestSuccSucc;
BranchProbability VProb = AdjustedSuccSumProb - UProb;
BlockFrequency V = SuccFreq * VProb;
- BlockFrequency QinV = Qin * VProb;
+ BlockFrequency QinU = Qin * UProb;
BlockFrequency BaseCost = P + V;
- BlockFrequency DupCost = Qout + QinV + P * AdjustedSuccSumProb;
+ BlockFrequency DupCost = Qout + QinU + P * VProb;
return greaterWithBias(BaseCost, DupCost, EntryFreq);
}
BranchProbability UProb = MBPI->getEdgeProbability(Succ, PDom);
// Succ Succ /| Succ Succ /|
// | \ V | \/ | | \ V | \/ |
// |U \ |U /\ | |U = |U /\ |
- // = D = = =| | D | = =|
+ // = D = = \= | D | = =|
// | / |/ D | / |/ D
// | / | / | = | /
// |/ | / |/ | =
// The cost in the second case (assuming independence), given the layout:
// BB, Succ, (C+Succ), D, Dom
// is Qout + P * V + Qin * U
- // compare P + U vs Qout + P + Qin * U.
+ // compare P + U vs Qout + P * U + Qin.
//
// The 3rd and 4th cases cover when Dom would be chosen to follow Succ.
//
// For the 3rd case, the cost is P + 2 * V
// For the 4th case, the cost is Qout + Qin * U + P * V + V
// We choose 4 over 3 when (P + V) > Qout + Qin * U + P * V
- if (UProb > AdjustedSuccSumProb / 2
- && !hasBetterLayoutPredecessor(Succ, PDom, *BlockToChain[PDom],
- UProb, UProb, Chain, BlockFilter)) {
+ if (UProb > AdjustedSuccSumProb / 2 &&
+ !hasBetterLayoutPredecessor(Succ, PDom, *BlockToChain[PDom], UProb, UProb,
+ Chain, BlockFilter))
// Cases 3 & 4
return greaterWithBias((P + V), (Qout + Qin * UProb + P * VProb),
EntryFreq);
- }
// Cases 1 & 2
return greaterWithBias(
- (P + U), (Qout + Qin * UProb + P * AdjustedSuccSumProb), EntryFreq);
+ (P + U), (Qout + Qin * AdjustedSuccSumProb + P * UProb), EntryFreq);
+}
+
+/// Check for a trellis layout. \p BB is the upper part of a trellis if its
+/// successors form the lower part of a trellis. A successor set S forms the
+/// lower part of a trellis if all of the predecessors of S are either in S or
+/// have all of S as successors. We ignore trellises where BB doesn't have 2
+/// successors because for fewer than 2, it's trivial, and for 3 or greater they
+/// are very uncommon and complex to compute optimally. Allowing edges within S
+/// is not strictly a trellis, but the same algorithm works, so we allow it.
+bool MachineBlockPlacement::isTrellis(
+ const MachineBasicBlock *BB,
+ const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
+ const BlockChain &Chain, const BlockFilterSet *BlockFilter) {
+ // Technically BB could form a trellis with branching factor higher than 2.
+ // But that's extremely uncommon.
+ if (BB->succ_size() != 2 || ViableSuccs.size() != 2)
+ return false;
+
+ SmallPtrSet<const MachineBasicBlock *, 2> Successors(BB->succ_begin(),
+ BB->succ_end());
+ // To avoid reviewing the same predecessors twice.
+ SmallPtrSet<const MachineBasicBlock *, 8> SeenPreds;
+
+ for (MachineBasicBlock *Succ : ViableSuccs) {
+ int PredCount = 0;
+ for (auto SuccPred : Succ->predecessors()) {
+ // Allow triangle successors, but don't count them.
+ if (Successors.count(SuccPred))
+ continue;
+ const BlockChain *PredChain = BlockToChain[SuccPred];
+ if (SuccPred == BB || (BlockFilter && !BlockFilter->count(SuccPred)) ||
+ PredChain == &Chain || PredChain == BlockToChain[Succ])
+ continue;
+ ++PredCount;
+ // Perform the successor check only once.
+ if (!SeenPreds.insert(SuccPred).second)
+ continue;
+ if (!hasSameSuccessors(*SuccPred, Successors))
+ return false;
+ }
+ // If one of the successors has only BB as a predecessor, it is not a
+ // trellis.
+ if (PredCount < 1)
+ return false;
+ }
+ return true;
}
+/// Pick the highest total weight pair of edges that can both be laid out.
+/// The edges in \p Edges[0] are assumed to have a different destination than
+/// the edges in \p Edges[1]. Simple counting shows that the best pair is either
+/// the individual highest weight edges to the 2 different destinations, or in
+/// case of a conflict, one of them should be replaced with a 2nd best edge.
+std::pair<MachineBlockPlacement::WeightedEdge,
+ MachineBlockPlacement::WeightedEdge>
+MachineBlockPlacement::getBestNonConflictingEdges(
+ const MachineBasicBlock *BB,
+ SmallVector<SmallVector<MachineBlockPlacement::WeightedEdge, 8>, 2>
+ &Edges) {
+ // Sort the edges, and then for each successor, find the best incoming
+ // predecessor. If the best incoming predecessors aren't the same,
+ // then that is clearly the best layout. If there is a conflict, one of the
+ // successors will have to fallthrough from the second best predecessor. We
+ // compare which combination is better overall.
+
+ // Sort for highest frequency.
+ auto Cmp = [](WeightedEdge A, WeightedEdge B) { return A.Weight > B.Weight; };
+
+ std::stable_sort(Edges[0].begin(), Edges[0].end(), Cmp);
+ std::stable_sort(Edges[1].begin(), Edges[1].end(), Cmp);
+ auto BestA = Edges[0].begin();
+ auto BestB = Edges[1].begin();
+ // Arrange for the correct answer to be in BestA and BestB
+ // If the 2 best edges don't conflict, the answer is already there.
+ if (BestA->Src == BestB->Src) {
+ // Compare the total fallthrough of (Best + Second Best) for both pairs
+ auto SecondBestA = std::next(BestA);
+ auto SecondBestB = std::next(BestB);
+ BlockFrequency BestAScore = BestA->Weight + SecondBestB->Weight;
+ BlockFrequency BestBScore = BestB->Weight + SecondBestA->Weight;
+ if (BestAScore < BestBScore)
+ BestA = SecondBestA;
+ else
+ BestB = SecondBestB;
+ }
+ // Arrange for the BB edge to be in BestA if it exists.
+ if (BestB->Src == BB)
+ std::swap(BestA, BestB);
+ return std::make_pair(*BestA, *BestB);
+}
+
+/// Get the best successor from \p BB based on \p BB being part of a trellis.
+/// We only handle trellises with 2 successors, so the algorithm is
+/// straightforward: Find the best pair of edges that don't conflict. We find
+/// the best incoming edge for each successor in the trellis. If those conflict,
+/// we consider which of them should be replaced with the second best.
+/// If one of the chosen edges originates at \p BB, its destination is returned
+/// as the best successor; otherwise no successor is chosen.
+MachineBlockPlacement::BlockAndTailDupResult
+MachineBlockPlacement::getBestTrellisSuccessor(
+ const MachineBasicBlock *BB,
+ const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
+ BranchProbability AdjustedSumProb, const BlockChain &Chain,
+ const BlockFilterSet *BlockFilter) {
+
+ BlockAndTailDupResult Result = {nullptr, false};
+ SmallPtrSet<const MachineBasicBlock *, 4> Successors(BB->succ_begin(),
+ BB->succ_end());
+
+ // We assume size 2 because it's common. For general n, we would have to do
+ // the Hungarian algorithm, but it's not worth the complexity because more
+ // than 2 successors is fairly uncommon, and a trellis even more so.
+ if (Successors.size() != 2 || ViableSuccs.size() != 2)
+ return Result;
+
+ // Collect the edge frequencies of all edges that form the trellis.
+ SmallVector<SmallVector<WeightedEdge, 8>, 2> Edges(2);
+ int SuccIndex = 0;
+ for (auto Succ : ViableSuccs) {
+ for (MachineBasicBlock *SuccPred : Succ->predecessors()) {
+ // Skip any placed predecessors that are not BB
+ if (SuccPred != BB)
+ if ((BlockFilter && !BlockFilter->count(SuccPred)) ||
+ BlockToChain[SuccPred] == &Chain ||
+ BlockToChain[SuccPred] == BlockToChain[Succ])
+ continue;
+ BlockFrequency EdgeFreq = MBFI->getBlockFreq(SuccPred) *
+ MBPI->getEdgeProbability(SuccPred, Succ);
+ Edges[SuccIndex].push_back({EdgeFreq, SuccPred, Succ});
+ }
+ ++SuccIndex;
+ }
+
+ // Pick the best combination of 2 edges from all the edges in the trellis.
+ WeightedEdge BestA, BestB;
+ std::tie(BestA, BestB) = getBestNonConflictingEdges(BB, Edges);
+
+ if (BestA.Src != BB) {
+ // If we have a trellis, and BB doesn't have the best fallthrough edges,
+ // we shouldn't choose any successor. We've already looked and there's a
+ // better fallthrough edge for all the successors.
+ DEBUG(dbgs() << "Trellis, but not one of the chosen edges.\n");
+ return Result;
+ }
+
+ // Did we pick the triangle edge? If tail-duplication is profitable, do
+ // that instead. Otherwise merge the triangle edge now while we know it is
+ // optimal.
+ if (BestA.Dest == BestB.Src) {
+ // The edges are BB->Succ1->Succ2, and we're looking to see if BB->Succ2
+ // would be better.
+ MachineBasicBlock *Succ1 = BestA.Dest;
+ MachineBasicBlock *Succ2 = BestB.Dest;
+ // Check to see if tail-duplication would be profitable.
+ if (TailDupPlacement && shouldTailDuplicate(Succ2) &&
+ canTailDuplicateUnplacedPreds(BB, Succ2, Chain, BlockFilter) &&
+ isProfitableToTailDup(BB, Succ2, MBPI->getEdgeProbability(BB, Succ1),
+ Chain, BlockFilter)) {
+ DEBUG(BranchProbability Succ2Prob = getAdjustedProbability(
+ MBPI->getEdgeProbability(BB, Succ2), AdjustedSumProb);
+ dbgs() << " Selected: " << getBlockName(Succ2)
+ << ", probability: " << Succ2Prob << " (Tail Duplicate)\n");
+ Result.BB = Succ2;
+ Result.ShouldTailDup = true;
+ return Result;
+ }
+ }
+ // We have already computed the optimal edge for the other side of the
+ // trellis.
+ ComputedTrellisEdges[BestB.Src] = BestB.Dest;
+
+ auto TrellisSucc = BestA.Dest;
+ DEBUG(BranchProbability SuccProb = getAdjustedProbability(
+ MBPI->getEdgeProbability(BB, TrellisSucc), AdjustedSumProb);
+ dbgs() << " Selected: " << getBlockName(TrellisSucc)
+ << ", probability: " << SuccProb << " (Trellis)\n");
+ Result.BB = TrellisSucc;
+ return Result;
+}
/// When the option TailDupPlacement is on, this method checks if the
/// fallthrough candidate block \p Succ (of block \p BB) can be tail-duplicated
if (!shouldTailDuplicate(Succ))
return false;
+ // For CFG checking.
+ SmallPtrSet<const MachineBasicBlock *, 4> Successors(BB->succ_begin(),
+ BB->succ_end());
for (MachineBasicBlock *Pred : Succ->predecessors()) {
// Make sure all unplaced and unfiltered predecessors can be
// tail-duplicated into.
if (Pred == BB || (BlockFilter && !BlockFilter->count(Pred))
|| BlockToChain[Pred] == &Chain)
continue;
- if (!TailDup.canTailDuplicate(Succ, Pred))
+ if (!TailDup.canTailDuplicate(Succ, Pred)) {
+ if (Successors.size() > 1 && hasSameSuccessors(*Pred, Successors))
+ // This will result in a trellis after tail duplication, so we don't
+ // need to copy Succ into this predecessor. In the presence
+ // of a trellis tail duplication can continue to be profitable.
+ // For example:
+ // A A
+ // |\ |\
+ // | \ | \
+ // | C | C+BB
+ // | / | |
+ // |/ | |
+ // BB => BB |
+ // |\ |\/|
+ // | \ |/\|
+ // | D | D
+ // | / | /
+ // |/ |/
+ // Succ Succ
+ //
+ // After BB was duplicated into C, the layout looks like the one on the
+ // right. BB and C now have the same successors. When considering
+ // whether Succ can be duplicated into all its unplaced predecessors, we
+ // ignore C.
+ // We can do this because C already has a profitable fallthrough, namely
+ // D. TODO(iteratee): ignore sufficiently cold predecessors for
+ // duplication and for this test.
+ //
+ // This allows trellises to be laid out in 2 separate chains
+ // (A,B,Succ,...) and later (C,D,...) This is a reasonable heuristic
+ // because it allows the creation of 2 fallthrough paths with links
+ // between them, and we correctly identify the best layout for these
+ // CFGs. We want to extend trellises that the user created in addition
+ // to trellises created by tail-duplication, so we just look for the
+ // CFG.
+ continue;
return false;
+ }
}
return true;
}
// | Pred----| | S1----
// | | | |
// --(S1 or S2) ---Pred--
+ // |
+ // S2
//
// topo-cost = freq(S->Pred) + freq(BB->S1) + freq(BB->S2)
// + min(freq(Pred->S1), freq(Pred->S2))
// Non-topo-order cost:
- // In the worst case, S2 will not get laid out after Pred.
// non-topo-cost = 2 * freq(S->Pred) + freq(BB->S2).
// To be conservative, we can assume that min(freq(Pred->S1), freq(Pred->S2))
// is 0. Then the non topo layout is better when
DEBUG(dbgs() << "Selecting best successor for: " << getBlockName(BB) << "\n");
+  // If we already precomputed the best successor for BB as part of a trellis we
+ // saw earlier, return that if still applicable.
+ auto FoundEdge = ComputedTrellisEdges.find(BB);
+ if (FoundEdge != ComputedTrellisEdges.end()) {
+ MachineBasicBlock *Succ = FoundEdge->second;
+ ComputedTrellisEdges.erase(FoundEdge);
+ BlockChain *SuccChain = BlockToChain[Succ];
+ if (BB->isSuccessor(Succ) && (!BlockFilter || BlockFilter->count(Succ)) &&
+ SuccChain != &Chain && Succ == *SuccChain->begin()) {
+ BestSucc.BB = Succ;
+ return BestSucc;
+ }
+ }
+
+  // If BB is part of a trellis, use the trellis to determine the optimal
+  // fallthrough edges.
+ if (isTrellis(BB, Successors, Chain, BlockFilter))
+ return getBestTrellisSuccessor(BB, Successors, AdjustedSumProb, Chain,
+ BlockFilter);
+
// For blocks with CFG violations, we may be able to lay them out anyway with
// tail-duplication. We keep this vector so we can perform the probability
// calculations the minimum number of times.
; CHECK-NEXT: ; BB#1: ; %b3
; CHECK: ldr [[LOAD:w[0-9]+]]
-; CHECK: cbz [[LOAD]], [[SKIP_LONG_B:LBB[0-9]+_[0-9]+]]
-; CHECK-NEXT: b [[B8:LBB[0-9]+_[0-9]+]]
-
-; CHECK-NEXT: [[SKIP_LONG_B]]:
+; CHECK: cbnz [[LOAD]], [[B8:LBB[0-9]+_[0-9]+]]
; CHECK-NEXT: b [[B7:LBB[0-9]+_[0-9]+]]
+; CHECK-NEXT: [[B8]]: ; %b8
+; CHECK-NEXT: ret
+
; CHECK-NEXT: [[B2]]: ; %b2
; CHECK: mov w{{[0-9]+}}, #93
; CHECK: bl _extfunc
; CHECK: cbz w{{[0-9]+}}, [[B7]]
-
-; CHECK-NEXT: [[B8]]: ; %b8
-; CHECK-NEXT: ret
+; CHECK-NEXT: b [[B8]]
; CHECK-NEXT: [[B7]]: ; %b7
; CHECK: mov w{{[0-9]+}}, #13
; CHECK: b _extfunc
+
define void @split_block_no_fallthrough(i64 %val) #0 {
bb:
%c0 = icmp sgt i64 %val, -5
define i32 @do_nothing_if_resultant_opcodes_would_differ() #0 {
; CHECK-LABEL: do_nothing_if_resultant_opcodes_would_differ
; CHECK: cmn
-; CHECK: b.gt
+; CHECK: b.le
; CHECK: cmp
; CHECK: b.gt
entry:
;
; CHECK-LABEL: func
; CHECK-NOT: and
-; CHECK: tbnz
+; CHECK: tbz
define void @func() {
%c0 = icmp sgt i64 0, 0
br i1 %c0, label %b1, label %b6
; GCNNOOPT: v_writelane_b32
; GCN: s_cbranch_scc1 [[END:BB[0-9]+_[0-9]+]]
-
-; GCN: ; BB#1
; GCNNOOPT: v_readlane_b32
; GCNNOOPT: v_readlane_b32
; GCN: buffer_store_dword
-; GCNOPT-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; TODO: This waitcnt can be eliminated
+; GCNNOOPT: s_endpgm
; GCN: {{^}}[[END]]:
; GCN: s_endpgm
; GCN-LABEL: {{^}}long_branch_hang:
; GCN: s_cmp_lt_i32 s{{[0-9]+}}, 6
-; GCN-NEXT: s_cbranch_scc0 [[LONG_BR_0:BB[0-9]+_[0-9]+]]
+; GCN-NEXT: s_cbranch_scc1 {{BB[0-9]+_[0-9]+}}
+; GCN-NEXT: s_branch [[LONG_BR_0:BB[0-9]+_[0-9]+]]
; GCN-NEXT: BB{{[0-9]+_[0-9]+}}:
; GCN: s_add_u32 vcc_lo, vcc_lo, [[LONG_BR_DEST0:BB[0-9]+_[0-9]+]]-(
; RUN: llc -march=amdgcn -verify-machineinstrs -O0 < %s
; GCN-LABEL: {{^}}test_loop:
-; GCN: [[LABEL:BB[0-9+]_[0-9]+]]:
+; GCN: [[LABEL:BB[0-9]+_[0-9]+]]: ; %for.body{{$}}
; GCN: ds_read_b32
; GCN: ds_write_b32
; GCN: s_branch [[LABEL]]
; GCN: v_cmp_ne_u32_e64
; GCN: BB{{[0-9]+_[0-9]+}}:
+
define void @nonconvergent_inlineasm(i64 addrspace(1)* nocapture %arg) {
bb:
%tmp = call i32 @llvm.amdgcn.workitem.id.x()
; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
; GCN-NOHSA: buffer_store_dword [[ONE]]
; GCN-HSA: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[ONE]]
-; GCN; {{^}}[[EXIT]]:
+; GCN: {{^}}[[EXIT]]:
; GCN: s_endpgm
define void @sopc_vopc_legalize_bug(i32 %cond, i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
bb3: ; preds = %bb2
; CHECK: bl _quux
; CHECK-NOT: bl _quux
-; NOMERGE: bl _baz
-; NOMERGE: bl _baz
+; NOMERGE-DAG: bl _baz
+; NOMERGE-DAG: bl _baz
-; NOMERGE: bl _quux
-; NOMERGE: bl _quux
+; NOMERGE-DAG: bl _quux
+; NOMERGE-DAG: bl _quux
; ModuleID = 'tail.c'
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64"
; CHECK-ARMV7-NEXT: [[HEAD:.LBB[0-9_]+]]:
; CHECK-ARMV7-NEXT: strexb [[SUCCESS:r[0-9]+]], r2, [r0]
; CHECK-ARMV7-NEXT: cmp [[SUCCESS]], #0
-; CHECK-ARMV7-NEXT: moveq [[RES:r[0-9]+]], #1
+; CHECK-ARMV7-NEXT: moveq r0, #1
; CHECK-ARMV7-NEXT: bxeq lr
; CHECK-ARMV7-NEXT: [[TRY]]:
-; CHECK-ARMV7-NEXT: ldrexb [[LD:r[0-9]+]], [r0]
-; CHECK-ARMV7-NEXT: cmp [[LD]], [[DESIRED]]
+; CHECK-ARMV7-NEXT: ldrexb [[SUCCESS]], [r0]
+; CHECK-ARMV7-NEXT: cmp [[SUCCESS]], r1
; CHECK-ARMV7-NEXT: beq [[HEAD]]
; CHECK-ARMV7-NEXT: clrex
-; CHECK-ARMV7-NEXT: mov [[RES]], #0
+; CHECK-ARMV7-NEXT: mov r0, #0
; CHECK-ARMV7-NEXT: bx lr
; CHECK-THUMBV7-LABEL: test_cmpxchg_res_i8:
; Important to check for beginning of basic block, because if it gets
; if-converted the test is probably no longer checking what it should.
-; CHECK: {{LBB[0-9]+_2}}:
+; CHECK: %end
; CHECK-NEXT: vpop {d7, d8}
; CHECK-NEXT: pop {r4, pc}
;CHECK-NEXT: bc 12, 1, [[BODY1LABEL:[._0-9A-Za-z]+]]
;CHECK-NEXT: # %test2
;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30
-;CHECK-NEXT: beq 0, [[EXITLABEL:[._0-9A-Za-z]+]]
-;CHECK-NEXT: b [[BODY2LABEL:[._0-9A-Za-z]+]]
+;CHECK-NEXT: bne 0, [[BODY2LABEL:[._0-9A-Za-z]+]]
+;CHECK: [[EXITLABEL:[._0-9A-Za-z]+]]: # %exit
+;CHECK: blr
;CHECK-NEXT: [[BODY1LABEL]]
;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30
;CHECK-NEXT: beq 0, [[EXITLABEL]]
-;CHECK-NEXT: [[BODY2LABEL]]
-;CHECK: [[EXITLABEL:[._0-9A-Za-z]+]]: # %exit
-;CHECK: blr
+;CHECK-NEXT: [[BODY2LABEL:[._0-9A-Za-z]+]]:
+;CHECK: b [[EXITLABEL]]
define void @tail_dup_break_cfg(i32 %tag) {
entry:
br label %test1
test2:
%tagbit2 = and i32 %tag, 2
%tagbit2eq0 = icmp ne i32 %tagbit2, 0
- br i1 %tagbit2eq0, label %body2, label %exit, !prof !1 ; %body2 more likely
+ br i1 %tagbit2eq0, label %body2, label %exit, !prof !3 ; %body2 more likely
body2:
call void @b()
call void @b()
!1 = !{!"branch_weights", i32 5, i32 3}
!2 = !{!"branch_weights", i32 95, i32 5}
-!3 = !{!"branch_weights", i32 7, i32 3}
+!3 = !{!"branch_weights", i32 8, i32 3}
-; RUN: llc -outline-optional-branches -O2 < %s | FileCheck %s
+; RUN: llc -O2 < %s | FileCheck %s
target datalayout = "e-m:e-i64:64-n32:64"
target triple = "powerpc64le-grtev4-linux-gnu"
; Intended layout:
-; The outlining flag produces the layout
+; The chain-based outlining produces the layout
; test1
; test2
; test3
; test4
-; exit
; optional1
; optional2
; optional3
; optional4
+; exit
; Tail duplication puts test n+1 at the end of optional n
; so optional1 includes a copy of test2 at the end, and branches
; to test3 (at the top) or falls through to optional 2.
-; The CHECK statements check for the whole string of tests and exit block,
+; The CHECK statements check for the whole string of tests
; and then check that the correct test has been duplicated into the end of
; the optional blocks and that the optional blocks are in the correct order.
-;CHECK-LABEL: f:
+;CHECK-LABEL: straight_test:
; test1 may have been merged with entry
;CHECK: mr [[TAGREG:[0-9]+]], 3
;CHECK: andi. {{[0-9]+}}, [[TAGREG]], 1
-;CHECK-NEXT: bc 12, 1, [[OPT1LABEL:[._0-9A-Za-z]+]]
-;CHECK-NEXT: [[TEST2LABEL:[._0-9A-Za-z]+]]: # %test2
+;CHECK-NEXT: bc 12, 1, .[[OPT1LABEL:[_0-9A-Za-z]+]]
+;CHECK-NEXT: # %test2
;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30
-;CHECK-NEXT: bne 0, [[OPT2LABEL:[._0-9A-Za-z]+]]
-;CHECK-NEXT: [[TEST3LABEL:[._0-9A-Za-z]+]]: # %test3
+;CHECK-NEXT: bne 0, .[[OPT2LABEL:[_0-9A-Za-z]+]]
+;CHECK-NEXT: .[[TEST3LABEL:[_0-9A-Za-z]+]]: # %test3
;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 29, 29
-;CHECK-NEXT: bne 0, .[[OPT3LABEL:[._0-9A-Za-z]+]]
-;CHECK-NEXT: [[TEST4LABEL:[._0-9A-Za-z]+]]: # %test4
+;CHECK-NEXT: bne 0, .[[OPT3LABEL:[_0-9A-Za-z]+]]
+;CHECK-NEXT: .[[TEST4LABEL:[_0-9A-Za-z]+]]: # %test4
;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 28, 28
-;CHECK-NEXT: bne 0, .[[OPT4LABEL:[._0-9A-Za-z]+]]
-;CHECK-NEXT: [[EXITLABEL:[._0-9A-Za-z]+]]: # %exit
+;CHECK-NEXT: bne 0, .[[OPT4LABEL:[_0-9A-Za-z]+]]
+;CHECK-NEXT: .[[EXITLABEL:[_0-9A-Za-z]+]]: # %exit
;CHECK: blr
-;CHECK-NEXT: [[OPT1LABEL]]
+;CHECK-NEXT: .[[OPT1LABEL]]:
;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30
-;CHECK-NEXT: beq 0, [[TEST3LABEL]]
-;CHECK-NEXT: [[OPT2LABEL]]
+;CHECK-NEXT: beq 0, .[[TEST3LABEL]]
+;CHECK-NEXT: .[[OPT2LABEL]]:
;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 29, 29
-;CHECK-NEXT: beq 0, [[TEST4LABEL]]
-;CHECK-NEXT: [[OPT3LABEL]]
+;CHECK-NEXT: beq 0, .[[TEST4LABEL]]
+;CHECK-NEXT: .[[OPT3LABEL]]:
;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 28, 28
-;CHECK-NEXT: beq 0, [[EXITLABEL]]
-;CHECK-NEXT: [[OPT4LABEL]]
-;CHECK: b [[EXITLABEL]]
+;CHECK-NEXT: beq 0, .[[EXITLABEL]]
+;CHECK-NEXT: .[[OPT4LABEL]]:
+;CHECK: b .[[EXITLABEL]]
-define void @f(i32 %tag) {
+define void @straight_test(i32 %tag) {
entry:
br label %test1
test1:
%tagbit1 = and i32 %tag, 1
%tagbit1eq0 = icmp eq i32 %tagbit1, 0
- br i1 %tagbit1eq0, label %test2, label %optional1
+ br i1 %tagbit1eq0, label %test2, label %optional1, !prof !1
optional1:
call void @a()
call void @a()
test2:
%tagbit2 = and i32 %tag, 2
%tagbit2eq0 = icmp eq i32 %tagbit2, 0
- br i1 %tagbit2eq0, label %test3, label %optional2
+ br i1 %tagbit2eq0, label %test3, label %optional2, !prof !1
optional2:
call void @b()
call void @b()
test3:
%tagbit3 = and i32 %tag, 4
%tagbit3eq0 = icmp eq i32 %tagbit3, 0
- br i1 %tagbit3eq0, label %test4, label %optional3
+ br i1 %tagbit3eq0, label %test4, label %optional3, !prof !1
optional3:
call void @c()
call void @c()
test4:
%tagbit4 = and i32 %tag, 8
%tagbit4eq0 = icmp eq i32 %tagbit4, 0
- br i1 %tagbit4eq0, label %exit, label %optional4
+ br i1 %tagbit4eq0, label %exit, label %optional4, !prof !1
optional4:
call void @d()
call void @d()
ret void
}
+; Intended layout:
+; The chain-based outlining produces the layout
+; entry
+; --- Begin loop ---
+; for.latch
+; for.check
+; test1
+; test2
+; test3
+; test4
+; optional1
+; optional2
+; optional3
+; optional4
+; --- End loop ---
+; exit
+; The CHECK statements check for the whole string of tests and exit block,
+; and then check that the correct test has been duplicated into the end of
+; the optional blocks and that the optional blocks are in the correct order.
+;CHECK-LABEL: loop_test:
+;CHECK: add [[TAGPTRREG:[0-9]+]], 3, 4
+;CHECK: .[[LATCHLABEL:[._0-9A-Za-z]+]]: # %for.latch
+;CHECK: addi
+;CHECK: .[[CHECKLABEL:[._0-9A-Za-z]+]]: # %for.check
+;CHECK: lwz [[TAGREG:[0-9]+]], 0([[TAGPTRREG]])
+;CHECK: # %test1
+;CHECK: andi. {{[0-9]+}}, [[TAGREG]], 1
+;CHECK-NEXT: bc 12, 1, .[[OPT1LABEL:[._0-9A-Za-z]+]]
+;CHECK-NEXT: # %test2
+;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30
+;CHECK-NEXT: bne 0, .[[OPT2LABEL:[._0-9A-Za-z]+]]
+;CHECK-NEXT: .[[TEST3LABEL:[._0-9A-Za-z]+]]: # %test3
+;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 29, 29
+;CHECK-NEXT: bne 0, .[[OPT3LABEL:[._0-9A-Za-z]+]]
+;CHECK-NEXT: .[[TEST4LABEL:[._0-9A-Za-z]+]]: # %{{(test4|optional3)}}
+;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 28, 28
+;CHECK-NEXT: beq 0, .[[LATCHLABEL]]
+;CHECK-NEXT: b .[[OPT4LABEL:[._0-9A-Za-z]+]]
+;CHECK: [[OPT1LABEL]]
+;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30
+;CHECK-NEXT: beq 0, .[[TEST3LABEL]]
+;CHECK-NEXT: .[[OPT2LABEL]]
+;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 29, 29
+;CHECK-NEXT: beq 0, .[[TEST4LABEL]]
+;CHECK-NEXT: .[[OPT3LABEL]]
+;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 28, 28
+;CHECK-NEXT: beq 0, .[[LATCHLABEL]]
+;CHECK: [[OPT4LABEL]]:
+;CHECK: b .[[LATCHLABEL]]
+define void @loop_test(i32* %tags, i32 %count) {
+entry:
+ br label %for.check
+for.check:
+ %count.loop = phi i32 [%count, %entry], [%count.sub, %for.latch]
+ %done.count = icmp ugt i32 %count.loop, 0
+ %tag_ptr = getelementptr inbounds i32, i32* %tags, i32 %count
+ %tag = load i32, i32* %tag_ptr
+ %done.tag = icmp eq i32 %tag, 0
+ %done = and i1 %done.count, %done.tag
+ br i1 %done, label %test1, label %exit, !prof !1
+test1:
+ %tagbit1 = and i32 %tag, 1
+ %tagbit1eq0 = icmp eq i32 %tagbit1, 0
+ br i1 %tagbit1eq0, label %test2, label %optional1, !prof !1
+optional1:
+ call void @a()
+ call void @a()
+ call void @a()
+ call void @a()
+ br label %test2
+test2:
+ %tagbit2 = and i32 %tag, 2
+ %tagbit2eq0 = icmp eq i32 %tagbit2, 0
+ br i1 %tagbit2eq0, label %test3, label %optional2, !prof !1
+optional2:
+ call void @b()
+ call void @b()
+ call void @b()
+ call void @b()
+ br label %test3
+test3:
+ %tagbit3 = and i32 %tag, 4
+ %tagbit3eq0 = icmp eq i32 %tagbit3, 0
+ br i1 %tagbit3eq0, label %test4, label %optional3, !prof !1
+optional3:
+ call void @c()
+ call void @c()
+ call void @c()
+ call void @c()
+ br label %test4
+test4:
+ %tagbit4 = and i32 %tag, 8
+ %tagbit4eq0 = icmp eq i32 %tagbit4, 0
+ br i1 %tagbit4eq0, label %for.latch, label %optional4, !prof !1
+optional4:
+ call void @d()
+ call void @d()
+ call void @d()
+ call void @d()
+ br label %for.latch
+for.latch:
+ %count.sub = sub i32 %count.loop, 1
+ br label %for.check
+exit:
+ ret void
+}
+
+; The block then2 is not unavoidable, meaning it does not dominate the exit.
+; But since it can be tail-duplicated, it should be placed as a fallthrough from
+; test2 and copied. The purpose here is to make sure that the tail-duplication
+; code is independent of the outlining code, which works by choosing the
+; "unavoidable" blocks.
+; CHECK-LABEL: avoidable_test:
+; CHECK: # %entry
+; CHECK: andi.
+; CHECK: # %test2
+; Make sure then2 falls through from test2
+; CHECK-NOT: # %{{[-_a-zA-Z0-9]+}}
+; CHECK: # %then2
+; CHECK: rlwinm. {{[0-9]+}}, {{[0-9]+}}, 0, 29, 29
+; CHECK: # %else1
+; CHECK: bl a
+; CHECK: bl a
+; Make sure then2 was copied into else1
+; CHECK: rlwinm. {{[0-9]+}}, {{[0-9]+}}, 0, 29, 29
+; CHECK: # %end1
+; CHECK: bl d
+; CHECK: # %else2
+; CHECK: bl c
+; CHECK: # %end2
+define void @avoidable_test(i32 %tag) {
+entry:
+ br label %test1
+test1:
+ %tagbit1 = and i32 %tag, 1
+ %tagbit1eq0 = icmp eq i32 %tagbit1, 0
+ br i1 %tagbit1eq0, label %test2, label %else1, !prof !1 ; %test2 more likely
+else1:
+ call void @a()
+ call void @a()
+ br label %then2
+test2:
+ %tagbit2 = and i32 %tag, 2
+ %tagbit2eq0 = icmp eq i32 %tagbit2, 0
+ br i1 %tagbit2eq0, label %then2, label %else2, !prof !1 ; %then2 more likely
+then2:
+ %tagbit3 = and i32 %tag, 4
+ %tagbit3eq0 = icmp eq i32 %tagbit3, 0
+ br i1 %tagbit3eq0, label %end2, label %end1, !prof !1 ; %end2 more likely
+else2:
+ call void @c()
+ br label %end2
+end2:
+ ret void
+end1:
+ call void @d()
+ ret void
+}
+
+; CHECK-LABEL: trellis_test
+; The number in the block labels is the expected block frequency given the
+; probabilities annotated. There is a conflict in the b;c->d;e trellis that
+; should be resolved as c->e;b->d.
+; The d;e->f;g trellis should be resolved as e->g;d->f.
+; The f;g->h;i trellis should be resolved as f->i;g->h.
+; The h;i->j;ret trellis contains a triangle edge, and should be resolved as
+; h->j->ret
+; CHECK: # %entry
+; CHECK: # %c10
+; CHECK: # %e9
+; CHECK: # %g10
+; CHECK: # %h10
+; CHECK: # %j8
+; CHECK: # %ret
+; CHECK: # %b6
+; CHECK: # %d7
+; CHECK: # %f6
+; CHECK: # %i6
+define void @trellis_test(i32 %tag) {
+entry:
+ br label %a16
+a16:
+ call void @a()
+ call void @a()
+ %tagbits.a = and i32 %tag, 3
+ %tagbits.a.eq0 = icmp eq i32 %tagbits.a, 0
+ br i1 %tagbits.a.eq0, label %c10, label %b6, !prof !1 ; 10 to 6
+c10:
+ call void @c()
+ call void @c()
+ %tagbits.c = and i32 %tag, 12
+ %tagbits.c.eq0 = icmp eq i32 %tagbits.c, 0
+ ; Both of these edges should be hotter than the other incoming edge
+ ; for e9 or d7
+ br i1 %tagbits.c.eq0, label %e9, label %d7, !prof !3 ; 6 to 4
+e9:
+ call void @e()
+ call void @e()
+ %tagbits.e = and i32 %tag, 48
+ %tagbits.e.eq0 = icmp eq i32 %tagbits.e, 0
+ br i1 %tagbits.e.eq0, label %g10, label %f6, !prof !4 ; 7 to 2
+g10:
+ call void @g()
+ call void @g()
+ %tagbits.g = and i32 %tag, 192
+ %tagbits.g.eq0 = icmp eq i32 %tagbits.g, 0
+ br i1 %tagbits.g.eq0, label %i6, label %h10, !prof !5 ; 2 to 8
+i6:
+ call void @i()
+ call void @i()
+ %tagbits.i = and i32 %tag, 768
+ %tagbits.i.eq0 = icmp eq i32 %tagbits.i, 0
+ br i1 %tagbits.i.eq0, label %ret, label %j8, !prof !2 ; balanced (3 to 3)
+b6:
+ call void @b()
+ call void @b()
+ %tagbits.b = and i32 %tag, 12
+ %tagbits.b.eq1 = icmp eq i32 %tagbits.b, 8
+ br i1 %tagbits.b.eq1, label %e9, label %d7, !prof !2 ; balanced (3 to 3)
+d7:
+ call void @d()
+ call void @d()
+ %tagbits.d = and i32 %tag, 48
+ %tagbits.d.eq1 = icmp eq i32 %tagbits.d, 32
+ br i1 %tagbits.d.eq1, label %g10, label %f6, !prof !6 ; 3 to 4
+f6:
+ call void @f()
+ call void @f()
+ %tagbits.f = and i32 %tag, 192
+ %tagbits.f.eq1 = icmp eq i32 %tagbits.f, 128
+ br i1 %tagbits.f.eq1, label %i6, label %h10, !prof !7 ; 4 to 2
+h10:
+ call void @h()
+ call void @h()
+ %tagbits.h = and i32 %tag, 768
+ %tagbits.h.eq1 = icmp eq i32 %tagbits.h, 512
+ br i1 %tagbits.h.eq1, label %ret, label %j8, !prof !2 ; balanced (5 to 5)
+j8:
+ call void @j()
+ call void @j()
+ br label %ret
+ret:
+ ret void
+}
+
+; Verify that we still consider tail-duplication opportunities if we find a
+; triangle trellis. Here D->F->G is the triangle, and D;E are both predecessors
+; of both F and G. The basic trellis algorithm picks the F->G edge, but after
+; checking, it's profitable to duplicate G into F. The weights here are not
+; really important. They are there to help make the test stable.
+; CHECK-LABEL: trellis_then_dup_test
+; CHECK: # %entry
+; CHECK: # %b
+; CHECK: # %d
+; CHECK: # %g
+; CHECK: # %ret1
+; CHECK: # %c
+; CHECK: # %e
+; CHECK: # %f
+; CHECK: # %ret2
+; CHECK: # %ret
+define void @trellis_then_dup_test(i32 %tag) {
+entry:
+ br label %a
+a:
+ call void @a()
+ call void @a()
+ %tagbits.a = and i32 %tag, 3
+ %tagbits.a.eq0 = icmp eq i32 %tagbits.a, 0
+ br i1 %tagbits.a.eq0, label %b, label %c, !prof !1 ; 5 to 3
+b:
+ call void @b()
+ call void @b()
+ %tagbits.b = and i32 %tag, 12
+ %tagbits.b.eq1 = icmp eq i32 %tagbits.b, 8
+ br i1 %tagbits.b.eq1, label %d, label %e, !prof !1 ; 5 to 3
+d:
+ call void @d()
+ call void @d()
+ %tagbits.d = and i32 %tag, 48
+ %tagbits.d.eq1 = icmp eq i32 %tagbits.d, 32
+ br i1 %tagbits.d.eq1, label %g, label %f, !prof !1 ; 5 to 3
+f:
+ call void @f()
+ call void @f()
+ br label %g
+g:
+ %tagbits.g = and i32 %tag, 192
+ %tagbits.g.eq0 = icmp eq i32 %tagbits.g, 0
+ br i1 %tagbits.g.eq0, label %ret1, label %ret2, !prof !2 ; balanced
+c:
+ call void @c()
+ call void @c()
+ %tagbits.c = and i32 %tag, 12
+ %tagbits.c.eq0 = icmp eq i32 %tagbits.c, 0
+ br i1 %tagbits.c.eq0, label %d, label %e, !prof !1 ; 5 to 3
+e:
+ call void @e()
+ call void @e()
+ %tagbits.e = and i32 %tag, 48
+ %tagbits.e.eq0 = icmp eq i32 %tagbits.e, 0
+ br i1 %tagbits.e.eq0, label %g, label %f, !prof !1 ; 5 to 3
+ret1:
+ call void @a()
+ br label %ret
+ret2:
+ call void @b()
+ br label %ret
+ret:
+ ret void
+}
+
declare void @a()
declare void @b()
declare void @c()
declare void @d()
+declare void @e()
+declare void @f()
+declare void @g()
+declare void @h()
+declare void @i()
+declare void @j()
+
+!1 = !{!"branch_weights", i32 5, i32 3}
+!2 = !{!"branch_weights", i32 50, i32 50}
+!3 = !{!"branch_weights", i32 6, i32 4}
+!4 = !{!"branch_weights", i32 7, i32 2}
+!5 = !{!"branch_weights", i32 2, i32 8}
+!6 = !{!"branch_weights", i32 3, i32 4}
+!7 = !{!"branch_weights", i32 4, i32 2}
; CHECK: nop
; CHECK:.LBB1_1: ! %entry
; CHECK: mov %g0, %i0
+; CHECK: ! %entry
; CHECK: cmp %i0, 0
-; CHECK: bne .LBB1_4
-; CHECK: ba .LBB1_5
-; CHECK:.LBB1_2: ! Block address taken
-; CHECK: mov 1, %i0
; CHECK: be .LBB1_5
+; CHECK: nop
; CHECK:.LBB1_4:
-; CHECK: ba .LBB1_6
+; CHECK: mov 1, %i0
+; CHECK: ba .LBB1_6
+; CHECK:.LBB1_2: ! Block address taken
+; CHECK: mov 1, %i0
+; CHECK: cmp %i0, 0
+; CHECK: bne .LBB1_4
+; CHECK: nop
}
declare i8* @llvm.frameaddress(i32) #2
-; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -tail-dup-placement=0| FileCheck %s
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -tail-dup-placement=0 | FileCheck %s
; Test memcpy, memmove, and memset intrinsics.
define void @unnatural_cfg1() {
; Test that we can handle a loop with an inner unnatural loop at the end of
; a function. This is a gross CFG reduced out of the single source GCC.
-; CHECK: unnatural_cfg1
+; CHECK-LABEL: unnatural_cfg1
; CHECK: %entry
; CHECK: %loop.body1
; CHECK: %loop.body2
; Test that we can handle a loop with a nested natural loop *and* an unnatural
; loop. This was reduced from a crash on block placement when run over
; single-source GCC.
-; CHECK: unnatural_cfg2
+; CHECK-LABEL: unnatural_cfg2
; CHECK: %entry
; CHECK: %loop.body1
; CHECK: %loop.body2
-; CHECK: %loop.body3
-; CHECK: %loop.inner1.begin
-; The end block is folded with %loop.body3...
-; CHECK-NOT: %loop.inner1.end
; CHECK: %loop.body4
; CHECK: %loop.inner2.begin
-; The loop.inner2.end block is folded
+; CHECK: %loop.inner2.begin
+; CHECK: %loop.body3
+; CHECK: %loop.inner1.begin
; CHECK: %loop.header
; CHECK: %bail
; didn't correctly locate the fallthrough successor, assuming blindly that the
; first one was the fallthrough successor. As a result, we would add an
; erroneous jump to the landing pad thinking *that* was the default successor.
-; CHECK: test_eh_lpad_successor
+; CHECK-LABEL: test_eh_lpad_successor
; CHECK: %entry
; CHECK-NOT: jmp
; CHECK: %loop
; fallthrough simply won't occur. Make sure we don't crash trying to update
; terminators for such constructs.
;
-; CHECK: test_eh_throw
+; CHECK-LABEL: test_eh_throw
; CHECK: %entry
; CHECK: %cleanup
; attempt to merge onto the wrong end of the inner loop just because we find it
; first. This was reduced from a crasher in GCC's single source.
;
-; CHECK: test_unnatural_cfg_backwards_inner_loop
+; CHECK-LABEL: test_unnatural_cfg_backwards_inner_loop
; CHECK: %entry
; CHECK: %loop2b
; CHECK: %loop1
; fallthrough because that happens to always produce unanalyzable branches on
; x86.
;
-; CHECK: unanalyzable_branch_to_loop_header
+; CHECK-LABEL: unanalyzable_branch_to_loop_header
; CHECK: %entry
; CHECK: %loop
; CHECK: %exit
; This branch is now analyzable and hence the destination block becomes the
; hotter one. The right order is entry->bar->exit->foo.
;
-; CHECK: unanalyzable_branch_to_best_succ
+; CHECK-LABEL: unanalyzable_branch_to_best_succ
; CHECK: %entry
; CHECK: %bar
; CHECK: %exit
; Ensure that we can handle unanalyzable branches where the destination block
; gets selected as the best free block in the CFG.
;
-; CHECK: unanalyzable_branch_to_free_block
+; CHECK-LABEL: unanalyzable_branch_to_free_block
; CHECK: %entry
; CHECK: %a
; CHECK: %b
; Ensure that we don't crash as we're building up many unanalyzable branches,
; blocks, and loops.
;
-; CHECK: many_unanalyzable_branches
+; CHECK-LABEL: many_unanalyzable_branches
; CHECK: %entry
; CHECK: %exit
; strange layouts that are siginificantly less efficient, often times maing
; it discontiguous.
;
-; CHECK: @benchmark_heapsort
+; CHECK-LABEL: @benchmark_heapsort
; CHECK: %entry
; First rotated loop top.
; CHECK: .p2align
; CHECK-NEXT: idivl %ebx
; CHECK-NEXT: movl %eax, %esi
; CHECK-NEXT: testl $-256, %edi
-; CHECK-NEXT: jne .LBB3_5
-; CHECK-NEXT: jmp .LBB3_4
-; CHECK-NEXT: .LBB3_1:
-; CHECK-NEXT: movzbl %cl, %eax
-; CHECK-NEXT: # kill: %EAX<def> %EAX<kill> %AX<def>
-; CHECK-NEXT: divb %bl
-; CHECK-NEXT: movzbl %al, %esi
-; CHECK-NEXT: testl $-256, %edi
; CHECK-NEXT: je .LBB3_4
; CHECK-NEXT: .LBB3_5:
; CHECK-NEXT: xorl %edx, %edx
; CHECK-NEXT: movl %ecx, %eax
; CHECK-NEXT: divl %ebx
; CHECK-NEXT: jmp .LBB3_6
+; CHECK-NEXT: .LBB3_1:
+; CHECK-NEXT: movzbl %cl, %eax
+; CHECK-NEXT: # kill: %EAX<def> %EAX<kill> %AX<def>
+; CHECK-NEXT: divb %bl
+; CHECK-NEXT: movzbl %al, %esi
+; CHECK-NEXT: testl $-256, %edi
+; CHECK-NEXT: jne .LBB3_5
; CHECK-NEXT: .LBB3_4:
; CHECK-NEXT: movzbl %cl, %eax
; CHECK-NEXT: # kill: %EAX<def> %EAX<kill> %AX<def>
; X32-NEXT: xorps %xmm1, %xmm1
; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp)
; X32-NEXT: jne .LBB1_5
-; X32-NEXT: jmp .LBB1_4
+; X32-NEXT: .LBB1_4:
+; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp)
+; X32-NEXT: jne .LBB1_8
+; X32-NEXT: .LBB1_7:
+; X32-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; X32-NEXT: jmp .LBB1_9
; X32-NEXT: .LBB1_1:
; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp)
; X32-NEXT: .LBB1_5: # %entry
; X32-NEXT: xorps %xmm2, %xmm2
; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp)
-; X32-NEXT: jne .LBB1_8
-; X32-NEXT: jmp .LBB1_7
-; X32-NEXT: .LBB1_4:
-; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp)
; X32-NEXT: je .LBB1_7
; X32-NEXT: .LBB1_8: # %entry
; X32-NEXT: xorps %xmm3, %xmm3
-; X32-NEXT: jmp .LBB1_9
-; X32-NEXT: .LBB1_7:
-; X32-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; X32-NEXT: .LBB1_9: # %entry
; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp)
; X32-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; X64-NEXT: xorps %xmm1, %xmm1
; X64-NEXT: testl %edx, %edx
; X64-NEXT: jne .LBB1_5
-; X64-NEXT: jmp .LBB1_4
+; X64-NEXT: .LBB1_4:
+; X64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X64-NEXT: testl %r8d, %r8d
+; X64-NEXT: jne .LBB1_8
+; X64-NEXT: .LBB1_7:
+; X64-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; X64-NEXT: jmp .LBB1_9
; X64-NEXT: .LBB1_1:
; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-NEXT: testl %edx, %edx
; X64-NEXT: .LBB1_5: # %entry
; X64-NEXT: xorps %xmm2, %xmm2
; X64-NEXT: testl %r8d, %r8d
-; X64-NEXT: jne .LBB1_8
-; X64-NEXT: jmp .LBB1_7
-; X64-NEXT: .LBB1_4:
-; X64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X64-NEXT: testl %r8d, %r8d
; X64-NEXT: je .LBB1_7
; X64-NEXT: .LBB1_8: # %entry
; X64-NEXT: xorps %xmm3, %xmm3
-; X64-NEXT: jmp .LBB1_9
-; X64-NEXT: .LBB1_7:
-; X64-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; X64-NEXT: .LBB1_9: # %entry
; X64-NEXT: testl %esi, %esi
; X64-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
ret <4 x i32> %zext
}
-; Fragile test warning - we need to induce the generation of a vselect
+; Fragile test warning - we need to induce the generation of a vselect
; post-legalization to cause the crash seen in:
; https://llvm.org/bugs/show_bug.cgi?id=31672
; Is there a way to do that without an unsafe/fast sqrt intrinsic call?
; CHECK-LABEL: tail_dup_merge_loops
; CHECK: # %entry
; CHECK-NOT: # %{{[a-zA-Z_]+}}
+; CHECK: # %exit
+; CHECK-NOT: # %{{[a-zA-Z_]+}}
; CHECK: # %inner_loop_exit
; CHECK-NOT: # %{{[a-zA-Z_]+}}
; CHECK: # %inner_loop_latch
; CHECK-NOT: # %{{[a-zA-Z_]+}}
; CHECK: # %inner_loop_test
-; CHECK-NOT: # %{{[a-zA-Z_]+}}
-; CHECK: # %exit
define void @tail_dup_merge_loops(i32 %a, i8* %b, i8* %c) local_unnamed_addr #0 {
entry:
%notlhs674.i = icmp eq i32 %a, 0
-; RUN: llc -O2 -tail-dup-placement-threshold=4 -o - %s | FileCheck %s
+; RUN: llc -O3 -tail-dup-placement-threshold=4 -o - %s | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
; CHECK-NEXT: jbe .LBB2_3
; CHECK-NEXT: ucomiss %xmm{{[0-2]}}, %xmm{{[0-2]}}
; CHECK-NEXT: ja .LBB2_4
-; CHECK-NEXT: jmp .LBB2_2
+; CHECK-NEXT: .LBB2_2:
+; CHECK-NEXT: movb $1, %al
+; CHECK-NEXT: ret
; CHECK-NEXT: .LBB2_3:
; CHECK-NEXT: ucomiss %xmm{{[0-2]}}, %xmm{{[0-2]}}
; CHECK-NEXT: jbe .LBB2_2
; CHECK-NEXT: .LBB2_4:
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: ret
-; CHECK-NEXT: .LBB2_2:
-; CHECK-NEXT: movb $1, %al
-; CHECK-NEXT: ret
define i1 @dont_merge_oddly(float* %result) nounwind {
entry:
; Check that only one mov will be generated in the kernel loop.
; CHECK-LABEL: foo:
-; CHECK: [[LOOP1:^[a-zA-Z0-9_.]+]]: {{#.*}} %for.body
+; CHECK: [[LOOP1:^[a-zA-Z0-9_.]+]]: {{#.*}} %for.body{{$}}
; CHECK-NOT: mov
; CHECK: movl {{.*}}, [[REG1:%[a-z0-9]+]]
; CHECK-NOT: mov
; Check that only two mov will be generated in the kernel loop.
; CHECK-LABEL: goo:
-; CHECK: [[LOOP2:^[a-zA-Z0-9_.]+]]: {{#.*}} %for.body
+; CHECK: [[LOOP2:^[a-zA-Z0-9_.]+]]: {{#.*}} %for.body{{$}}
; CHECK-NOT: mov
; CHECK: movl {{.*}}, [[REG2:%[a-z0-9]+]]
; CHECK-NOT: mov
; Test that the blocks are analyzed in the correct order.
; CHECK-LABEL: cfg:
entry:
- br i1 %x, label %bb1, label %bb2
+ br i1 %x, label %bb1, label %bb3
bb1:
%p1 = alloca %struct.S
; CHECK: pushl %eax
; CHECK: subl $1020, %esp
- br label %bb3
+ br label %bb4
+
bb2:
- %p2 = alloca %struct.T
+ %p5 = alloca %struct.T
; CHECK: pushl %eax
; CHECK: subl $2996, %esp
- br label %bb3
+ call void @g(%struct.T* %p5)
+ ret void
bb3:
- br i1 %y, label %bb4, label %bb5
+ %p2 = alloca %struct.T
+; CHECK: pushl %eax
+; CHECK: subl $2996, %esp
+ br label %bb4
bb4:
+ br i1 %y, label %bb5, label %bb2
+
+bb5:
%p4 = alloca %struct.S
; CHECK: subl $1024, %esp
call void @f(%struct.S* %p4)
ret void
-bb5:
- %p5 = alloca %struct.T
-; CHECK: pushl %eax
-; CHECK: subl $2996, %esp
- call void @g(%struct.T* %p5)
- ret void
}