Codegen: Make chains from trellis-shaped CFGs

author Kyle Butt <kyle+llvm@iteratee.net>

Wed, 15 Feb 2017 19:49:14 +0000 (19:49 +0000)

committer Kyle Butt <kyle+llvm@iteratee.net>

Wed, 15 Feb 2017 19:49:14 +0000 (19:49 +0000)
author Kyle Butt <kyle+llvm@iteratee.net>
Wed, 15 Feb 2017 19:49:14 +0000 (19:49 +0000)
committer Kyle Butt <kyle+llvm@iteratee.net>
Wed, 15 Feb 2017 19:49:14 +0000 (19:49 +0000)
diff --git a/lib/CodeGen/MachineBlockPlacement.cpp b/lib/CodeGen/MachineBlockPlacement.cpp

index bfc90e0d8cdc792da6300b7dfce43ad5caf8b7f4..b7ab25b87ead9b1bd9812224e770a66e329f49aa 100644 (file)
--- a/lib/CodeGen/MachineBlockPlacement.cpp
+++ b/lib/CodeGen/MachineBlockPlacement.cpp
@@ -294,14 +294,24 @@ class MachineBlockPlacement : public MachineFunctionPass {
  
    /// Pair struct containing basic block and taildup profitiability
    struct BlockAndTailDupResult {
-    MachineBasicBlock * BB;
+    MachineBasicBlock *BB;
      bool ShouldTailDup;
    };
  
+  /// Triple struct containing edge weight and the edge.
+  struct WeightedEdge {
+    BlockFrequency Weight;
+    MachineBasicBlock *Src;
+    MachineBasicBlock *Dest;
+  };
+
    /// \brief work lists of blocks that are ready to be laid out
    SmallVector<MachineBasicBlock *, 16> BlockWorkList;
    SmallVector<MachineBasicBlock *, 16> EHPadWorkList;
  
+  /// Edges that have already been computed as optimal by the trellis code.
+  DenseMap<const MachineBasicBlock *, MachineBasicBlock *> ComputedTrellisEdges;
+
    /// \brief Machine Function
    MachineFunction *F;
  
@@ -440,6 +450,8 @@ class MachineBlockPlacement : public MachineFunctionPass {
    void buildCFGChains();
    void optimizeBranches();
    void alignBlocks();
+  /// Returns true if a block should be tail-duplicated to increase fallthrough
+  /// opportunities.
    bool shouldTailDuplicate(MachineBasicBlock *BB);
    /// Check the edge frequencies to see if tail duplication will increase
    /// fallthroughs.
@@ -447,6 +459,20 @@ class MachineBlockPlacement : public MachineFunctionPass {
      const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
      BranchProbability AdjustedSumProb,
      const BlockChain &Chain, const BlockFilterSet *BlockFilter);
+  /// Check for a trellis layout.
+  bool isTrellis(const MachineBasicBlock *BB,
+                 const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
+                 const BlockChain &Chain, const BlockFilterSet *BlockFilter);
+  /// Get the best successor given a trellis layout.
+  BlockAndTailDupResult getBestTrellisSuccessor(
+      const MachineBasicBlock *BB,
+      const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
+      BranchProbability AdjustedSumProb, const BlockChain &Chain,
+      const BlockFilterSet *BlockFilter);
+  /// Get the best pair of non-conflicting edges.
+  static std::pair<WeightedEdge, WeightedEdge> getBestNonConflictingEdges(
+      const MachineBasicBlock *BB,
+      SmallVector<SmallVector<WeightedEdge, 8>, 2> &Edges);
    /// Returns true if a block can tail duplicate into all unplaced
    /// predecessors. Filters based on loop.
    bool canTailDuplicateUnplacedPreds(
@@ -614,7 +640,23 @@ getAdjustedProbability(BranchProbability OrigProb,
    return SuccProb;
  }
  
-/// Check if a block should be tail duplicated.
+/// Check if \p BB has exactly the successors in \p Successors.
+static bool
+hasSameSuccessors(MachineBasicBlock &BB,
+                  SmallPtrSetImpl<const MachineBasicBlock *> &Successors) {
+  if (BB.succ_size() != Successors.size())
+    return false;
+  // We don't want to count self-loops
+  if (Successors.count(&BB))
+    return false;
+  for (MachineBasicBlock *Succ : BB.successors())
+    if (!Successors.count(Succ))
+      return false;
+  return true;
+}
+
+/// Check if a block should be tail duplicated to increase fallthrough
+/// opportunities.
  /// \p BB Block to check.
  bool MachineBlockPlacement::shouldTailDuplicate(MachineBasicBlock *BB) {
    // Blocks with single successors don't create additional fallthrough
@@ -724,24 +766,22 @@ bool MachineBlockPlacement::isProfitableToTailDup(
    //    | /       |     C' (+Succ)
    //    Succ      Succ /|
    //    / \       |  \/ |
-  //  U/   =V     =  /= =
+  //  U/   =V     |  == |
    //  /     \     | /  \|
    //  D      E    D     E
    //  '=' : Branch taken for that CFG edge
    //  Cost in the first case is: P + V
    //  For this calculation, we always assume P > Qout. If Qout > P
    //  The result of this function will be ignored at the caller.
-  //  Cost in the second case is: Qout + Qin * V + P * U + P * V
-  //  TODO(iteratee): If we lay out D after Succ, the P * U term
-  //  goes away. This logic is coming in D28522.
+  //  Cost in the second case is: Qout + Qin * U + P * V
  
    if (PDom == nullptr || !Succ->isSuccessor(PDom)) {
      BranchProbability UProb = BestSuccSucc;
      BranchProbability VProb = AdjustedSuccSumProb - UProb;
      BlockFrequency V = SuccFreq * VProb;
-    BlockFrequency QinV = Qin * VProb;
+    BlockFrequency QinU = Qin * UProb;
      BlockFrequency BaseCost = P + V;
-    BlockFrequency DupCost = Qout + QinV + P * AdjustedSuccSumProb;
+    BlockFrequency DupCost = Qout + QinU + P * VProb;
      return greaterWithBias(BaseCost, DupCost, EntryFreq);
    }
    BranchProbability UProb = MBPI->getEdgeProbability(Succ, PDom);
@@ -758,7 +798,7 @@ bool MachineBlockPlacement::isProfitableToTailDup(
    // Succ       Succ /|            Succ        Succ /|
    // | \  V     |  \/ |            | \  V      |  \/ |
    // |U \       |U /\ |            |U =        |U /\ |
-  // =   D      = =  =|            |   D       | =  =|
+  // =   D      = =  \=            |   D       | =  =|
    // |  /       |/    D            |  /        |/    D
    // | /        |    /             | =         |    /
    // |/         |   /              |/          |   =
@@ -768,25 +808,201 @@ bool MachineBlockPlacement::isProfitableToTailDup(
    // The cost in the second case (assuming independence), given the layout:
    // BB, Succ, (C+Succ), D, Dom
    // is Qout + P * V + Qin * U
-  // compare P + U vs Qout + P + Qin * U.
+  // compare P + U vs Qout + P * U + Qin.
    //
    // The 3rd and 4th cases cover when Dom would be chosen to follow Succ.
    //
    // For the 3rd case, the cost is P + 2 * V
    // For the 4th case, the cost is Qout + Qin * U + P * V + V
    // We choose 4 over 3 when (P + V) > Qout + Qin * U + P * V
-  if (UProb > AdjustedSuccSumProb / 2
-      && !hasBetterLayoutPredecessor(Succ, PDom, *BlockToChain[PDom],
-                                     UProb, UProb, Chain, BlockFilter)) {
+  if (UProb > AdjustedSuccSumProb / 2 &&
+      !hasBetterLayoutPredecessor(Succ, PDom, *BlockToChain[PDom], UProb, UProb,
+                                  Chain, BlockFilter))
      // Cases 3 & 4
      return greaterWithBias((P + V), (Qout + Qin * UProb + P * VProb),
                             EntryFreq);
-  }
    // Cases 1 & 2
    return greaterWithBias(
-      (P + U), (Qout + Qin * UProb + P * AdjustedSuccSumProb), EntryFreq);
+      (P + U), (Qout + Qin * AdjustedSuccSumProb + P * UProb), EntryFreq);
+}
+
+/// Check for a trellis layout. \p BB is the upper part of a trellis if its
+/// successors form the lower part of a trellis. A successor set S forms the
+/// lower part of a trellis if all of the predecessors of S are either in S or
+/// have all of S as successors. We ignore trellises where BB doesn't have 2
+/// successors because for fewer than 2, it's trivial, and for 3 or greater they
+/// are very uncommon and complex to compute optimally. Allowing edges within S
+/// is not strictly a trellis, but the same algorithm works, so we allow it.
+bool MachineBlockPlacement::isTrellis(
+    const MachineBasicBlock *BB,
+    const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
+    const BlockChain &Chain, const BlockFilterSet *BlockFilter) {
+  // Technically BB could form a trellis with branching factor higher than 2.
+  // But that's extremely uncommon.
+  if (BB->succ_size() != 2 || ViableSuccs.size() != 2)
+    return false;
+
+  SmallPtrSet<const MachineBasicBlock *, 2> Successors(BB->succ_begin(),
+                                                       BB->succ_end());
+  // To avoid reviewing the same predecessors twice.
+  SmallPtrSet<const MachineBasicBlock *, 8> SeenPreds;
+
+  for (MachineBasicBlock *Succ : ViableSuccs) {
+    int PredCount = 0;
+    for (auto SuccPred : Succ->predecessors()) {
+      // Allow triangle successors, but don't count them.
+      if (Successors.count(SuccPred))
+        continue;
+      const BlockChain *PredChain = BlockToChain[SuccPred];
+      if (SuccPred == BB || (BlockFilter && !BlockFilter->count(SuccPred)) ||
+          PredChain == &Chain || PredChain == BlockToChain[Succ])
+        continue;
+      ++PredCount;
+      // Perform the successor check only once.
+      if (!SeenPreds.insert(SuccPred).second)
+        continue;
+      if (!hasSameSuccessors(*SuccPred, Successors))
+        return false;
+    }
+    // If one of the successors has only BB as a predecessor, it is not a
+    // trellis.
+    if (PredCount < 1)
+      return false;
+  }
+  return true;
  }
  
+/// Pick the highest total weight pair of edges that can both be laid out.
+/// The edges in \p Edges[0] are assumed to have a different destination than
+/// the edges in \p Edges[1]. Simple counting shows that the best pair is either
+/// the individual highest weight edges to the 2 different destinations, or in
+/// case of a conflict, one of them should be replaced with a 2nd best edge.
+std::pair<MachineBlockPlacement::WeightedEdge,
+          MachineBlockPlacement::WeightedEdge>
+MachineBlockPlacement::getBestNonConflictingEdges(
+    const MachineBasicBlock *BB,
+    SmallVector<SmallVector<MachineBlockPlacement::WeightedEdge, 8>, 2>
+        &Edges) {
+  // Sort the edges, and then for each successor, find the best incoming
+  // predecessor. If the best incoming predecessors aren't the same,
+  // then that is clearly the best layout. If there is a conflict, one of the
+  // successors will have to fallthrough from the second best predecessor. We
+  // compare which combination is better overall.
+
+  // Sort for highest frequency.
+  auto Cmp = [](WeightedEdge A, WeightedEdge B) { return A.Weight > B.Weight; };
+
+  std::stable_sort(Edges[0].begin(), Edges[0].end(), Cmp);
+  std::stable_sort(Edges[1].begin(), Edges[1].end(), Cmp);
+  auto BestA = Edges[0].begin();
+  auto BestB = Edges[1].begin();
+  // Arrange for the correct answer to be in BestA and BestB
+  // If the 2 best edges don't conflict, the answer is already there.
+  if (BestA->Src == BestB->Src) {
+    // Compare the total fallthrough of (Best + Second Best) for both pairs
+    auto SecondBestA = std::next(BestA);
+    auto SecondBestB = std::next(BestB);
+    BlockFrequency BestAScore = BestA->Weight + SecondBestB->Weight;
+    BlockFrequency BestBScore = BestB->Weight + SecondBestA->Weight;
+    if (BestAScore < BestBScore)
+      BestA = SecondBestA;
+    else
+      BestB = SecondBestB;
+  }
+  // Arrange for the BB edge to be in BestA if it exists.
+  if (BestB->Src == BB)
+    std::swap(BestA, BestB);
+  return std::make_pair(*BestA, *BestB);
+}
+
+/// Get the best successor from \p BB based on \p BB being part of a trellis.
+/// We only handle trellises with 2 successors, so the algorithm is
+/// straightforward: Find the best pair of edges that don't conflict. We find
+/// the best incoming edge for each successor in the trellis. If those conflict,
+/// we consider which of them should be replaced with the second best.
+/// Upon return the two best edges will be in \p BestEdges. If one of the edges
+/// comes from \p BB, it will be in \p BestEdges[0]
+MachineBlockPlacement::BlockAndTailDupResult
+MachineBlockPlacement::getBestTrellisSuccessor(
+    const MachineBasicBlock *BB,
+    const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
+    BranchProbability AdjustedSumProb, const BlockChain &Chain,
+    const BlockFilterSet *BlockFilter) {
+
+  BlockAndTailDupResult Result = {nullptr, false};
+  SmallPtrSet<const MachineBasicBlock *, 4> Successors(BB->succ_begin(),
+                                                       BB->succ_end());
+
+  // We assume size 2 because it's common. For general n, we would have to do
+  // the Hungarian algorithm, but it's not worth the complexity because more
+  // than 2 successors is fairly uncommon, and a trellis even more so.
+  if (Successors.size() != 2 || ViableSuccs.size() != 2)
+    return Result;
+
+  // Collect the edge frequencies of all edges that form the trellis.
+  SmallVector<SmallVector<WeightedEdge, 8>, 2> Edges(2);
+  int SuccIndex = 0;
+  for (auto Succ : ViableSuccs) {
+    for (MachineBasicBlock *SuccPred : Succ->predecessors()) {
+      // Skip any placed predecessors that are not BB
+      if (SuccPred != BB)
+        if ((BlockFilter && !BlockFilter->count(SuccPred)) ||
+            BlockToChain[SuccPred] == &Chain ||
+            BlockToChain[SuccPred] == BlockToChain[Succ])
+          continue;
+      BlockFrequency EdgeFreq = MBFI->getBlockFreq(SuccPred) *
+                                MBPI->getEdgeProbability(SuccPred, Succ);
+      Edges[SuccIndex].push_back({EdgeFreq, SuccPred, Succ});
+    }
+    ++SuccIndex;
+  }
+
+  // Pick the best combination of 2 edges from all the edges in the trellis.
+  WeightedEdge BestA, BestB;
+  std::tie(BestA, BestB) = getBestNonConflictingEdges(BB, Edges);
+
+  if (BestA.Src != BB) {
+    // If we have a trellis, and BB doesn't have the best fallthrough edges,
+    // we shouldn't choose any successor. We've already looked and there's a
+    // better fallthrough edge for all the successors.
+    DEBUG(dbgs() << "Trellis, but not one of the chosen edges.\n");
+    return Result;
+  }
+
+  // Did we pick the triangle edge? If tail-duplication is profitable, do
+  // that instead. Otherwise merge the triangle edge now while we know it is
+  // optimal.
+  if (BestA.Dest == BestB.Src) {
+    // The edges are BB->Succ1->Succ2, and we're looking to see if BB->Succ2
+    // would be better.
+    MachineBasicBlock *Succ1 = BestA.Dest;
+    MachineBasicBlock *Succ2 = BestB.Dest;
+    // Check to see if tail-duplication would be profitable.
+    if (TailDupPlacement && shouldTailDuplicate(Succ2) &&
+        canTailDuplicateUnplacedPreds(BB, Succ2, Chain, BlockFilter) &&
+        isProfitableToTailDup(BB, Succ2, MBPI->getEdgeProbability(BB, Succ1),
+                              Chain, BlockFilter)) {
+      DEBUG(BranchProbability Succ2Prob = getAdjustedProbability(
+                MBPI->getEdgeProbability(BB, Succ2), AdjustedSumProb);
+            dbgs() << "    Selected: " << getBlockName(Succ2)
+                   << ", probability: " << Succ2Prob << " (Tail Duplicate)\n");
+      Result.BB = Succ2;
+      Result.ShouldTailDup = true;
+      return Result;
+    }
+  }
+  // We have already computed the optimal edge for the other side of the
+  // trellis.
+  ComputedTrellisEdges[BestB.Src] = BestB.Dest;
+
+  auto TrellisSucc = BestA.Dest;
+  DEBUG(BranchProbability SuccProb = getAdjustedProbability(
+            MBPI->getEdgeProbability(BB, TrellisSucc), AdjustedSumProb);
+        dbgs() << "    Selected: " << getBlockName(TrellisSucc)
+               << ", probability: " << SuccProb << " (Trellis)\n");
+  Result.BB = TrellisSucc;
+  return Result;
+}
  
  /// When the option TailDupPlacement is on, this method checks if the
  /// fallthrough candidate block \p Succ (of block \p BB) can be tail-duplicated
@@ -797,6 +1013,9 @@ bool MachineBlockPlacement::canTailDuplicateUnplacedPreds(
    if (!shouldTailDuplicate(Succ))
      return false;
  
+  // For CFG checking.
+  SmallPtrSet<const MachineBasicBlock *, 4> Successors(BB->succ_begin(),
+                                                       BB->succ_end());
    for (MachineBasicBlock *Pred : Succ->predecessors()) {
      // Make sure all unplaced and unfiltered predecessors can be
      // tail-duplicated into.
@@ -804,8 +1023,44 @@ bool MachineBlockPlacement::canTailDuplicateUnplacedPreds(
      if (Pred == BB || (BlockFilter && !BlockFilter->count(Pred))
          || BlockToChain[Pred] == &Chain)
        continue;
-    if (!TailDup.canTailDuplicate(Succ, Pred))
+    if (!TailDup.canTailDuplicate(Succ, Pred)) {
+      if (Successors.size() > 1 && hasSameSuccessors(*Pred, Successors))
+        // This will result in a trellis after tail duplication, so we don't
+        // need to copy Succ into this predecessor. In the presence
+        // of a trellis tail duplication can continue to be profitable.
+        // For example:
+        // A            A
+        // |\           |\
+        // | \          | \
+        // |  C         |  C+BB
+        // | /          |  |
+        // |/           |  |
+        // BB    =>     BB |
+        // |\           |\/|
+        // | \          |/\|
+        // |  D         |  D
+        // | /          | /
+        // |/           |/
+        // Succ         Succ
+        //
+        // After BB was duplicated into C, the layout looks like the one on the
+        // right. BB and C now have the same successors. When considering
+        // whether Succ can be duplicated into all its unplaced predecessors, we
+        // ignore C.
+        // We can do this because C already has a profitable fallthrough, namely
+        // D. TODO(iteratee): ignore sufficiently cold predecessors for
+        // duplication and for this test.
+        //
+        // This allows trellises to be laid out in 2 separate chains
+        // (A,B,Succ,...) and later (C,D,...) This is a reasonable heuristic
+        // because it allows the creation of 2 fallthrough paths with links
+        // between them, and we correctly identify the best layout for these
+        // CFGs. We want to extend trellises that the user created in addition
+        // to trellises created by tail-duplication, so we just look for the
+        // CFG.
+        continue;
        return false;
+    }
    }
    return true;
  }
@@ -990,11 +1245,12 @@ bool MachineBlockPlacement::hasBetterLayoutPredecessor(
    //  |  Pred----|                     |  S1----
    //  |  |                             |       |
    //  --(S1 or S2)                     ---Pred--
+  //                                        |
+  //                                       S2
    //
    // topo-cost = freq(S->Pred) + freq(BB->S1) + freq(BB->S2)
    //    + min(freq(Pred->S1), freq(Pred->S2))
    // Non-topo-order cost:
-  // In the worst case, S2 will not get laid out after Pred.
    // non-topo-cost = 2 * freq(S->Pred) + freq(BB->S2).
    // To be conservative, we can assume that min(freq(Pred->S1), freq(Pred->S2))
    // is 0. Then the non topo layout is better when
@@ -1073,6 +1329,26 @@ MachineBlockPlacement::selectBestSuccessor(
  
    DEBUG(dbgs() << "Selecting best successor for: " << getBlockName(BB) << "\n");
  
+  // if we already precomputed the best successor for BB as part of a trellis we
+  // saw earlier, return that if still applicable.
+  auto FoundEdge = ComputedTrellisEdges.find(BB);
+  if (FoundEdge != ComputedTrellisEdges.end()) {
+    MachineBasicBlock *Succ = FoundEdge->second;
+    ComputedTrellisEdges.erase(FoundEdge);
+    BlockChain *SuccChain = BlockToChain[Succ];
+    if (BB->isSuccessor(Succ) && (!BlockFilter || BlockFilter->count(Succ)) &&
+        SuccChain != &Chain && Succ == *SuccChain->begin()) {
+      BestSucc.BB = Succ;
+      return BestSucc;
+    }
+  }
+
+  // if BB is part of a trellis, Use the trellis to determine the optimal
+  // fallthrough edges
+  if (isTrellis(BB, Successors, Chain, BlockFilter))
+    return getBestTrellisSuccessor(BB, Successors, AdjustedSumProb, Chain,
+                                   BlockFilter);
+
    // For blocks with CFG violations, we may be able to lay them out anyway with
    // tail-duplication. We keep this vector so we can perform the probability
    // calculations the minimum number of times.
diff --git a/test/CodeGen/AArch64/branch-relax-cbz.ll b/test/CodeGen/AArch64/branch-relax-cbz.ll

index c654b94e49cf0e40be8ff3dce7c1753ffe2ca95a..d13c0f677bcb55e1d181ada49369485ca185db06 100644 (file)
--- a/test/CodeGen/AArch64/branch-relax-cbz.ll
+++ b/test/CodeGen/AArch64/branch-relax-cbz.ll
@@ -6,23 +6,22 @@
  
  ; CHECK-NEXT: ; BB#1: ; %b3
  ; CHECK: ldr [[LOAD:w[0-9]+]]
-; CHECK: cbz [[LOAD]], [[SKIP_LONG_B:LBB[0-9]+_[0-9]+]]
-; CHECK-NEXT: b [[B8:LBB[0-9]+_[0-9]+]]
-
-; CHECK-NEXT: [[SKIP_LONG_B]]:
+; CHECK: cbnz [[LOAD]], [[B8:LBB[0-9]+_[0-9]+]]
  ; CHECK-NEXT: b [[B7:LBB[0-9]+_[0-9]+]]
  
+; CHECK-NEXT: [[B8]]: ; %b8
+; CHECK-NEXT: ret
+
  ; CHECK-NEXT: [[B2]]: ; %b2
  ; CHECK: mov w{{[0-9]+}}, #93
  ; CHECK: bl _extfunc
  ; CHECK: cbz w{{[0-9]+}}, [[B7]]
-
-; CHECK-NEXT: [[B8]]: ; %b8
-; CHECK-NEXT: ret
+; CHECK-NEXT: b [[B8]]
  
  ; CHECK-NEXT: [[B7]]: ; %b7
  ; CHECK: mov w{{[0-9]+}}, #13
  ; CHECK: b _extfunc
+
  define void @split_block_no_fallthrough(i64 %val) #0 {
  bb:
    %c0 = icmp sgt i64 %val, -5
diff --git a/test/CodeGen/AArch64/combine-comparisons-by-cse.ll b/test/CodeGen/AArch64/combine-comparisons-by-cse.ll

index 86be3ccea1d80f7a7cdeb855dc48a22d49ec9ac8..8c85fc3c68aba45afcddfed7cbec1146302a1744 100644 (file)
--- a/test/CodeGen/AArch64/combine-comparisons-by-cse.ll
+++ b/test/CodeGen/AArch64/combine-comparisons-by-cse.ll
@@ -264,7 +264,7 @@ declare void @do_something() #1
  define i32 @do_nothing_if_resultant_opcodes_would_differ() #0 {
  ; CHECK-LABEL: do_nothing_if_resultant_opcodes_would_differ
  ; CHECK: cmn
-; CHECK: b.gt
+; CHECK: b.le
  ; CHECK: cmp
  ; CHECK: b.gt
  entry:
diff --git a/test/CodeGen/AArch64/optimize-cond-branch.ll b/test/CodeGen/AArch64/optimize-cond-branch.ll

index 4e3ca6f16e78c703cc003f8ab3b3d30d17d0c3d2..ab4ad5e2ce93daeab0f9e1a8dab952832964483e 100644 (file)
--- a/test/CodeGen/AArch64/optimize-cond-branch.ll
+++ b/test/CodeGen/AArch64/optimize-cond-branch.ll
@@ -11,7 +11,7 @@ target triple = "arm64--"
  ;
  ; CHECK-LABEL: func
  ; CHECK-NOT: and
-; CHECK: tbnz
+; CHECK: tbz
  define void @func() {
    %c0 = icmp sgt i64 0, 0
    br i1 %c0, label %b1, label %b6
diff --git a/test/CodeGen/AMDGPU/basic-branch.ll b/test/CodeGen/AMDGPU/basic-branch.ll

index 104dd45e8a1adc9c24620ae095d8f6f9645c28c5..d4538ee2d2579ae8e0f9e5e6ab846db5ea06352f 100644 (file)
--- a/test/CodeGen/AMDGPU/basic-branch.ll
+++ b/test/CodeGen/AMDGPU/basic-branch.ll
@@ -8,13 +8,10 @@
  ; GCNNOOPT: v_writelane_b32
  ; GCN: s_cbranch_scc1 [[END:BB[0-9]+_[0-9]+]]
  
-
-; GCN: ; BB#1
  ; GCNNOOPT: v_readlane_b32
  ; GCNNOOPT: v_readlane_b32
  ; GCN: buffer_store_dword
-; GCNOPT-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; TODO: This waitcnt can be eliminated
+; GCNNOOPT: s_endpgm
  
  ; GCN: {{^}}[[END]]:
  ; GCN: s_endpgm
diff --git a/test/CodeGen/AMDGPU/branch-relaxation.ll b/test/CodeGen/AMDGPU/branch-relaxation.ll

index 3fd40521801565b7e6d968ceb01177755d61378b..18bbc3e5f07c1603ce8b81e16e8afa12e9b00231 100644 (file)
--- a/test/CodeGen/AMDGPU/branch-relaxation.ll
+++ b/test/CodeGen/AMDGPU/branch-relaxation.ll
@@ -491,7 +491,8 @@ ret:
  
  ; GCN-LABEL: {{^}}long_branch_hang:
  ; GCN: s_cmp_lt_i32 s{{[0-9]+}}, 6
-; GCN-NEXT: s_cbranch_scc0 [[LONG_BR_0:BB[0-9]+_[0-9]+]]
+; GCN-NEXT: s_cbranch_scc1 {{BB[0-9]+_[0-9]+}}
+; GCN-NEXT: s_branch [[LONG_BR_0:BB[0-9]+_[0-9]+]]
  ; GCN-NEXT: BB{{[0-9]+_[0-9]+}}:
  
  ; GCN: s_add_u32 vcc_lo, vcc_lo, [[LONG_BR_DEST0:BB[0-9]+_[0-9]+]]-(
diff --git a/test/CodeGen/AMDGPU/cf-loop-on-constant.ll b/test/CodeGen/AMDGPU/cf-loop-on-constant.ll

index 0d919bbf85e3c05249cde8718227390d23d6c472..03d8cd9eeeafdf593a8f6ae0c55ec2413ffb6729 100644 (file)
--- a/test/CodeGen/AMDGPU/cf-loop-on-constant.ll
+++ b/test/CodeGen/AMDGPU/cf-loop-on-constant.ll
@@ -2,7 +2,7 @@
  ; RUN: llc -march=amdgcn -verify-machineinstrs -O0 < %s
  
  ; GCN-LABEL: {{^}}test_loop:
-; GCN: [[LABEL:BB[0-9+]_[0-9]+]]:
+; GCN: [[LABEL:BB[0-9+]_[0-9]+]]: ; %for.body{{$}}
  ; GCN: ds_read_b32
  ; GCN: ds_write_b32
  ; GCN: s_branch [[LABEL]]
diff --git a/test/CodeGen/AMDGPU/convergent-inlineasm.ll b/test/CodeGen/AMDGPU/convergent-inlineasm.ll

index 755f439c68635d2cf25c10b0dff9ccab9d105ed2..c3d155ce7473c68162f4bb9f2bbffad01f689c3e 100644 (file)
--- a/test/CodeGen/AMDGPU/convergent-inlineasm.ll
+++ b/test/CodeGen/AMDGPU/convergent-inlineasm.ll
@@ -29,6 +29,7 @@ bb5:                                              ; preds = %bb3, %bb
  ; GCN: v_cmp_ne_u32_e64
  
  ; GCN: BB{{[0-9]+_[0-9]+}}:
+
  define void @nonconvergent_inlineasm(i64 addrspace(1)* nocapture %arg) {
  bb:
    %tmp = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/test/CodeGen/AMDGPU/salu-to-valu.ll b/test/CodeGen/AMDGPU/salu-to-valu.ll

index 37083fbbd3c5e23d2ff8a78556e45a24e7e95d66..9c153841a63c8d808b5d33917c39db50eabbaf3e 100644 (file)
--- a/test/CodeGen/AMDGPU/salu-to-valu.ll
+++ b/test/CodeGen/AMDGPU/salu-to-valu.ll
@@ -439,7 +439,7 @@ entry:
  ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
  ; GCN-NOHSA: buffer_store_dword [[ONE]]
  ; GCN-HSA: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[ONE]]
-; GCN; {{^}}[[EXIT]]:
+; GCN: {{^}}[[EXIT]]:
  ; GCN: s_endpgm
  define void @sopc_vopc_legalize_bug(i32 %cond, i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
  bb3:                                              ; preds = %bb2
diff --git a/test/CodeGen/ARM/2007-05-22-tailmerge-3.ll b/test/CodeGen/ARM/2007-05-22-tailmerge-3.ll

index 52cc37e2408453ee85d37deac39741613ccb20ef..b8f2980be7502bd2577885b78bdf66954e233c72 100644 (file)
--- a/test/CodeGen/ARM/2007-05-22-tailmerge-3.ll
+++ b/test/CodeGen/ARM/2007-05-22-tailmerge-3.ll
@@ -12,11 +12,11 @@
  ; CHECK: bl _quux
  ; CHECK-NOT: bl _quux
  
-; NOMERGE: bl _baz
-; NOMERGE: bl _baz
+; NOMERGE-DAG: bl _baz
+; NOMERGE-DAG: bl _baz
  
-; NOMERGE: bl _quux
-; NOMERGE: bl _quux
+; NOMERGE-DAG: bl _quux
+; NOMERGE-DAG: bl _quux
  
  ; ModuleID = 'tail.c'
  target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64"
diff --git a/test/CodeGen/ARM/atomic-cmpxchg.ll b/test/CodeGen/ARM/atomic-cmpxchg.ll

index 364bd5d13691e9b8d0038ef840043727bd8d35fe..8a82af6a69b702acd4c891f8e1016b56183a5acf 100644 (file)
--- a/test/CodeGen/ARM/atomic-cmpxchg.ll
+++ b/test/CodeGen/ARM/atomic-cmpxchg.ll
@@ -66,14 +66,14 @@ entry:
  ; CHECK-ARMV7-NEXT: [[HEAD:.LBB[0-9_]+]]:
  ; CHECK-ARMV7-NEXT: strexb [[SUCCESS:r[0-9]+]], r2, [r0]
  ; CHECK-ARMV7-NEXT: cmp [[SUCCESS]], #0
-; CHECK-ARMV7-NEXT: moveq [[RES:r[0-9]+]], #1
+; CHECK-ARMV7-NEXT: moveq r0, #1
  ; CHECK-ARMV7-NEXT: bxeq lr
  ; CHECK-ARMV7-NEXT: [[TRY]]:
-; CHECK-ARMV7-NEXT: ldrexb [[LD:r[0-9]+]], [r0]
-; CHECK-ARMV7-NEXT: cmp [[LD]], [[DESIRED]]
+; CHECK-ARMV7-NEXT: ldrexb [[SUCCESS]], [r0]
+; CHECK-ARMV7-NEXT: cmp [[SUCCESS]], r1
  ; CHECK-ARMV7-NEXT: beq [[HEAD]]
  ; CHECK-ARMV7-NEXT: clrex
-; CHECK-ARMV7-NEXT: mov [[RES]], #0
+; CHECK-ARMV7-NEXT: mov r0, #0
  ; CHECK-ARMV7-NEXT: bx lr
  
  ; CHECK-THUMBV7-LABEL: test_cmpxchg_res_i8:
diff --git a/test/CodeGen/ARM/fold-stack-adjust.ll b/test/CodeGen/ARM/fold-stack-adjust.ll

index 442459bc0582c984245bb58f8dbf15f62420f319..eb32ee54c09594c5059f1853adc64b880bbf8122 100644 (file)
--- a/test/CodeGen/ARM/fold-stack-adjust.ll
+++ b/test/CodeGen/ARM/fold-stack-adjust.ll
@@ -135,7 +135,7 @@ define void @test_fold_point(i1 %tst) minsize {
  
    ; Important to check for beginning of basic block, because if it gets
    ; if-converted the test is probably no longer checking what it should.
-; CHECK: {{LBB[0-9]+_2}}:
+; CHECK: %end
  ; CHECK-NEXT: vpop {d7, d8}
  ; CHECK-NEXT: pop {r4, pc}
  
diff --git a/test/CodeGen/PowerPC/tail-dup-break-cfg.ll b/test/CodeGen/PowerPC/tail-dup-break-cfg.ll

index 4be737e89a91270ed819c41746eab164dfa4cc2f..f19b11f2ae4ca653d09c27ae205350971ca34162 100644 (file)
--- a/test/CodeGen/PowerPC/tail-dup-break-cfg.ll
+++ b/test/CodeGen/PowerPC/tail-dup-break-cfg.ll
@@ -16,14 +16,14 @@ target triple = "powerpc64le-grtev4-linux-gnu"
  ;CHECK-NEXT: bc 12, 1, [[BODY1LABEL:[._0-9A-Za-z]+]]
  ;CHECK-NEXT: # %test2
  ;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30
-;CHECK-NEXT: beq 0, [[EXITLABEL:[._0-9A-Za-z]+]]
-;CHECK-NEXT: b [[BODY2LABEL:[._0-9A-Za-z]+]]
+;CHECK-NEXT: bne 0, [[BODY2LABEL:[._0-9A-Za-z]+]]
+;CHECK: [[EXITLABEL:[._0-9A-Za-z]+]]: # %exit
+;CHECK: blr
  ;CHECK-NEXT: [[BODY1LABEL]]
  ;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30
  ;CHECK-NEXT: beq 0, [[EXITLABEL]]
-;CHECK-NEXT: [[BODY2LABEL]]
-;CHECK: [[EXITLABEL:[._0-9A-Za-z]+]]: # %exit
-;CHECK: blr
+;CHECK-NEXT: [[BODY2LABEL:[._0-9A-Za-z]+]]:
+;CHECK: b [[EXITLABEL]]
  define void @tail_dup_break_cfg(i32 %tag) {
  entry:
    br label %test1
@@ -79,7 +79,7 @@ body1:
  test2:
    %tagbit2 = and i32 %tag, 2
    %tagbit2eq0 = icmp ne i32 %tagbit2, 0
-  br i1 %tagbit2eq0, label %body2, label %exit, !prof !1 ; %body2 more likely
+  br i1 %tagbit2eq0, label %body2, label %exit, !prof !3 ; %body2 more likely
  body2:
    call void @b()
    call void @b()
@@ -137,4 +137,4 @@ ret:
  
  !1 = !{!"branch_weights", i32 5, i32 3}
  !2 = !{!"branch_weights", i32 95, i32 5}
-!3 = !{!"branch_weights", i32 7, i32 3}
+!3 = !{!"branch_weights", i32 8, i32 3}
diff --git a/test/CodeGen/PowerPC/tail-dup-layout.ll b/test/CodeGen/PowerPC/tail-dup-layout.ll

index 6790aa8e944147453d986065ddae3ce9d2525548..95c23ade5130950a547804cb837955412293c8d5 100644 (file)
--- a/test/CodeGen/PowerPC/tail-dup-layout.ll
+++ b/test/CodeGen/PowerPC/tail-dup-layout.ll
@@ -1,59 +1,59 @@
-; RUN: llc -outline-optional-branches -O2 < %s | FileCheck %s
+; RUN: llc -O2 < %s | FileCheck %s
  target datalayout = "e-m:e-i64:64-n32:64"
  target triple = "powerpc64le-grtev4-linux-gnu"
  
  ; Intended layout:
-; The outlining flag produces the layout
+; The chain-based outlining produces the layout
  ; test1
  ; test2
  ; test3
  ; test4
-; exit
  ; optional1
  ; optional2
  ; optional3
  ; optional4
+; exit
  ; Tail duplication puts test n+1 at the end of optional n
  ; so optional1 includes a copy of test2 at the end, and branches
  ; to test3 (at the top) or falls through to optional 2.
-; The CHECK statements check for the whole string of tests and exit block,
+; The CHECK statements check for the whole string of tests
  ; and then check that the correct test has been duplicated into the end of
  ; the optional blocks and that the optional blocks are in the correct order.
-;CHECK-LABEL: f:
+;CHECK-LABEL: straight_test:
  ; test1 may have been merged with entry
  ;CHECK: mr [[TAGREG:[0-9]+]], 3
  ;CHECK: andi. {{[0-9]+}}, [[TAGREG]], 1
-;CHECK-NEXT: bc 12, 1, [[OPT1LABEL:[._0-9A-Za-z]+]]
-;CHECK-NEXT: [[TEST2LABEL:[._0-9A-Za-z]+]]: # %test2
+;CHECK-NEXT: bc 12, 1, .[[OPT1LABEL:[_0-9A-Za-z]+]]
+;CHECK-NEXT: # %test2
  ;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30
-;CHECK-NEXT: bne 0, [[OPT2LABEL:[._0-9A-Za-z]+]]
-;CHECK-NEXT: [[TEST3LABEL:[._0-9A-Za-z]+]]: # %test3
+;CHECK-NEXT: bne 0, .[[OPT2LABEL:[_0-9A-Za-z]+]]
+;CHECK-NEXT: .[[TEST3LABEL:[_0-9A-Za-z]+]]: # %test3
  ;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 29, 29
-;CHECK-NEXT: bne 0, .[[OPT3LABEL:[._0-9A-Za-z]+]]
-;CHECK-NEXT: [[TEST4LABEL:[._0-9A-Za-z]+]]: # %test4
+;CHECK-NEXT: bne 0, .[[OPT3LABEL:[_0-9A-Za-z]+]]
+;CHECK-NEXT: .[[TEST4LABEL:[_0-9A-Za-z]+]]: # %test4
  ;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 28, 28
-;CHECK-NEXT: bne 0, .[[OPT4LABEL:[._0-9A-Za-z]+]]
-;CHECK-NEXT: [[EXITLABEL:[._0-9A-Za-z]+]]: # %exit
+;CHECK-NEXT: bne 0, .[[OPT4LABEL:[_0-9A-Za-z]+]]
+;CHECK-NEXT: .[[EXITLABEL:[_0-9A-Za-z]+]]: # %exit
  ;CHECK: blr
-;CHECK-NEXT: [[OPT1LABEL]]
+;CHECK-NEXT: .[[OPT1LABEL]]:
  ;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30
-;CHECK-NEXT: beq 0, [[TEST3LABEL]]
-;CHECK-NEXT: [[OPT2LABEL]]
+;CHECK-NEXT: beq 0, .[[TEST3LABEL]]
+;CHECK-NEXT: .[[OPT2LABEL]]:
  ;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 29, 29
-;CHECK-NEXT: beq 0, [[TEST4LABEL]]
-;CHECK-NEXT: [[OPT3LABEL]]
+;CHECK-NEXT: beq 0, .[[TEST4LABEL]]
+;CHECK-NEXT: .[[OPT3LABEL]]:
  ;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 28, 28
-;CHECK-NEXT: beq 0, [[EXITLABEL]]
-;CHECK-NEXT: [[OPT4LABEL]]
-;CHECK: b [[EXITLABEL]]
+;CHECK-NEXT: beq 0, .[[EXITLABEL]]
+;CHECK-NEXT: .[[OPT4LABEL]]:
+;CHECK: b .[[EXITLABEL]]
  
-define void @f(i32 %tag) {
+define void @straight_test(i32 %tag) {
  entry:
    br label %test1
  test1:
    %tagbit1 = and i32 %tag, 1
    %tagbit1eq0 = icmp eq i32 %tagbit1, 0
-  br i1 %tagbit1eq0, label %test2, label %optional1
+  br i1 %tagbit1eq0, label %test2, label %optional1, !prof !1
  optional1:
    call void @a()
    call void @a()
@@ -63,7 +63,7 @@ optional1:
  test2:
    %tagbit2 = and i32 %tag, 2
    %tagbit2eq0 = icmp eq i32 %tagbit2, 0
-  br i1 %tagbit2eq0, label %test3, label %optional2
+  br i1 %tagbit2eq0, label %test3, label %optional2, !prof !1
  optional2:
    call void @b()
    call void @b()
@@ -73,7 +73,7 @@ optional2:
  test3:
    %tagbit3 = and i32 %tag, 4
    %tagbit3eq0 = icmp eq i32 %tagbit3, 0
-  br i1 %tagbit3eq0, label %test4, label %optional3
+  br i1 %tagbit3eq0, label %test4, label %optional3, !prof !1
  optional3:
    call void @c()
    call void @c()
@@ -83,7 +83,7 @@ optional3:
  test4:
    %tagbit4 = and i32 %tag, 8
    %tagbit4eq0 = icmp eq i32 %tagbit4, 0
-  br i1 %tagbit4eq0, label %exit, label %optional4
+  br i1 %tagbit4eq0, label %exit, label %optional4, !prof !1
  optional4:
    call void @d()
    call void @d()
@@ -94,7 +94,333 @@ exit:
    ret void
  }
  
+; Intended layout:
+; The chain-based outlining produces the layout
+; entry
+; --- Begin loop ---
+; for.latch
+; for.check
+; test1
+; test2
+; test3
+; test4
+; optional1
+; optional2
+; optional3
+; optional4
+; --- End loop ---
+; exit
+; The CHECK statements check for the whole string of tests and exit block,
+; and then check that the correct test has been duplicated into the end of
+; the optional blocks and that the optional blocks are in the correct order.
+;CHECK-LABEL: loop_test:
+;CHECK: add [[TAGPTRREG:[0-9]+]], 3, 4
+;CHECK: .[[LATCHLABEL:[._0-9A-Za-z]+]]: # %for.latch
+;CHECK: addi
+;CHECK: .[[CHECKLABEL:[._0-9A-Za-z]+]]: # %for.check
+;CHECK: lwz [[TAGREG:[0-9]+]], 0([[TAGPTRREG]])
+;CHECK: # %test1
+;CHECK: andi. {{[0-9]+}}, [[TAGREG]], 1
+;CHECK-NEXT: bc 12, 1, .[[OPT1LABEL:[._0-9A-Za-z]+]]
+;CHECK-NEXT: # %test2
+;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30
+;CHECK-NEXT: bne 0, .[[OPT2LABEL:[._0-9A-Za-z]+]]
+;CHECK-NEXT: .[[TEST3LABEL:[._0-9A-Za-z]+]]: # %test3
+;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 29, 29
+;CHECK-NEXT: bne 0, .[[OPT3LABEL:[._0-9A-Za-z]+]]
+;CHECK-NEXT: .[[TEST4LABEL:[._0-9A-Za-z]+]]: # %{{(test4|optional3)}}
+;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 28, 28
+;CHECK-NEXT: beq 0, .[[LATCHLABEL]]
+;CHECK-NEXT: b .[[OPT4LABEL:[._0-9A-Za-z]+]]
+;CHECK: [[OPT1LABEL]]
+;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30
+;CHECK-NEXT: beq 0, .[[TEST3LABEL]]
+;CHECK-NEXT: .[[OPT2LABEL]]
+;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 29, 29
+;CHECK-NEXT: beq 0, .[[TEST4LABEL]]
+;CHECK-NEXT: .[[OPT3LABEL]]
+;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 28, 28
+;CHECK-NEXT: beq 0, .[[LATCHLABEL]]
+;CHECK: [[OPT4LABEL]]:
+;CHECK: b .[[LATCHLABEL]]
+define void @loop_test(i32* %tags, i32 %count) {
+entry:
+  br label %for.check
+for.check:
+  %count.loop = phi i32 [%count, %entry], [%count.sub, %for.latch]
+  %done.count = icmp ugt i32 %count.loop, 0
+  %tag_ptr = getelementptr inbounds i32, i32* %tags, i32 %count
+  %tag = load i32, i32* %tag_ptr
+  %done.tag = icmp eq i32 %tag, 0
+  %done = and i1 %done.count, %done.tag
+  br i1 %done, label %test1, label %exit, !prof !1
+test1:
+  %tagbit1 = and i32 %tag, 1
+  %tagbit1eq0 = icmp eq i32 %tagbit1, 0
+  br i1 %tagbit1eq0, label %test2, label %optional1, !prof !1
+optional1:
+  call void @a()
+  call void @a()
+  call void @a()
+  call void @a()
+  br label %test2
+test2:
+  %tagbit2 = and i32 %tag, 2
+  %tagbit2eq0 = icmp eq i32 %tagbit2, 0
+  br i1 %tagbit2eq0, label %test3, label %optional2, !prof !1
+optional2:
+  call void @b()
+  call void @b()
+  call void @b()
+  call void @b()
+  br label %test3
+test3:
+  %tagbit3 = and i32 %tag, 4
+  %tagbit3eq0 = icmp eq i32 %tagbit3, 0
+  br i1 %tagbit3eq0, label %test4, label %optional3, !prof !1
+optional3:
+  call void @c()
+  call void @c()
+  call void @c()
+  call void @c()
+  br label %test4
+test4:
+  %tagbit4 = and i32 %tag, 8
+  %tagbit4eq0 = icmp eq i32 %tagbit4, 0
+  br i1 %tagbit4eq0, label %for.latch, label %optional4, !prof !1
+optional4:
+  call void @d()
+  call void @d()
+  call void @d()
+  call void @d()
+  br label %for.latch
+for.latch:
+  %count.sub = sub i32 %count.loop, 1
+  br label %for.check
+exit:
+  ret void
+}
+
+; The block then2 is not unavoidable, meaning it does not dominate the exit.
+; But since it can be tail-duplicated, it should be placed as a fallthrough from
+; test2 and copied. The purpose here is to make sure that the tail-duplication
+; code is independent of the outlining code, which works by choosing the
+; "unavoidable" blocks.
+; CHECK-LABEL: avoidable_test:
+; CHECK: # %entry
+; CHECK: andi.
+; CHECK: # %test2
+; Make sure then2 falls through from test2
+; CHECK-NOT: # %{{[-_a-zA-Z0-9]+}}
+; CHECK: # %then2
+; CHECK: rlwinm. {{[0-9]+}}, {{[0-9]+}}, 0, 29, 29
+; CHECK: # %else1
+; CHECK: bl a
+; CHECK: bl a
+; Make sure then2 was copied into else1
+; CHECK: rlwinm. {{[0-9]+}}, {{[0-9]+}}, 0, 29, 29
+; CHECK: # %end1
+; CHECK: bl d
+; CHECK: # %else2
+; CHECK: bl c
+; CHECK: # %end2
+define void @avoidable_test(i32 %tag) {
+entry:
+  br label %test1
+test1:
+  %tagbit1 = and i32 %tag, 1
+  %tagbit1eq0 = icmp eq i32 %tagbit1, 0
+  br i1 %tagbit1eq0, label %test2, label %else1, !prof !1 ; %test2 more likely
+else1:
+  call void @a()
+  call void @a()
+  br label %then2
+test2:
+  %tagbit2 = and i32 %tag, 2
+  %tagbit2eq0 = icmp eq i32 %tagbit2, 0
+  br i1 %tagbit2eq0, label %then2, label %else2, !prof !1 ; %then2 more likely
+then2:
+  %tagbit3 = and i32 %tag, 4
+  %tagbit3eq0 = icmp eq i32 %tagbit3, 0
+  br i1 %tagbit3eq0, label %end2, label %end1, !prof !1 ; %end2 more likely
+else2:
+  call void @c()
+  br label %end2
+end2:
+  ret void
+end1:
+  call void @d()
+  ret void
+}
+
+; CHECK-LABEL: trellis_test
+; The number in the block labels is the expected block frequency given the
+; probabilities annotated. There is a conflict in the b;c->d;e trellis that
+; should be resolved as c->e;b->d.
+; The d;e->f;g trellis should be resolved as e->g;d->f.
+; The f;g->h;i trellis should be resolved as f->i;g->h.
+; The h;i->j;ret trellis contains a triangle edge, and should be resolved as
+; h->j->ret
+; CHECK: # %entry
+; CHECK: # %c10
+; CHECK: # %e9
+; CHECK: # %g10
+; CHECK: # %h10
+; CHECK: # %j8
+; CHECK: # %ret
+; CHECK: # %b6
+; CHECK: # %d7
+; CHECK: # %f6
+; CHECK: # %i6
+define void @trellis_test(i32 %tag) {
+entry:
+  br label %a16
+a16:
+  call void @a()
+  call void @a()
+  %tagbits.a = and i32 %tag, 3
+  %tagbits.a.eq0 = icmp eq i32 %tagbits.a, 0
+  br i1 %tagbits.a.eq0, label %c10, label %b6, !prof !1 ; 10 to 6
+c10:
+  call void @c()
+  call void @c()
+  %tagbits.c = and i32 %tag, 12
+  %tagbits.c.eq0 = icmp eq i32 %tagbits.c, 0
+  ; Both of these edges should be hotter than the other incoming edge
+  ; for e9 or d7
+  br i1 %tagbits.c.eq0, label %e9, label %d7, !prof !3 ; 6 to 4
+e9:
+  call void @e()
+  call void @e()
+  %tagbits.e = and i32 %tag, 48
+  %tagbits.e.eq0 = icmp eq i32 %tagbits.e, 0
+  br i1 %tagbits.e.eq0, label %g10, label %f6, !prof !4 ; 7 to 2
+g10:
+  call void @g()
+  call void @g()
+  %tagbits.g = and i32 %tag, 192
+  %tagbits.g.eq0 = icmp eq i32 %tagbits.g, 0
+  br i1 %tagbits.g.eq0, label %i6, label %h10, !prof !5 ; 2 to 8
+i6:
+  call void @i()
+  call void @i()
+  %tagbits.i = and i32 %tag, 768
+  %tagbits.i.eq0 = icmp eq i32 %tagbits.i, 0
+  br i1 %tagbits.i.eq0, label %ret, label %j8, !prof !2 ; balanced (3 to 3)
+b6:
+  call void @b()
+  call void @b()
+  %tagbits.b = and i32 %tag, 12
+  %tagbits.b.eq1 = icmp eq i32 %tagbits.b, 8
+  br i1 %tagbits.b.eq1, label %e9, label %d7, !prof !2 ; balanced (3 to 3)
+d7:
+  call void @d()
+  call void @d()
+  %tagbits.d = and i32 %tag, 48
+  %tagbits.d.eq1 = icmp eq i32 %tagbits.d, 32
+  br i1 %tagbits.d.eq1, label %g10, label %f6, !prof !6 ; 3 to 4
+f6:
+  call void @f()
+  call void @f()
+  %tagbits.f = and i32 %tag, 192
+  %tagbits.f.eq1 = icmp eq i32 %tagbits.f, 128
+  br i1 %tagbits.f.eq1, label %i6, label %h10, !prof !7 ; 4 to 2
+h10:
+  call void @h()
+  call void @h()
+  %tagbits.h = and i32 %tag, 768
+  %tagbits.h.eq1 = icmp eq i32 %tagbits.h, 512
+  br i1 %tagbits.h.eq1, label %ret, label %j8, !prof !2 ; balanced (5 to 5)
+j8:
+  call void @j()
+  call void @j()
+  br label %ret
+ret:
+  ret void
+}
+
+; Verify that we still consider tail-duplication opportunities if we find a
+; triangle trellis. Here D->F->G is the triangle, and D;E are both predecessors
+; of both F and G. The basic trellis algorithm picks the F->G edge, but after
+; checking, it's profitable to duplicate G into F. The weights here are not
+; really important. They are there to help make the test stable.
+; CHECK-LABEL: trellis_then_dup_test
+; CHECK: # %entry
+; CHECK: # %b
+; CHECK: # %d
+; CHECK: # %g
+; CHECK: # %ret1
+; CHECK: # %c
+; CHECK: # %e
+; CHECK: # %f
+; CHECK: # %ret2
+; CHECK: # %ret
+define void @trellis_then_dup_test(i32 %tag) {
+entry:
+  br label %a
+a:
+  call void @a()
+  call void @a()
+  %tagbits.a = and i32 %tag, 3
+  %tagbits.a.eq0 = icmp eq i32 %tagbits.a, 0
+  br i1 %tagbits.a.eq0, label %b, label %c, !prof !1 ; 5 to 3
+b:
+  call void @b()
+  call void @b()
+  %tagbits.b = and i32 %tag, 12
+  %tagbits.b.eq1 = icmp eq i32 %tagbits.b, 8
+  br i1 %tagbits.b.eq1, label %d, label %e, !prof !1 ; 5 to 3
+d:
+  call void @d()
+  call void @d()
+  %tagbits.d = and i32 %tag, 48
+  %tagbits.d.eq1 = icmp eq i32 %tagbits.d, 32
+  br i1 %tagbits.d.eq1, label %g, label %f, !prof !1 ; 5 to 3
+f:
+  call void @f()
+  call void @f()
+  br label %g
+g:
+  %tagbits.g = and i32 %tag, 192
+  %tagbits.g.eq0 = icmp eq i32 %tagbits.g, 0
+  br i1 %tagbits.g.eq0, label %ret1, label %ret2, !prof !2 ; balanced
+c:
+  call void @c()
+  call void @c()
+  %tagbits.c = and i32 %tag, 12
+  %tagbits.c.eq0 = icmp eq i32 %tagbits.c, 0
+  br i1 %tagbits.c.eq0, label %d, label %e, !prof !1 ; 5 to 3
+e:
+  call void @e()
+  call void @e()
+  %tagbits.e = and i32 %tag, 48
+  %tagbits.e.eq0 = icmp eq i32 %tagbits.e, 0
+  br i1 %tagbits.e.eq0, label %g, label %f, !prof !1 ; 5 to 3
+ret1:
+  call void @a()
+  br label %ret
+ret2:
+  call void @b()
+  br label %ret
+ret:
+  ret void
+}
+
  declare void @a()
  declare void @b()
  declare void @c()
  declare void @d()
+declare void @e()
+declare void @f()
+declare void @g()
+declare void @h()
+declare void @i()
+declare void @j()
+
+!1 = !{!"branch_weights", i32 5, i32 3}
+!2 = !{!"branch_weights", i32 50, i32 50}
+!3 = !{!"branch_weights", i32 6, i32 4}
+!4 = !{!"branch_weights", i32 7, i32 2}
+!5 = !{!"branch_weights", i32 2, i32 8}
+!6 = !{!"branch_weights", i32 3, i32 4}
+!7 = !{!"branch_weights", i32 4, i32 2}
diff --git a/test/CodeGen/SPARC/sjlj.ll b/test/CodeGen/SPARC/sjlj.ll

index 647d8f2fd2cc2efbae369d8b2429aceef8ea0c30..459630f9255fc92818d93ffff75fcb9b26f7027e 100755 (executable)
--- a/test/CodeGen/SPARC/sjlj.ll
+++ b/test/CodeGen/SPARC/sjlj.ll
@@ -67,14 +67,18 @@ return:                                           ; preds = %if.end, %if.then
  ; CHECK:  nop
  ; CHECK:.LBB1_1:                                ! %entry
  ; CHECK:  mov  %g0, %i0
+; CHECK:                                        ! %entry
  ; CHECK:  cmp %i0, 0
-; CHECK:  bne  .LBB1_4
-; CHECK:  ba   .LBB1_5
-; CHECK:.LBB1_2:                                ! Block address taken
-; CHECK:  mov  1, %i0
  ; CHECK:  be   .LBB1_5
+; CHECK:  nop
  ; CHECK:.LBB1_4:
-; CHECK:  ba   .LBB1_6
+; CHECK:  mov  1, %i0
+; CHECK:  ba .LBB1_6
+; CHECK:.LBB1_2:                                ! Block address taken
+; CHECK:  mov  1, %i0
+; CHECK:  cmp %i0, 0
+; CHECK:  bne  .LBB1_4
+; CHECK:  nop
  }
  declare i8* @llvm.frameaddress(i32) #2
  
diff --git a/test/CodeGen/WebAssembly/mem-intrinsics.ll b/test/CodeGen/WebAssembly/mem-intrinsics.ll

index 0ac1e1e182cd9b737e91a0e21b17eec9bf1d77fa..0e2f06b70b8b47da51dd49a3b3917b73b22c93bb 100644 (file)
--- a/test/CodeGen/WebAssembly/mem-intrinsics.ll
+++ b/test/CodeGen/WebAssembly/mem-intrinsics.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -tail-dup-placement=0| FileCheck %s
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -tail-dup-placement=0 | FileCheck %s
  
  ; Test memcpy, memmove, and memset intrinsics.
  
diff --git a/test/CodeGen/X86/block-placement.ll b/test/CodeGen/X86/block-placement.ll

index 807dfe464cbeabb5f1f0bd21d16082263fa53855..4969b1886977485f91b7fcd027f4da74f11c9e3a 100644 (file)
--- a/test/CodeGen/X86/block-placement.ll
+++ b/test/CodeGen/X86/block-placement.ll
@@ -314,7 +314,7 @@ exit:
  define void @unnatural_cfg1() {
  ; Test that we can handle a loop with an inner unnatural loop at the end of
  ; a function. This is a gross CFG reduced out of the single source GCC.
-; CHECK: unnatural_cfg1
+; CHECK-LABEL: unnatural_cfg1
  ; CHECK: %entry
  ; CHECK: %loop.body1
  ; CHECK: %loop.body2
@@ -352,17 +352,15 @@ define void @unnatural_cfg2() {
  ; Test that we can handle a loop with a nested natural loop *and* an unnatural
  ; loop. This was reduced from a crash on block placement when run over
  ; single-source GCC.
-; CHECK: unnatural_cfg2
+; CHECK-LABEL: unnatural_cfg2
  ; CHECK: %entry
  ; CHECK: %loop.body1
  ; CHECK: %loop.body2
-; CHECK: %loop.body3
-; CHECK: %loop.inner1.begin
-; The end block is folded with %loop.body3...
-; CHECK-NOT: %loop.inner1.end
  ; CHECK: %loop.body4
  ; CHECK: %loop.inner2.begin
-; The loop.inner2.end block is folded
+; CHECK: %loop.inner2.begin
+; CHECK: %loop.body3
+; CHECK: %loop.inner1.begin
  ; CHECK: %loop.header
  ; CHECK: %bail
  
@@ -559,7 +557,7 @@ define void @test_eh_lpad_successor() personality i8* bitcast (i32 (...)* @__gxx
  ; didn't correctly locate the fallthrough successor, assuming blindly that the
  ; first one was the fallthrough successor. As a result, we would add an
  ; erroneous jump to the landing pad thinking *that* was the default successor.
-; CHECK: test_eh_lpad_successor
+; CHECK-LABEL: test_eh_lpad_successor
  ; CHECK: %entry
  ; CHECK-NOT: jmp
  ; CHECK: %loop
@@ -587,7 +585,7 @@ define void @test_eh_throw() personality i8* bitcast (i32 (...)* @__gxx_personal
  ; fallthrough simply won't occur. Make sure we don't crash trying to update
  ; terminators for such constructs.
  ;
-; CHECK: test_eh_throw
+; CHECK-LABEL: test_eh_throw
  ; CHECK: %entry
  ; CHECK: %cleanup
  
@@ -609,7 +607,7 @@ define void @test_unnatural_cfg_backwards_inner_loop() {
  ; attempt to merge onto the wrong end of the inner loop just because we find it
  ; first. This was reduced from a crasher in GCC's single source.
  ;
-; CHECK: test_unnatural_cfg_backwards_inner_loop
+; CHECK-LABEL: test_unnatural_cfg_backwards_inner_loop
  ; CHECK: %entry
  ; CHECK: %loop2b
  ; CHECK: %loop1
@@ -649,7 +647,7 @@ define void @unanalyzable_branch_to_loop_header() {
  ; fallthrough because that happens to always produce unanalyzable branches on
  ; x86.
  ;
-; CHECK: unanalyzable_branch_to_loop_header
+; CHECK-LABEL: unanalyzable_branch_to_loop_header
  ; CHECK: %entry
  ; CHECK: %loop
  ; CHECK: %exit
@@ -673,7 +671,7 @@ define void @unanalyzable_branch_to_best_succ(i1 %cond) {
  ; This branch is now analyzable and hence the destination block becomes the
  ; hotter one. The right order is entry->bar->exit->foo.
  ;
-; CHECK: unanalyzable_branch_to_best_succ
+; CHECK-LABEL: unanalyzable_branch_to_best_succ
  ; CHECK: %entry
  ; CHECK: %bar
  ; CHECK: %exit
@@ -699,7 +697,7 @@ define void @unanalyzable_branch_to_free_block(float %x) {
  ; Ensure that we can handle unanalyzable branches where the destination block
  ; gets selected as the best free block in the CFG.
  ;
-; CHECK: unanalyzable_branch_to_free_block
+; CHECK-LABEL: unanalyzable_branch_to_free_block
  ; CHECK: %entry
  ; CHECK: %a
  ; CHECK: %b
@@ -729,7 +727,7 @@ define void @many_unanalyzable_branches() {
  ; Ensure that we don't crash as we're building up many unanalyzable branches,
  ; blocks, and loops.
  ;
-; CHECK: many_unanalyzable_branches
+; CHECK-LABEL: many_unanalyzable_branches
  ; CHECK: %entry
  ; CHECK: %exit
  
@@ -948,7 +946,7 @@ define void @benchmark_heapsort(i32 %n, double* nocapture %ra) {
  ;    strange layouts that are siginificantly less efficient, often times maing
  ;    it discontiguous.
  ;
-; CHECK: @benchmark_heapsort
+; CHECK-LABEL: @benchmark_heapsort
  ; CHECK: %entry
  ; First rotated loop top.
  ; CHECK: .p2align
diff --git a/test/CodeGen/X86/bypass-slow-division-32.ll b/test/CodeGen/X86/bypass-slow-division-32.ll

index ea545d22385c14e07ad78bf9f5a28e9425f41dda..9f266647d8aa22b522f70d91dea3618026afb11a 100644 (file)
--- a/test/CodeGen/X86/bypass-slow-division-32.ll
+++ b/test/CodeGen/X86/bypass-slow-division-32.ll
@@ -95,20 +95,19 @@ define i32 @Test_use_div_and_idiv(i32 %a, i32 %b) nounwind {
  ; CHECK-NEXT:    idivl %ebx
  ; CHECK-NEXT:    movl %eax, %esi
  ; CHECK-NEXT:    testl $-256, %edi
-; CHECK-NEXT:    jne .LBB3_5
-; CHECK-NEXT:    jmp .LBB3_4
-; CHECK-NEXT:  .LBB3_1:
-; CHECK-NEXT:    movzbl %cl, %eax
-; CHECK-NEXT:    # kill: %EAX<def> %EAX<kill> %AX<def>
-; CHECK-NEXT:    divb %bl
-; CHECK-NEXT:    movzbl %al, %esi
-; CHECK-NEXT:    testl $-256, %edi
  ; CHECK-NEXT:    je .LBB3_4
  ; CHECK-NEXT:  .LBB3_5:
  ; CHECK-NEXT:    xorl %edx, %edx
  ; CHECK-NEXT:    movl %ecx, %eax
  ; CHECK-NEXT:    divl %ebx
  ; CHECK-NEXT:    jmp .LBB3_6
+; CHECK-NEXT:  .LBB3_1:
+; CHECK-NEXT:    movzbl %cl, %eax
+; CHECK-NEXT:    # kill: %EAX<def> %EAX<kill> %AX<def>
+; CHECK-NEXT:    divb %bl
+; CHECK-NEXT:    movzbl %al, %esi
+; CHECK-NEXT:    testl $-256, %edi
+; CHECK-NEXT:    jne .LBB3_5
  ; CHECK-NEXT:  .LBB3_4:
  ; CHECK-NEXT:    movzbl %cl, %eax
  ; CHECK-NEXT:    # kill: %EAX<def> %EAX<kill> %AX<def>
diff --git a/test/CodeGen/X86/sse1.ll b/test/CodeGen/X86/sse1.ll

index 9488d6d26056c7d76cc10c37fc073ffd5461fe7e..dfc1aefd31a611951d64fdbd87d191a3c1576713 100644 (file)
--- a/test/CodeGen/X86/sse1.ll
+++ b/test/CodeGen/X86/sse1.ll
@@ -60,7 +60,13 @@ define <4 x float> @vselect(<4 x float>*%p, <4 x i32> %q) {
  ; X32-NEXT:    xorps %xmm1, %xmm1
  ; X32-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
  ; X32-NEXT:    jne .LBB1_5
-; X32-NEXT:    jmp .LBB1_4
+; X32-NEXT:  .LBB1_4:
+; X32-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X32-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    jne .LBB1_8
+; X32-NEXT:  .LBB1_7:
+; X32-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; X32-NEXT:    jmp .LBB1_9
  ; X32-NEXT:  .LBB1_1:
  ; X32-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
  ; X32-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
@@ -68,17 +74,9 @@ define <4 x float> @vselect(<4 x float>*%p, <4 x i32> %q) {
  ; X32-NEXT:  .LBB1_5: # %entry
  ; X32-NEXT:    xorps %xmm2, %xmm2
  ; X32-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
-; X32-NEXT:    jne .LBB1_8
-; X32-NEXT:    jmp .LBB1_7
-; X32-NEXT:  .LBB1_4:
-; X32-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X32-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
  ; X32-NEXT:    je .LBB1_7
  ; X32-NEXT:  .LBB1_8: # %entry
  ; X32-NEXT:    xorps %xmm3, %xmm3
-; X32-NEXT:    jmp .LBB1_9
-; X32-NEXT:  .LBB1_7:
-; X32-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
  ; X32-NEXT:  .LBB1_9: # %entry
  ; X32-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
  ; X32-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
@@ -99,7 +97,13 @@ define <4 x float> @vselect(<4 x float>*%p, <4 x i32> %q) {
  ; X64-NEXT:    xorps %xmm1, %xmm1
  ; X64-NEXT:    testl %edx, %edx
  ; X64-NEXT:    jne .LBB1_5
-; X64-NEXT:    jmp .LBB1_4
+; X64-NEXT:  .LBB1_4:
+; X64-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X64-NEXT:    testl %r8d, %r8d
+; X64-NEXT:    jne .LBB1_8
+; X64-NEXT:  .LBB1_7:
+; X64-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; X64-NEXT:    jmp .LBB1_9
  ; X64-NEXT:  .LBB1_1:
  ; X64-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
  ; X64-NEXT:    testl %edx, %edx
@@ -107,17 +111,9 @@ define <4 x float> @vselect(<4 x float>*%p, <4 x i32> %q) {
  ; X64-NEXT:  .LBB1_5: # %entry
  ; X64-NEXT:    xorps %xmm2, %xmm2
  ; X64-NEXT:    testl %r8d, %r8d
-; X64-NEXT:    jne .LBB1_8
-; X64-NEXT:    jmp .LBB1_7
-; X64-NEXT:  .LBB1_4:
-; X64-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X64-NEXT:    testl %r8d, %r8d
  ; X64-NEXT:    je .LBB1_7
  ; X64-NEXT:  .LBB1_8: # %entry
  ; X64-NEXT:    xorps %xmm3, %xmm3
-; X64-NEXT:    jmp .LBB1_9
-; X64-NEXT:  .LBB1_7:
-; X64-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
  ; X64-NEXT:  .LBB1_9: # %entry
  ; X64-NEXT:    testl %esi, %esi
  ; X64-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
@@ -215,7 +211,7 @@ define <4 x i32> @PR30512(<4 x i32> %x, <4 x i32> %y) nounwind {
    ret <4 x i32> %zext
  }
  
-; Fragile test warning - we need to induce the generation of a vselect 
+; Fragile test warning - we need to induce the generation of a vselect
  ; post-legalization to cause the crash seen in:
  ; https://llvm.org/bugs/show_bug.cgi?id=31672
  ; Is there a way to do that without an unsafe/fast sqrt intrinsic call?
diff --git a/test/CodeGen/X86/tail-dup-merge-loop-headers.ll b/test/CodeGen/X86/tail-dup-merge-loop-headers.ll

index 197fd72586a5f26ae39d9f2db4d4a5d505e41f56..80346018ff8074629c312a214abc57597190393c 100644 (file)
--- a/test/CodeGen/X86/tail-dup-merge-loop-headers.ll
+++ b/test/CodeGen/X86/tail-dup-merge-loop-headers.ll
@@ -6,13 +6,13 @@ target triple = "x86_64-unknown-linux-gnu"
  ; CHECK-LABEL: tail_dup_merge_loops
  ; CHECK: # %entry
  ; CHECK-NOT: # %{{[a-zA-Z_]+}}
+; CHECK: # %exit
+; CHECK-NOT: # %{{[a-zA-Z_]+}}
  ; CHECK: # %inner_loop_exit
  ; CHECK-NOT: # %{{[a-zA-Z_]+}}
  ; CHECK: # %inner_loop_latch
  ; CHECK-NOT: # %{{[a-zA-Z_]+}}
  ; CHECK: # %inner_loop_test
-; CHECK-NOT: # %{{[a-zA-Z_]+}}
-; CHECK: # %exit
  define void @tail_dup_merge_loops(i32 %a, i8* %b, i8* %c) local_unnamed_addr #0 {
  entry:
    %notlhs674.i = icmp eq i32 %a, 0
diff --git a/test/CodeGen/X86/tail-dup-repeat.ll b/test/CodeGen/X86/tail-dup-repeat.ll

index 21b48e16efb9eaf44c9159ab698f2e1c0d221d98..7d9c0908e571ccd18d69ff2f457e4536bef70bb9 100644 (file)
--- a/test/CodeGen/X86/tail-dup-repeat.ll
+++ b/test/CodeGen/X86/tail-dup-repeat.ll
@@ -1,4 +1,4 @@
-; RUN: llc -O2 -tail-dup-placement-threshold=4 -o - %s | FileCheck %s
+; RUN: llc -O3 -tail-dup-placement-threshold=4 -o - %s | FileCheck %s
  target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
  target triple = "x86_64-unknown-linux-gnu"
  
diff --git a/test/CodeGen/X86/tail-opts.ll b/test/CodeGen/X86/tail-opts.ll

index e7797426d891b23e1cb35c468415540abeae6cb2..96ff33ff5f7d79d19f0c670680314a9c37e4dbe4 100644 (file)
--- a/test/CodeGen/X86/tail-opts.ll
+++ b/test/CodeGen/X86/tail-opts.ll
@@ -113,16 +113,15 @@ altret:
  ; CHECK-NEXT:   jbe .LBB2_3
  ; CHECK-NEXT:   ucomiss %xmm{{[0-2]}}, %xmm{{[0-2]}}
  ; CHECK-NEXT:   ja .LBB2_4
-; CHECK-NEXT:   jmp .LBB2_2
+; CHECK-NEXT: .LBB2_2:
+; CHECK-NEXT:   movb $1, %al
+; CHECK-NEXT:   ret
  ; CHECK-NEXT: .LBB2_3:
  ; CHECK-NEXT:   ucomiss %xmm{{[0-2]}}, %xmm{{[0-2]}}
  ; CHECK-NEXT:   jbe .LBB2_2
  ; CHECK-NEXT: .LBB2_4:
  ; CHECK-NEXT:   xorl %eax, %eax
  ; CHECK-NEXT:   ret
-; CHECK-NEXT: .LBB2_2:
-; CHECK-NEXT:   movb $1, %al
-; CHECK-NEXT:   ret
  
  define i1 @dont_merge_oddly(float* %result) nounwind {
  entry:
diff --git a/test/CodeGen/X86/twoaddr-coalesce-3.ll b/test/CodeGen/X86/twoaddr-coalesce-3.ll

index 33c9d46f13c3ade6be91d48554d6eb1e521902e3..f5a7326c970c185938f407f03168a2c255fe9a96 100644 (file)
--- a/test/CodeGen/X86/twoaddr-coalesce-3.ll
+++ b/test/CodeGen/X86/twoaddr-coalesce-3.ll
@@ -19,7 +19,7 @@ for.body.lr.ph:                                   ; preds = %entry
  
  ; Check that only one mov will be generated in the kernel loop.
  ; CHECK-LABEL: foo:
-; CHECK: [[LOOP1:^[a-zA-Z0-9_.]+]]: {{#.*}} %for.body
+; CHECK: [[LOOP1:^[a-zA-Z0-9_.]+]]: {{#.*}} %for.body{{$}}
  ; CHECK-NOT: mov
  ; CHECK: movl {{.*}}, [[REG1:%[a-z0-9]+]]
  ; CHECK-NOT: mov
@@ -56,7 +56,7 @@ for.body.lr.ph:                                   ; preds = %entry
  
  ; Check that only two mov will be generated in the kernel loop.
  ; CHECK-LABEL: goo:
-; CHECK: [[LOOP2:^[a-zA-Z0-9_.]+]]: {{#.*}} %for.body
+; CHECK: [[LOOP2:^[a-zA-Z0-9_.]+]]: {{#.*}} %for.body{{$}}
  ; CHECK-NOT: mov
  ; CHECK: movl {{.*}}, [[REG2:%[a-z0-9]+]]
  ; CHECK-NOT: mov
diff --git a/test/CodeGen/X86/win-alloca-expander.ll b/test/CodeGen/X86/win-alloca-expander.ll

index 45ca3b214ab87a7d81439b8f302bd55c8aad3b9e..4b6e3bb18e60142a2fe51880d3cbe7d95cc41647 100644 (file)
--- a/test/CodeGen/X86/win-alloca-expander.ll
+++ b/test/CodeGen/X86/win-alloca-expander.ll
@@ -115,34 +115,36 @@ define void @cfg(i1 %x, i1 %y) {
  ; Test that the blocks are analyzed in the correct order.
  ; CHECK-LABEL: cfg:
  entry:
-  br i1 %x, label %bb1, label %bb2
+  br i1 %x, label %bb1, label %bb3
  
  bb1:
    %p1 = alloca %struct.S
  ; CHECK: pushl %eax
  ; CHECK: subl $1020, %esp
-  br label %bb3
+  br label %bb4
+
  bb2:
-  %p2 = alloca %struct.T
+  %p5 = alloca %struct.T
  ; CHECK: pushl %eax
  ; CHECK: subl $2996, %esp
-  br label %bb3
+  call void @g(%struct.T* %p5)
+  ret void
  
  bb3:
-  br i1 %y, label %bb4, label %bb5
+  %p2 = alloca %struct.T
+; CHECK: pushl %eax
+; CHECK: subl $2996, %esp
+  br label %bb4
  
  bb4:
+  br i1 %y, label %bb5, label %bb2
+
+bb5:
    %p4 = alloca %struct.S
  ; CHECK: subl $1024, %esp
    call void @f(%struct.S* %p4)
    ret void
  
-bb5:
-  %p5 = alloca %struct.T
-; CHECK: pushl %eax
-; CHECK: subl $2996, %esp
-  call void @g(%struct.T* %p5)
-  ret void
  }
author	Kyle Butt <kyle+llvm@iteratee.net>
	Wed, 15 Feb 2017 19:49:14 +0000 (19:49 +0000)
committer	Kyle Butt <kyle+llvm@iteratee.net>
	Wed, 15 Feb 2017 19:49:14 +0000 (19:49 +0000)
lib/CodeGen/MachineBlockPlacement.cpp		patch \| blob \| history
test/CodeGen/AArch64/branch-relax-cbz.ll		patch \| blob \| history
test/CodeGen/AArch64/combine-comparisons-by-cse.ll		patch \| blob \| history
test/CodeGen/AArch64/optimize-cond-branch.ll		patch \| blob \| history
test/CodeGen/AMDGPU/basic-branch.ll		patch \| blob \| history
test/CodeGen/AMDGPU/branch-relaxation.ll		patch \| blob \| history
test/CodeGen/AMDGPU/cf-loop-on-constant.ll		patch \| blob \| history
test/CodeGen/AMDGPU/convergent-inlineasm.ll		patch \| blob \| history
test/CodeGen/AMDGPU/salu-to-valu.ll		patch \| blob \| history
test/CodeGen/ARM/2007-05-22-tailmerge-3.ll		patch \| blob \| history
test/CodeGen/ARM/atomic-cmpxchg.ll		patch \| blob \| history
test/CodeGen/ARM/fold-stack-adjust.ll		patch \| blob \| history
test/CodeGen/PowerPC/tail-dup-break-cfg.ll		patch \| blob \| history
test/CodeGen/PowerPC/tail-dup-layout.ll		patch \| blob \| history
test/CodeGen/SPARC/sjlj.ll		patch \| blob \| history
test/CodeGen/WebAssembly/mem-intrinsics.ll		patch \| blob \| history
test/CodeGen/X86/block-placement.ll		patch \| blob \| history
test/CodeGen/X86/bypass-slow-division-32.ll		patch \| blob \| history
test/CodeGen/X86/sse1.ll		patch \| blob \| history
test/CodeGen/X86/tail-dup-merge-loop-headers.ll		patch \| blob \| history
test/CodeGen/X86/tail-dup-repeat.ll		patch \| blob \| history
test/CodeGen/X86/tail-opts.ll		patch \| blob \| history
test/CodeGen/X86/twoaddr-coalesce-3.ll		patch \| blob \| history
test/CodeGen/X86/win-alloca-expander.ll		patch \| blob \| history