[LoopFusion] Add ability to fuse guarded loops

author Kit Barton <kbarton@ca.ibm.com>

Thu, 26 Sep 2019 21:42:45 +0000 (21:42 +0000)

committer Kit Barton <kbarton@ca.ibm.com>

Thu, 26 Sep 2019 21:42:45 +0000 (21:42 +0000)
author Kit Barton <kbarton@ca.ibm.com>
Thu, 26 Sep 2019 21:42:45 +0000 (21:42 +0000)
committer Kit Barton <kbarton@ca.ibm.com>
Thu, 26 Sep 2019 21:42:45 +0000 (21:42 +0000)
diff --git a/lib/Transforms/Scalar/LoopFuse.cpp b/lib/Transforms/Scalar/LoopFuse.cpp

index a323d80dbe68356c9118b2b0a5389260cf6423dd..9f93c68e6128debb08d121b4090889b6f77baa40 100644 (file)
--- a/lib/Transforms/Scalar/LoopFuse.cpp
+++ b/lib/Transforms/Scalar/LoopFuse.cpp
@@ -85,6 +85,9 @@ STATISTIC(NonEqualTripCount, "Loop trip counts are not the same");
  STATISTIC(NonAdjacent, "Loops are not adjacent");
  STATISTIC(NonEmptyPreheader, "Loop has a non-empty preheader");
  STATISTIC(FusionNotBeneficial, "Fusion is not beneficial");
+STATISTIC(NonIdenticalGuards, "Candidates have different guards");
+STATISTIC(NonEmptyExitBlock, "Candidate has a non-empty exit block");
+STATISTIC(NonEmptyGuardBlock, "Candidate has a non-empty guard block");
  
  enum FusionDependenceAnalysisChoice {
    FUSION_DEPENDENCE_ANALYSIS_SCEV,
@@ -144,6 +147,8 @@ struct FusionCandidate {
    SmallVector<Instruction *, 16> MemWrites;
    /// Are all of the members of this fusion candidate still valid
    bool Valid;
+  /// Guard branch of the loop, if it exists
+  BranchInst *GuardBranch;
  
    /// Dominator and PostDominator trees are needed for the
    /// FusionCandidateCompare function, required by FusionCandidateSet to
@@ -158,8 +163,14 @@ struct FusionCandidate {
                    const PostDominatorTree *PDT, OptimizationRemarkEmitter &ORE)
        : Preheader(L->getLoopPreheader()), Header(L->getHeader()),
          ExitingBlock(L->getExitingBlock()), ExitBlock(L->getExitBlock()),
-        Latch(L->getLoopLatch()), L(L), Valid(true), DT(DT), PDT(PDT),
-        ORE(ORE) {
+        Latch(L->getLoopLatch()), L(L), Valid(true), GuardBranch(nullptr),
+        DT(DT), PDT(PDT), ORE(ORE) {
+
+    // TODO: This is temporary while we fuse both rotated and non-rotated
+    // loops. Once we switch to only fusing rotated loops, the initialization of
+    // GuardBranch can be moved into the initialization list above.
+    if (isRotated())
+      GuardBranch = L->getLoopGuardBranch();
  
      // Walk over all blocks in the loop and check for conditions that may
      // prevent fusion. For each block, walk over all instructions and collect
@@ -218,16 +229,55 @@ struct FusionCandidate {
      assert(Latch == L->getLoopLatch() && "Latch is out of sync");
    }
  
+  /// Get the entry block for this fusion candidate.
+  ///
+  /// If this fusion candidate represents a guarded loop, the entry block is the
+  /// loop guard block. If it represents an unguarded loop, the entry block is
+  /// the preheader of the loop.
+  BasicBlock *getEntryBlock() const {
+    if (GuardBranch)
+      return GuardBranch->getParent();
+    else
+      return Preheader;
+  }
+
+  /// Given a guarded loop, get the successor of the guard that is not in the
+  /// loop.
+  ///
+  /// This method returns the successor of the loop guard that is not located
+  /// within the loop (i.e., the successor of the guard that is not the
+  /// preheader).
+  /// This method is only valid for guarded loops.
+  BasicBlock *getNonLoopBlock() const {
+    assert(GuardBranch && "Only valid on guarded loops.");
+    assert(GuardBranch->isConditional() &&
+           "Expecting guard to be a conditional branch.");
+    return (GuardBranch->getSuccessor(0) == Preheader)
+               ? GuardBranch->getSuccessor(1)
+               : GuardBranch->getSuccessor(0);
+  }
+
+  bool isRotated() const {
+    assert(L && "Expecting loop to be valid.");
+    assert(Latch && "Expecting latch to be valid.");
+    return L->isLoopExiting(Latch);
+  }
+
  #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    LLVM_DUMP_METHOD void dump() const {
-    dbgs() << "\tPreheader: " << (Preheader ? Preheader->getName() : "nullptr")
+    dbgs() << "\tGuardBranch: "
+           << (GuardBranch ? GuardBranch->getName() : "nullptr") << "\n"
+           << "\tPreheader: " << (Preheader ? Preheader->getName() : "nullptr")
             << "\n"
             << "\tHeader: " << (Header ? Header->getName() : "nullptr") << "\n"
             << "\tExitingBB: "
             << (ExitingBlock ? ExitingBlock->getName() : "nullptr") << "\n"
             << "\tExitBB: " << (ExitBlock ? ExitBlock->getName() : "nullptr")
             << "\n"
-           << "\tLatch: " << (Latch ? Latch->getName() : "nullptr") << "\n";
+           << "\tLatch: " << (Latch ? Latch->getName() : "nullptr") << "\n"
+           << "\tEntryBlock: "
+           << (getEntryBlock() ? getEntryBlock()->getName() : "nullptr")
+           << "\n";
    }
  #endif
  
@@ -303,21 +353,24 @@ struct FusionCandidateCompare {
                    const FusionCandidate &RHS) const {
      const DominatorTree *DT = LHS.DT;
  
+    BasicBlock *LHSEntryBlock = LHS.getEntryBlock();
+    BasicBlock *RHSEntryBlock = RHS.getEntryBlock();
+
      // Do not save PDT to local variable as it is only used in asserts and thus
      // will trigger an unused variable warning if building without asserts.
      assert(DT && LHS.PDT && "Expecting valid dominator tree");
  
      // Do this compare first so if LHS == RHS, function returns false.
-    if (DT->dominates(RHS.Preheader, LHS.Preheader)) {
+    if (DT->dominates(RHSEntryBlock, LHSEntryBlock)) {
        // RHS dominates LHS
        // Verify LHS post-dominates RHS
-      assert(LHS.PDT->dominates(LHS.Preheader, RHS.Preheader));
+      assert(LHS.PDT->dominates(LHSEntryBlock, RHSEntryBlock));
        return false;
      }
  
-    if (DT->dominates(LHS.Preheader, RHS.Preheader)) {
+    if (DT->dominates(LHSEntryBlock, RHSEntryBlock)) {
        // Verify RHS Postdominates LHS
-      assert(LHS.PDT->dominates(RHS.Preheader, LHS.Preheader));
+      assert(LHS.PDT->dominates(RHSEntryBlock, LHSEntryBlock));
        return true;
      }
  
@@ -538,11 +591,14 @@ private:
                                 const FusionCandidate &FC1) const {
      assert(FC0.Preheader && FC1.Preheader && "Expecting valid preheaders");
  
-    if (DT.dominates(FC0.Preheader, FC1.Preheader))
-      return PDT.dominates(FC1.Preheader, FC0.Preheader);
+    BasicBlock *FC0EntryBlock = FC0.getEntryBlock();
+    BasicBlock *FC1EntryBlock = FC1.getEntryBlock();
+
+    if (DT.dominates(FC0EntryBlock, FC1EntryBlock))
+      return PDT.dominates(FC1EntryBlock, FC0EntryBlock);
  
-    if (DT.dominates(FC1.Preheader, FC0.Preheader))
-      return PDT.dominates(FC0.Preheader, FC1.Preheader);
+    if (DT.dominates(FC1EntryBlock, FC0EntryBlock))
+      return PDT.dominates(FC0EntryBlock, FC1EntryBlock);
  
      return false;
    }
@@ -677,11 +733,22 @@ private:
              continue;
            }
  
-          // For now we skip fusing if the second candidate has any instructions
-          // in the preheader. This is done because we currently do not have the
-          // safety checks to determine if it is save to move the preheader of
-          // the second candidate past the body of the first candidate. Once
-          // these checks are added, this condition can be removed.
+          // Ensure that FC0 and FC1 have identical guards.
+          // If one (or both) are not guarded, this check is not necessary.
+          if (FC0->GuardBranch && FC1->GuardBranch &&
+              !haveIdenticalGuards(*FC0, *FC1)) {
+            LLVM_DEBUG(dbgs() << "Fusion candidates do not have identical "
+                                 "guards. Not Fusing.\n");
+            reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
+                                                       NonIdenticalGuards);
+            continue;
+          }
+
+          // The following three checks look for empty blocks in FC0 and FC1. If
+          // any of these blocks are non-empty, we do not fuse. This is done
+          // because we currently do not have the safety checks to determine if
+          // it is safe to move the blocks past other blocks in the loop. Once
+          // these checks are added, these conditions can be relaxed.
            if (!isEmptyPreheader(*FC1)) {
              LLVM_DEBUG(dbgs() << "Fusion candidate does not have empty "
                                   "preheader. Not fusing.\n");
@@ -690,6 +757,24 @@ private:
              continue;
            }
  
+          if (FC0->GuardBranch && !isEmptyExitBlock(*FC0)) {
+            LLVM_DEBUG(dbgs() << "Fusion candidate does not have empty exit "
+                                 "block. Not fusing.\n");
+            reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
+                                                       NonEmptyExitBlock);
+            continue;
+          }
+
+          if (FC1->GuardBranch && !isEmptyGuardBlock(*FC1)) {
+            LLVM_DEBUG(dbgs() << "Fusion candidate does not have empty guard "
+                                 "block. Not fusing.\n");
+            reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
+                                                       NonEmptyGuardBlock);
+            continue;
+          }
+
+          // Check the dependencies across the loops and do not fuse if it would
+          // violate them.
            if (!dependencesAllowFusion(*FC0, *FC1)) {
              LLVM_DEBUG(dbgs() << "Memory dependencies do not allow fusion!\n");
              reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
@@ -895,7 +980,7 @@ private:
      LLVM_DEBUG(dbgs() << "Check if " << FC0 << " can be fused with " << FC1
                        << "\n");
      assert(FC0.L->getLoopDepth() == FC1.L->getLoopDepth());
-    assert(DT.dominates(FC0.Preheader, FC1.Preheader));
+    assert(DT.dominates(FC0.getEntryBlock(), FC1.getEntryBlock()));
  
      for (Instruction *WriteL0 : FC0.MemWrites) {
        for (Instruction *WriteL1 : FC1.MemWrites)
@@ -945,18 +1030,89 @@ private:
      return true;
    }
  
-  /// Determine if the exit block of \p FC0 is the preheader of \p FC1. In this
-  /// case, there is no code in between the two fusion candidates, thus making
-  /// them adjacent.
+  /// Determine if two fusion candidates are adjacent in the CFG.
+  ///
+  /// This method will determine if there are additional basic blocks in the CFG
+  /// between the exit of \p FC0 and the entry of \p FC1.
+  /// If the two candidates are guarded loops, then it checks whether the
+  /// non-loop successor of the \p FC0 guard branch is the entry block of \p
+  /// FC1. If not, then the loops are not adjacent. If the two candidates are
+  /// not guarded loops, then it checks whether the exit block of \p FC0 is the
+  /// preheader of \p FC1.
    bool isAdjacent(const FusionCandidate &FC0,
                    const FusionCandidate &FC1) const {
-    return FC0.ExitBlock == FC1.Preheader;
+    // If the successor of the guard branch is FC1, then the loops are adjacent
+    if (FC0.GuardBranch)
+      return FC0.getNonLoopBlock() == FC1.getEntryBlock();
+    else
+      return FC0.ExitBlock == FC1.getEntryBlock();
+  }
+
+  /// Determine if two fusion candidates have identical guards
+  ///
+  /// This method will determine if two fusion candidates have the same guards.
+  /// The guards are considered the same if:
+  ///   1. The instructions to compute the condition used in the compare are
+  ///      identical.
+  ///   2. The successors of the guard have the same flow into/around the loop.
+  /// If the compare instructions are identical, then the first successor of the
+  /// guard must go to the same place (either the preheader of the loop or the
+  /// NonLoopBlock). In other words, the the first successor of both loops must
+  /// both go into the loop (i.e., the preheader) or go around the loop (i.e.,
+  /// the NonLoopBlock). The same must be true for the second successor.
+  bool haveIdenticalGuards(const FusionCandidate &FC0,
+                           const FusionCandidate &FC1) const {
+    assert(FC0.GuardBranch && FC1.GuardBranch &&
+           "Expecting FC0 and FC1 to be guarded loops.");
+
+    if (auto FC0CmpInst =
+            dyn_cast<Instruction>(FC0.GuardBranch->getCondition()))
+      if (auto FC1CmpInst =
+              dyn_cast<Instruction>(FC1.GuardBranch->getCondition()))
+        if (!FC0CmpInst->isIdenticalTo(FC1CmpInst))
+          return false;
+
+    // The compare instructions are identical.
+    // Now make sure the successor of the guards have the same flow into/around
+    // the loop
+    if (FC0.GuardBranch->getSuccessor(0) == FC0.Preheader)
+      return (FC1.GuardBranch->getSuccessor(0) == FC1.Preheader);
+    else
+      return (FC1.GuardBranch->getSuccessor(1) == FC1.Preheader);
+  }
+
+  /// Check that the guard for \p FC *only* contains the cmp/branch for the
+  /// guard.
+  /// Once we are able to handle intervening code, any code in the guard block
+  /// for FC1 will need to be treated as intervening code and checked whether
+  /// it can safely move around the loops.
+  bool isEmptyGuardBlock(const FusionCandidate &FC) const {
+    assert(FC.GuardBranch && "Expecting a fusion candidate with guard branch.");
+    if (auto *CmpInst = dyn_cast<Instruction>(FC.GuardBranch->getCondition())) {
+      auto *GuardBlock = FC.GuardBranch->getParent();
+      // If the generation of the cmp value is in GuardBlock, then the size of
+      // the guard block should be 2 (cmp + branch). If the generation of the
+      // cmp value is in a different block, then the size of the guard block
+      // should only be 1.
+      if (CmpInst->getParent() == GuardBlock)
+        return GuardBlock->size() == 2;
+      else
+        return GuardBlock->size() == 1;
+    }
+
+    return false;
    }
  
    bool isEmptyPreheader(const FusionCandidate &FC) const {
+    assert(FC.Preheader && "Expecting a valid preheader");
      return FC.Preheader->size() == 1;
    }
  
+  bool isEmptyExitBlock(const FusionCandidate &FC) const {
+    assert(FC.ExitBlock && "Expecting a valid exit block");
+    return FC.ExitBlock->size() == 1;
+  }
+
    /// Fuse two fusion candidates, creating a new fused loop.
    ///
    /// This method contains the mechanics of fusing two loops, represented by \p
@@ -993,6 +1149,12 @@ private:
      LLVM_DEBUG(dbgs() << "Fusion Candidate 0: \n"; FC0.dump();
                 dbgs() << "Fusion Candidate 1: \n"; FC1.dump(););
  
+    // Fusing guarded loops is handled slightly differently than non-guarded
+    // loops and has been broken out into a separate method instead of trying to
+    // intersperse the logic within a single method.
+    if (FC0.GuardBranch)
+      return fuseGuardedLoops(FC0, FC1);
+
      assert(FC1.Preheader == FC0.ExitBlock);
      assert(FC1.Preheader->size() == 1 &&
             FC1.Preheader->getSingleSuccessor() == FC1.Header);
@@ -1137,8 +1299,6 @@ private:
      SE.verify();
  #endif
  
-    FuseCounter++;
-
      LLVM_DEBUG(dbgs() << "Fusion done:\n");
  
      return FC0.L;
@@ -1170,6 +1330,232 @@ private:
               << " and " << NV("Cand2", StringRef(FC1.Preheader->getName()))
               << ": " << Stat.getDesc());
    }
+
+  /// Fuse two guarded fusion candidates, creating a new fused loop.
+  ///
+  /// Fusing guarded loops is handled much the same way as fusing non-guarded
+  /// loops. The rewiring of the CFG is slightly different though, because of
+  /// the presence of the guards around the loops and the exit blocks after the
+  /// loop body. As such, the new loop is rewired as follows:
+  ///    1. Keep the guard branch from FC0 and use the non-loop block target
+  /// from the FC1 guard branch.
+  ///    2. Remove the exit block from FC0 (this exit block should be empty
+  /// right now).
+  ///    3. Remove the guard branch for FC1
+  ///    4. Remove the preheader for FC1.
+  /// The exit block successor for the latch of FC0 is updated to be the header
+  /// of FC1 and the non-exit block successor of the latch of FC1 is updated to
+  /// be the header of FC0, thus creating the fused loop.
+  Loop *fuseGuardedLoops(const FusionCandidate &FC0,
+                         const FusionCandidate &FC1) {
+    assert(FC0.GuardBranch && FC1.GuardBranch && "Expecting guarded loops");
+
+    BasicBlock *FC0GuardBlock = FC0.GuardBranch->getParent();
+    BasicBlock *FC1GuardBlock = FC1.GuardBranch->getParent();
+    BasicBlock *FC0NonLoopBlock = FC0.getNonLoopBlock();
+    BasicBlock *FC1NonLoopBlock = FC1.getNonLoopBlock();
+
+    assert(FC0NonLoopBlock == FC1GuardBlock && "Loops are not adjacent");
+
+    SmallVector<DominatorTree::UpdateType, 8> TreeUpdates;
+
+    ////////////////////////////////////////////////////////////////////////////
+    // Update the Loop Guard
+    ////////////////////////////////////////////////////////////////////////////
+    // The guard for FC0 is updated to guard both FC0 and FC1. This is done by
+    // changing the NonLoopGuardBlock for FC0 to the NonLoopGuardBlock for FC1.
+    // Thus, one path from the guard goes to the preheader for FC0 (and thus
+    // executes the new fused loop) and the other path goes to the NonLoopBlock
+    // for FC1 (where FC1 guard would have gone if FC1 was not executed).
+    FC0.GuardBranch->replaceUsesOfWith(FC0NonLoopBlock, FC1NonLoopBlock);
+    FC0.ExitBlock->getTerminator()->replaceUsesOfWith(FC1GuardBlock,
+                                                      FC1.Header);
+
+    // The guard of FC1 is not necessary anymore.
+    FC1.GuardBranch->eraseFromParent();
+    new UnreachableInst(FC1GuardBlock->getContext(), FC1GuardBlock);
+
+    TreeUpdates.emplace_back(DominatorTree::UpdateType(
+        DominatorTree::Delete, FC1GuardBlock, FC1.Preheader));
+    TreeUpdates.emplace_back(DominatorTree::UpdateType(
+        DominatorTree::Delete, FC1GuardBlock, FC1NonLoopBlock));
+    TreeUpdates.emplace_back(DominatorTree::UpdateType(
+        DominatorTree::Delete, FC0GuardBlock, FC1GuardBlock));
+    TreeUpdates.emplace_back(DominatorTree::UpdateType(
+        DominatorTree::Insert, FC0GuardBlock, FC1NonLoopBlock));
+
+    assert(pred_begin(FC1GuardBlock) == pred_end(FC1GuardBlock) &&
+           "Expecting guard block to have no predecessors");
+    assert(succ_begin(FC1GuardBlock) == succ_end(FC1GuardBlock) &&
+           "Expecting guard block to have no successors");
+
+    // Remember the phi nodes originally in the header of FC0 in order to rewire
+    // them later. However, this is only necessary if the new loop carried
+    // values might not dominate the exiting branch. While we do not generally
+    // test if this is the case but simply insert intermediate phi nodes, we
+    // need to make sure these intermediate phi nodes have different
+    // predecessors. To this end, we filter the special case where the exiting
+    // block is the latch block of the first loop. Nothing needs to be done
+    // anyway as all loop carried values dominate the latch and thereby also the
+    // exiting branch.
+    // KB: This is no longer necessary because FC0.ExitingBlock == FC0.Latch
+    // (because the loops are rotated. Thus, nothing will ever be added to
+    // OriginalFC0PHIs.
+    SmallVector<PHINode *, 8> OriginalFC0PHIs;
+    if (FC0.ExitingBlock != FC0.Latch)
+      for (PHINode &PHI : FC0.Header->phis())
+        OriginalFC0PHIs.push_back(&PHI);
+
+    assert(OriginalFC0PHIs.empty() && "Expecting OriginalFC0PHIs to be empty!");
+
+    // Replace incoming blocks for header PHIs first.
+    FC1.Preheader->replaceSuccessorsPhiUsesWith(FC0.Preheader);
+    FC0.Latch->replaceSuccessorsPhiUsesWith(FC1.Latch);
+
+    // The old exiting block of the first loop (FC0) has to jump to the header
+    // of the second as we need to execute the code in the second header block
+    // regardless of the trip count. That is, if the trip count is 0, so the
+    // back edge is never taken, we still have to execute both loop headers,
+    // especially (but not only!) if the second is a do-while style loop.
+    // However, doing so might invalidate the phi nodes of the first loop as
+    // the new values do only need to dominate their latch and not the exiting
+    // predicate. To remedy this potential problem we always introduce phi
+    // nodes in the header of the second loop later that select the loop carried
+    // value, if the second header was reached through an old latch of the
+    // first, or undef otherwise. This is sound as exiting the first implies the
+    // second will exit too, __without__ taking the back-edge (their
+    // trip-counts are equal after all).
+    FC0.ExitingBlock->getTerminator()->replaceUsesOfWith(FC0.ExitBlock,
+                                                         FC1.Header);
+
+    TreeUpdates.emplace_back(DominatorTree::UpdateType(
+        DominatorTree::Delete, FC0.ExitingBlock, FC0.ExitBlock));
+    TreeUpdates.emplace_back(DominatorTree::UpdateType(
+        DominatorTree::Insert, FC0.ExitingBlock, FC1.Header));
+
+    // Remove FC0 Exit Block
+    // The exit block for FC0 is no longer needed since control will flow
+    // directly to the header of FC1. Since it is an empty block, it can be
+    // removed at this point.
+    // TODO: In the future, we can handle non-empty exit blocks my merging any
+    // instructions from FC0 exit block into FC1 exit block prior to removing
+    // the block.
+    assert(pred_begin(FC0.ExitBlock) == pred_end(FC0.ExitBlock) &&
+           "Expecting exit block to be empty");
+    FC0.ExitBlock->getTerminator()->eraseFromParent();
+    new UnreachableInst(FC0.ExitBlock->getContext(), FC0.ExitBlock);
+
+    // Remove FC1 Preheader
+    // The pre-header of L1 is not necessary anymore.
+    assert(pred_begin(FC1.Preheader) == pred_end(FC1.Preheader));
+    FC1.Preheader->getTerminator()->eraseFromParent();
+    new UnreachableInst(FC1.Preheader->getContext(), FC1.Preheader);
+    TreeUpdates.emplace_back(DominatorTree::UpdateType(
+        DominatorTree::Delete, FC1.Preheader, FC1.Header));
+
+    // Moves the phi nodes from the second to the first loops header block.
+    while (PHINode *PHI = dyn_cast<PHINode>(&FC1.Header->front())) {
+      if (SE.isSCEVable(PHI->getType()))
+        SE.forgetValue(PHI);
+      if (PHI->hasNUsesOrMore(1))
+        PHI->moveBefore(&*FC0.Header->getFirstInsertionPt());
+      else
+        PHI->eraseFromParent();
+    }
+
+    // Introduce new phi nodes in the second loop header to ensure
+    // exiting the first and jumping to the header of the second does not break
+    // the SSA property of the phis originally in the first loop. See also the
+    // comment above.
+    Instruction *L1HeaderIP = &FC1.Header->front();
+    for (PHINode *LCPHI : OriginalFC0PHIs) {
+      int L1LatchBBIdx = LCPHI->getBasicBlockIndex(FC1.Latch);
+      assert(L1LatchBBIdx >= 0 &&
+             "Expected loop carried value to be rewired at this point!");
+
+      Value *LCV = LCPHI->getIncomingValue(L1LatchBBIdx);
+
+      PHINode *L1HeaderPHI = PHINode::Create(
+          LCV->getType(), 2, LCPHI->getName() + ".afterFC0", L1HeaderIP);
+      L1HeaderPHI->addIncoming(LCV, FC0.Latch);
+      L1HeaderPHI->addIncoming(UndefValue::get(LCV->getType()),
+                               FC0.ExitingBlock);
+
+      LCPHI->setIncomingValue(L1LatchBBIdx, L1HeaderPHI);
+    }
+
+    // Update the latches
+
+    // Replace latch terminator destinations.
+    FC0.Latch->getTerminator()->replaceUsesOfWith(FC0.Header, FC1.Header);
+    FC1.Latch->getTerminator()->replaceUsesOfWith(FC1.Header, FC0.Header);
+
+    // If FC0.Latch and FC0.ExitingBlock are the same then we have already
+    // performed the updates above.
+    if (FC0.Latch != FC0.ExitingBlock)
+      TreeUpdates.emplace_back(DominatorTree::UpdateType(
+          DominatorTree::Insert, FC0.Latch, FC1.Header));
+
+    TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete,
+                                                       FC0.Latch, FC0.Header));
+    TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Insert,
+                                                       FC1.Latch, FC0.Header));
+    TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete,
+                                                       FC1.Latch, FC1.Header));
+
+    // All done
+    // Apply the updates to the Dominator Tree and cleanup.
+
+    assert(succ_begin(FC1GuardBlock) == succ_end(FC1GuardBlock) &&
+           "FC1GuardBlock has successors!!");
+    assert(pred_begin(FC1GuardBlock) == pred_end(FC1GuardBlock) &&
+           "FC1GuardBlock has predecessors!!");
+
+    // Update DT/PDT
+    DTU.applyUpdates(TreeUpdates);
+
+    LI.removeBlock(FC1.Preheader);
+    DTU.deleteBB(FC1.Preheader);
+    DTU.deleteBB(FC0.ExitBlock);
+    DTU.flush();
+
+    // Is there a way to keep SE up-to-date so we don't need to forget the loops
+    // and rebuild the information in subsequent passes of fusion?
+    SE.forgetLoop(FC1.L);
+    SE.forgetLoop(FC0.L);
+
+    // Merge the loops.
+    SmallVector<BasicBlock *, 8> Blocks(FC1.L->block_begin(),
+                                        FC1.L->block_end());
+    for (BasicBlock *BB : Blocks) {
+      FC0.L->addBlockEntry(BB);
+      FC1.L->removeBlockFromLoop(BB);
+      if (LI.getLoopFor(BB) != FC1.L)
+        continue;
+      LI.changeLoopFor(BB, FC0.L);
+    }
+    while (!FC1.L->empty()) {
+      const auto &ChildLoopIt = FC1.L->begin();
+      Loop *ChildLoop = *ChildLoopIt;
+      FC1.L->removeChildLoop(ChildLoopIt);
+      FC0.L->addChildLoop(ChildLoop);
+    }
+
+    // Delete the now empty loop L1.
+    LI.erase(FC1.L);
+
+#ifndef NDEBUG
+    assert(!verifyFunction(*FC0.Header->getParent(), &errs()));
+    assert(DT.verify(DominatorTree::VerificationLevel::Fast));
+    assert(PDT.verify());
+    LI.verify(DT);
+    SE.verify();
+#endif
+
+    LLVM_DEBUG(dbgs() << "Fusion done:\n");
+
+    return FC0.L;
+  }
  };
  
  struct LoopFuseLegacy : public FunctionPass {
diff --git a/test/Transforms/LoopFusion/guarded.ll b/test/Transforms/LoopFusion/guarded.ll

new file mode 100644 (file)

index 0000000..1a13117
--- /dev/null
+++ b/test/Transforms/LoopFusion/guarded.ll
@@ -0,0 +1,67 @@
+; RUN: opt -S -loop-fusion < %s | FileCheck %s
+
+@B = common global [1024 x i32] zeroinitializer, align 16
+
+; CHECK: void @dep_free_parametric
+; CHECK-next: entry:
+; CHECK: br i1 %{{.*}}, label %[[LOOP1PREHEADER:bb[0-9]*]], label %[[LOOP1SUCC:bb[0-9]+]]
+; CHECK: [[LOOP1PREHEADER]]
+; CHECK-NEXT: br label %[[LOOP1BODY:bb[0-9]*]]
+; CHECK: [[LOOP1BODY]]
+; CHECK: br i1 %{{.*}}, label %[[LOOP2BODY:bb[0-9]*]], label %[[LOOP2BODY]]
+; CHECK: [[LOOP2BODY]]
+; CHECK: br i1 %{{.*}}, label %[[LOOP1BODY]], label %[[LOOP2EXIT:bb[0-9]+]]
+; CHECK: [[LOOP2EXIT]]
+; CHECK: br label %[[LOOP1SUCC]]
+; CHECK: [[LOOP1SUCC]]
+; CHECK: ret void
+define void @dep_free_parametric(i32* noalias %A, i64 %N) {
+entry:
+  %cmp4 = icmp slt i64 0, %N
+  br i1 %cmp4, label %bb3, label %bb14
+
+bb3:                               ; preds = %entry
+  br label %bb5
+
+bb5:                                         ; preds = %bb3, %bb5
+  %i.05 = phi i64 [ %inc, %bb5 ], [ 0, %bb3 ]
+  %sub = sub nsw i64 %i.05, 3
+  %add = add nsw i64 %i.05, 3
+  %mul = mul nsw i64 %sub, %add
+  %rem = srem i64 %mul, %i.05
+  %conv = trunc i64 %rem to i32
+  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %i.05
+  store i32 %conv, i32* %arrayidx, align 4
+  %inc = add nsw i64 %i.05, 1
+  %cmp = icmp slt i64 %inc, %N
+  br i1 %cmp, label %bb5, label %bb10
+
+bb10:                                 ; preds = %bb5
+  br label %bb14
+
+bb14:                                          ; preds = %bb10, %entry
+  %cmp31 = icmp slt i64 0, %N
+  br i1 %cmp31, label %bb8, label %bb12
+
+bb8:                              ; preds = %bb14
+  br label %bb9
+
+bb9:                                        ; preds = %bb8, %bb9
+  %i1.02 = phi i64 [ %inc14, %bb9 ], [ 0, %bb8 ]
+  %sub7 = sub nsw i64 %i1.02, 3
+  %add8 = add nsw i64 %i1.02, 3
+  %mul9 = mul nsw i64 %sub7, %add8
+  %rem10 = srem i64 %mul9, %i1.02
+  %conv11 = trunc i64 %rem10 to i32
+  %arrayidx12 = getelementptr inbounds [1024 x i32], [1024 x i32]* @B, i64 0, i64 %i1.02
+  store i32 %conv11, i32* %arrayidx12, align 4
+  %inc14 = add nsw i64 %i1.02, 1
+  %cmp3 = icmp slt i64 %inc14, %N
+  br i1 %cmp3, label %bb9, label %bb15
+
+bb15:                               ; preds = %bb9
+  br label %bb12
+
+bb12:                                        ; preds = %bb15, %bb14
+  ret void
+}
author	Kit Barton <kbarton@ca.ibm.com>
	Thu, 26 Sep 2019 21:42:45 +0000 (21:42 +0000)
committer	Kit Barton <kbarton@ca.ibm.com>
	Thu, 26 Sep 2019 21:42:45 +0000 (21:42 +0000)
lib/Transforms/Scalar/LoopFuse.cpp		patch \| blob \| history
test/Transforms/LoopFusion/guarded.ll	[new file with mode: 0644]	patch \| blob