[LICM] Allow sinking when foldable in loop

author Jun Bum Lim <junbuml@codeaurora.org>

Fri, 15 Dec 2017 16:09:54 +0000 (16:09 +0000)

committer Jun Bum Lim <junbuml@codeaurora.org>

Fri, 15 Dec 2017 16:09:54 +0000 (16:09 +0000)
author Jun Bum Lim <junbuml@codeaurora.org>
Fri, 15 Dec 2017 16:09:54 +0000 (16:09 +0000)
committer Jun Bum Lim <junbuml@codeaurora.org>
Fri, 15 Dec 2017 16:09:54 +0000 (16:09 +0000)
diff --git a/include/llvm/Transforms/Utils/LoopUtils.h b/include/llvm/Transforms/Utils/LoopUtils.h

index 90299c5e6a2fe5d5cf8a569a16b2069aea3e9362..75066613650702669abfc8904c9ea04ea94e2be8 100644 (file)
--- a/include/llvm/Transforms/Utils/LoopUtils.h
+++ b/include/llvm/Transforms/Utils/LoopUtils.h
@@ -436,8 +436,9 @@ bool formLCSSARecursively(Loop &L, DominatorTree &DT, LoopInfo *LI,
  /// instructions of the loop and loop safety information as
  /// arguments. Diagnostics is emitted via \p ORE. It returns changed status.
  bool sinkRegion(DomTreeNode *, AliasAnalysis *, LoopInfo *, DominatorTree *,
-                TargetLibraryInfo *, Loop *, AliasSetTracker *,
-                LoopSafetyInfo *, OptimizationRemarkEmitter *ORE);
+                TargetLibraryInfo *, TargetTransformInfo *, Loop *,
+                AliasSetTracker *, LoopSafetyInfo *,
+                OptimizationRemarkEmitter *ORE);
  
  /// \brief Walk the specified region of the CFG (defined by all blocks
  /// dominated by the specified block, and that are in the current loop) in depth
diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp

index f610aae2403badcc58f658f7a033e75eae2d660a..ccb0af2d85adcf58113a2ed00a7c96885636ed61 100644 (file)
--- a/lib/Transforms/Scalar/LICM.cpp
+++ b/lib/Transforms/Scalar/LICM.cpp
@@ -90,14 +90,15 @@ static cl::opt<uint32_t> MaxNumUsesTraversed(
               "invariance in loop using invariant start (default = 8)"));
  
  static bool inSubLoop(BasicBlock *BB, Loop *CurLoop, LoopInfo *LI);
-static bool isNotUsedInLoop(const Instruction &I, const Loop *CurLoop,
-                            const LoopSafetyInfo *SafetyInfo);
+static bool isNotUsedOrFreeInLoop(const Instruction &I, const Loop *CurLoop,
+                                  const LoopSafetyInfo *SafetyInfo,
+                                  TargetTransformInfo *TTI, bool &FreeInLoop);
  static bool hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop,
                    const LoopSafetyInfo *SafetyInfo,
                    OptimizationRemarkEmitter *ORE);
  static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT,
                   const Loop *CurLoop, const LoopSafetyInfo *SafetyInfo,
-                 OptimizationRemarkEmitter *ORE);
+                 OptimizationRemarkEmitter *ORE, bool FreeInLoop);
  static bool isSafeToExecuteUnconditionally(Instruction &Inst,
                                             const DominatorTree *DT,
                                             const Loop *CurLoop,
@@ -115,7 +116,8 @@ CloneInstructionInExitBlock(Instruction &I, BasicBlock &ExitBlock, PHINode &PN,
  namespace {
  struct LoopInvariantCodeMotion {
    bool runOnLoop(Loop *L, AliasAnalysis *AA, LoopInfo *LI, DominatorTree *DT,
-                 TargetLibraryInfo *TLI, ScalarEvolution *SE, MemorySSA *MSSA,
+                 TargetLibraryInfo *TLI, TargetTransformInfo *TTI,
+                 ScalarEvolution *SE, MemorySSA *MSSA,
                   OptimizationRemarkEmitter *ORE, bool DeleteAST);
  
    DenseMap<Loop *, AliasSetTracker *> &getLoopToAliasSetMap() {
@@ -159,6 +161,8 @@ struct LegacyLICMPass : public LoopPass {
                            &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(),
                            &getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
                            &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(),
+                          &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
+                              *L->getHeader()->getParent()),
                            SE ? &SE->getSE() : nullptr, MSSA, &ORE, false);
    }
  
@@ -170,6 +174,7 @@ struct LegacyLICMPass : public LoopPass {
      AU.addRequired<TargetLibraryInfoWrapperPass>();
      if (EnableMSSALoopDependency)
        AU.addRequired<MemorySSAWrapperPass>();
+    AU.addRequired<TargetTransformInfoWrapperPass>();
      getLoopAnalysisUsage(AU);
    }
  
@@ -210,8 +215,8 @@ PreservedAnalyses LICMPass::run(Loop &L, LoopAnalysisManager &AM,
                         "cached at a higher level");
  
    LoopInvariantCodeMotion LICM;
-  if (!LICM.runOnLoop(&L, &AR.AA, &AR.LI, &AR.DT, &AR.TLI, &AR.SE, AR.MSSA, ORE,
-                      true))
+  if (!LICM.runOnLoop(&L, &AR.AA, &AR.LI, &AR.DT, &AR.TLI, &AR.TTI, &AR.SE,
+                      AR.MSSA, ORE, true))
      return PreservedAnalyses::all();
  
    auto PA = getLoopPassPreservedAnalyses();
@@ -224,6 +229,7 @@ INITIALIZE_PASS_BEGIN(LegacyLICMPass, "licm", "Loop Invariant Code Motion",
                        false, false)
  INITIALIZE_PASS_DEPENDENCY(LoopPass)
  INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
  INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
  INITIALIZE_PASS_END(LegacyLICMPass, "licm", "Loop Invariant Code Motion", false,
                      false)
@@ -236,12 +242,10 @@ Pass *llvm::createLICMPass() { return new LegacyLICMPass(); }
  /// We should delete AST for inner loops in the new pass manager to avoid
  /// memory leak.
  ///
-bool LoopInvariantCodeMotion::runOnLoop(Loop *L, AliasAnalysis *AA,
-                                        LoopInfo *LI, DominatorTree *DT,
-                                        TargetLibraryInfo *TLI,
-                                        ScalarEvolution *SE, MemorySSA *MSSA,
-                                        OptimizationRemarkEmitter *ORE,
-                                        bool DeleteAST) {
+bool LoopInvariantCodeMotion::runOnLoop(
+    Loop *L, AliasAnalysis *AA, LoopInfo *LI, DominatorTree *DT,
+    TargetLibraryInfo *TLI, TargetTransformInfo *TTI, ScalarEvolution *SE,
+    MemorySSA *MSSA, OptimizationRemarkEmitter *ORE, bool DeleteAST) {
    bool Changed = false;
  
    assert(L->isLCSSAForm(*DT) && "Loop is not in LCSSA form.");
@@ -266,7 +270,7 @@ bool LoopInvariantCodeMotion::runOnLoop(Loop *L, AliasAnalysis *AA,
    // instructions, we perform another pass to hoist them out of the loop.
    //
    if (L->hasDedicatedExits())
-    Changed |= sinkRegion(DT->getNode(L->getHeader()), AA, LI, DT, TLI, L,
+    Changed |= sinkRegion(DT->getNode(L->getHeader()), AA, LI, DT, TLI, TTI, L,
                            CurAST, &SafetyInfo, ORE);
    if (Preheader)
      Changed |= hoistRegion(DT->getNode(L->getHeader()), AA, LI, DT, TLI, L,
@@ -359,7 +363,8 @@ bool LoopInvariantCodeMotion::runOnLoop(Loop *L, AliasAnalysis *AA,
  /// definitions, allowing us to sink a loop body in one pass without iteration.
  ///
  bool llvm::sinkRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
-                      DominatorTree *DT, TargetLibraryInfo *TLI, Loop *CurLoop,
+                      DominatorTree *DT, TargetLibraryInfo *TLI,
+                      TargetTransformInfo *TTI, Loop *CurLoop,
                        AliasSetTracker *CurAST, LoopSafetyInfo *SafetyInfo,
                        OptimizationRemarkEmitter *ORE) {
  
@@ -400,12 +405,15 @@ bool llvm::sinkRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
        // outside of the loop.  In this case, it doesn't even matter if the
        // operands of the instruction are loop invariant.
        //
-      if (isNotUsedInLoop(I, CurLoop, SafetyInfo) &&
+      bool FreeInLoop = false;
+      if (isNotUsedOrFreeInLoop(I, CurLoop, SafetyInfo, TTI, FreeInLoop) &&
            canSinkOrHoistInst(I, AA, DT, CurLoop, CurAST, SafetyInfo, ORE)) {
-        if (sink(I, LI, DT, CurLoop, SafetyInfo, ORE)) {
-          ++II;
-          CurAST->deleteValue(&I);
-          I.eraseFromParent();
+        if (sink(I, LI, DT, CurLoop, SafetyInfo, ORE, FreeInLoop)) {
+          if (!FreeInLoop) {
+            ++II;
+            CurAST->deleteValue(&I);
+            I.eraseFromParent();
+          }
            Changed = true;
          }
        }
@@ -708,13 +716,39 @@ static bool isTriviallyReplacablePHI(const PHINode &PN, const Instruction &I) {
    return true;
  }
  
+/// Return true if the instruction is free in the loop.
+static bool isFreeInLoop(const Instruction &I, const Loop *CurLoop,
+                         const TargetTransformInfo *TTI) {
+  if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I)) {
+    if (TTI->getUserCost(&I) != TargetTransformInfo::TCC_Free)
+      return false;
+    // For a GEP, we cannot simply use getUserCost because currently it
+    // optimistically assume that a GEP will fold into addressing mode
+    // regardless of its users.
+    const BasicBlock *BB = I.getParent();
+    for (const User *U : I.users()) {
+      const Instruction *UI = cast<Instruction>(U);
+      if (CurLoop->contains(UI) &&
+          (BB != UI->getParent() ||
+           (!isa<StoreInst>(UI) && !isa<LoadInst>(UI))))
+        return false;
+    }
+    return true;
+  } else
+    return TTI->getUserCost(&I) == TargetTransformInfo::TCC_Free;
+}
+
  /// Return true if the only users of this instruction are outside of
  /// the loop. If this is true, we can sink the instruction to the exit
  /// blocks of the loop.
  ///
-static bool isNotUsedInLoop(const Instruction &I, const Loop *CurLoop,
-                            const LoopSafetyInfo *SafetyInfo) {
+/// We also return true if the instruction could be folded away in lowering.
+/// (e.g.,  a GEP can be folded into a load as an addressing mode in the loop).
+static bool isNotUsedOrFreeInLoop(const Instruction &I, const Loop *CurLoop,
+                                  const LoopSafetyInfo *SafetyInfo,
+                                  TargetTransformInfo *TTI, bool &FreeInLoop) {
    const auto &BlockColors = SafetyInfo->BlockColors;
+  bool IsFree = isFreeInLoop(I, CurLoop, TTI);
    for (const User *U : I.users()) {
      const Instruction *UI = cast<Instruction>(U);
      if (const PHINode *PN = dyn_cast<PHINode>(UI)) {
@@ -731,8 +765,13 @@ static bool isNotUsedInLoop(const Instruction &I, const Loop *CurLoop,
            return false;
      }
  
-    if (CurLoop->contains(UI))
+    if (CurLoop->contains(UI)) {
+      if (IsFree) {
+        FreeInLoop = true;
+        continue;
+      }
        return false;
+    }
    }
    return true;
  }
@@ -888,7 +927,7 @@ static void splitPredecessorsOfLoopExit(PHINode *PN, DominatorTree *DT,
  ///
  static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT,
                   const Loop *CurLoop, const LoopSafetyInfo *SafetyInfo,
-                 OptimizationRemarkEmitter *ORE) {
+                 OptimizationRemarkEmitter *ORE, bool FreeInLoop) {
    DEBUG(dbgs() << "LICM sinking instruction: " << I << "\n");
    ORE->emit([&]() {
      return OptimizationRemark(DEBUG_TYPE, "InstSunk", &I)
@@ -900,7 +939,6 @@ static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT,
    else if (isa<CallInst>(I))
      ++NumMovedCalls;
    ++NumSunk;
-  Changed = true;
  
    // Iterate over users to be ready for actual sinking. Replace users via
    // unrechable blocks with undef and make all user PHIs trivially replcable.
@@ -910,11 +948,12 @@ static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT,
      Use &U = UI.getUse();
      ++UI;
  
-    if (VisitedUsers.count(User))
+    if (VisitedUsers.count(User) || CurLoop->contains(User))
        continue;
  
      if (!DT->isReachableFromEntry(User->getParent())) {
        U = UndefValue::get(I.getType());
+      Changed = true;
        continue;
      }
  
@@ -927,6 +966,7 @@ static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT,
      BasicBlock *BB = PN->getIncomingBlock(U);
      if (!DT->isReachableFromEntry(BB)) {
        U = UndefValue::get(I.getType());
+      Changed = true;
        continue;
      }
  
@@ -935,7 +975,7 @@ static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT,
        continue;
  
      if (!canSplitPredecessors(PN))
-      return false;
+      return Changed;
  
      // Split predecessors of the PHI so that we can make users trivially
      // replacable.
@@ -947,6 +987,9 @@ static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT,
      UE = I.user_end();
    }
  
+  if (VisitedUsers.empty())
+    return Changed;
+
  #ifndef NDEBUG
    SmallVector<BasicBlock *, 32> ExitBlocks;
    CurLoop->getUniqueExitBlocks(ExitBlocks);
@@ -960,9 +1003,14 @@ static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT,
    // If this instruction is only used outside of the loop, then all users are
    // PHI nodes in exit blocks due to LCSSA form. Just RAUW them with clones of
    // the instruction.
-  while (!I.use_empty()) {
-    Value::user_iterator UI = I.user_begin();
-    PHINode *PN = cast<PHINode>(*UI);
+  SmallSetVector<User*, 8> Users(I.user_begin(), I.user_end());
+  for (auto *UI : Users) {
+    auto *User = cast<Instruction>(UI);
+
+    if (CurLoop->contains(User))
+      continue;
+
+    PHINode *PN = cast<PHINode>(User);
      assert(ExitBlockSet.count(PN->getParent()) &&
             "The LCSSA PHI is not in an exit block!");
      // The PHI must be trivially replacable.
@@ -970,6 +1018,7 @@ static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT,
                                                           SafetyInfo, CurLoop);
      PN->replaceAllUsesWith(New);
      PN->eraseFromParent();
+    Changed = true;
    }
    return Changed;
  }
diff --git a/test/Transforms/LICM/sink-foldable.ll b/test/Transforms/LICM/sink-foldable.ll

new file mode 100644 (file)

index 0000000..cf67e35
--- /dev/null
+++ b/test/Transforms/LICM/sink-foldable.ll
@@ -0,0 +1,147 @@
+; RUN: opt < %s  -licm -S   | FileCheck %s
+target triple = "aarch64--linux-gnueabi"
+
+; CHECK-LABEL:@test1
+; CHECK-LABEL:loopexit1:
+; CHECK: %[[PHI:.+]] = phi i8** [ %arrayidx0, %if.end ]
+; CHECK: getelementptr inbounds i8*, i8** %[[PHI]], i64 1
+define i8** @test1(i32 %j, i8** readonly %P, i8* readnone %Q) {
+entry:
+  %cmp0 = icmp slt i32 0, %j
+  br i1 %cmp0, label %for.body.lr.ph, label %return
+
+for.body.lr.ph:
+  br label %for.body
+
+for.body:
+  %P.addr = phi i8** [ %P, %for.body.lr.ph ], [ %arrayidx0, %if.end  ]
+  %i0 = phi i32 [ 0, %for.body.lr.ph ], [ %i.add, %if.end]
+
+  %i0.ext = sext i32 %i0 to i64
+  %arrayidx0 = getelementptr inbounds i8*, i8** %P.addr, i64 %i0.ext
+  %l0 = load i8*, i8** %arrayidx0, align 8
+  %cmp1 = icmp ugt i8* %l0, %Q
+  br i1 %cmp1, label %loopexit0, label %if.end
+
+if.end:                                           ; preds = %for.body
+  %arrayidx1 = getelementptr inbounds i8*, i8** %arrayidx0, i64 1
+  %l1 = load i8*, i8** %arrayidx1, align 8
+  %cmp4 = icmp ugt i8* %l1, %Q
+  %i.add = add nsw i32 %i0, 2
+  br i1 %cmp4, label %loopexit1, label %for.body
+
+loopexit0:
+  %p1 = phi i8** [%arrayidx0, %for.body]
+  br label %return
+
+loopexit1:
+  %p2 = phi i8** [%arrayidx1, %if.end]
+  br label  %return
+
+return:
+  %retval.0 = phi i8** [ %p1, %loopexit0 ], [%p2, %loopexit1], [ null, %entry ]
+  ret i8** %retval.0
+}
+
+; CHECK-LABEL: @test2
+; CHECK-LABEL: loopexit2:
+; CHECK: %[[PHI:.*]] = phi i8** [ %add.ptr, %if.end ]
+; CHECK: getelementptr inbounds i8*, i8** %[[PHI]]
+define i8** @test2(i32 %j, i8** readonly %P, i8* readnone %Q) {
+
+entry:
+  br label %for.body
+
+for.cond:
+  %i.addr.0 = phi i32 [ %add, %if.end ]
+  %P.addr.0 = phi i8** [ %add.ptr, %if.end ]
+  %cmp = icmp slt i32 %i.addr.0, %j
+  br i1 %cmp, label %for.body, label %loopexit0
+
+for.body:
+  %P.addr = phi i8** [ %P, %entry ], [ %P.addr.0, %for.cond ]
+  %i.addr = phi i32 [ 0, %entry ], [ %i.addr.0, %for.cond ]
+
+  %idx.ext = sext i32 %i.addr to i64
+  %add.ptr = getelementptr inbounds i8*, i8** %P.addr, i64 %idx.ext
+  %l0 = load i8*, i8** %add.ptr, align 8
+
+  %cmp1 = icmp ugt i8* %l0, %Q
+  br i1 %cmp1, label %loopexit1, label %if.end
+
+if.end:
+  %add.i = add i32 %i.addr, 1
+  %idx2.ext = sext i32 %add.i to i64
+  %arrayidx2 = getelementptr inbounds i8*, i8** %add.ptr, i64 %idx2.ext
+  %l1 = load i8*, i8** %arrayidx2, align 8
+  %cmp2 = icmp ugt i8* %l1, %Q
+  %add = add nsw i32 %add.i, 1
+  br i1 %cmp2, label %loopexit2, label %for.cond
+
+loopexit0:
+  %p0 = phi i8** [ null, %for.cond ]
+  br label %return
+
+loopexit1:
+  %p1 = phi i8** [ %add.ptr, %for.body ]
+  br label %return
+
+loopexit2:
+  %p2 = phi i8** [ %arrayidx2, %if.end ]
+  br label %return
+
+return:
+  %retval.0 = phi i8** [ %p1, %loopexit1 ], [ %p2, %loopexit2 ], [ %p0, %loopexit0 ]
+  ret i8** %retval.0
+}
+
+
+; CHECK-LABEL: @test3
+; CHECK-LABEL: loopexit1:
+; CHECK: %[[ADD:.*]]  = phi i64 [ %add, %if.end ]
+; CHECK: %[[ADDR:.*]] = phi i8** [ %P.addr, %if.end ]
+; CHECK: %[[TRUNC:.*]] = trunc i64 %[[ADD]] to i32
+; CHECK: getelementptr inbounds i8*, i8** %[[ADDR]], i32 %[[TRUNC]]
+; CHECK: call void @dummy(i32 %[[TRUNC]])
+define i8** @test3(i64 %j, i8** readonly %P, i8* readnone %Q) {
+entry:
+  %cmp0 = icmp slt i64 0, %j
+  br i1 %cmp0, label %for.body.lr.ph, label %return
+
+for.body.lr.ph:
+  br label %for.body
+
+for.body:
+  %P.addr = phi i8** [ %P, %for.body.lr.ph ], [ %arrayidx0, %if.end  ]
+  %i0 = phi i32 [ 0, %for.body.lr.ph ], [ %i.add, %if.end]
+
+  %i0.ext = sext i32 %i0 to i64
+  %arrayidx0 = getelementptr inbounds i8*, i8** %P.addr, i64 %i0.ext
+  %l0 = load i8*, i8** %arrayidx0, align 8
+  %cmp1 = icmp ugt i8* %l0, %Q
+  br i1 %cmp1, label %loopexit0, label %if.end
+
+if.end:                                           ; preds = %for.body
+  %add = add i64 %i0.ext, 1
+  %trunc = trunc i64 %add to i32
+  %arrayidx1 = getelementptr inbounds i8*, i8** %P.addr, i32 %trunc
+  %l1 = load i8*, i8** %arrayidx1, align 8
+  %cmp4 = icmp ugt i8* %l1, %Q
+  %i.add = add nsw i32 %i0, 2
+  br i1 %cmp4, label %loopexit1, label %for.body
+
+loopexit0:
+  %p1 = phi i8** [%arrayidx0, %for.body]
+  br label %return
+
+loopexit1:
+  %p2 = phi i8** [%arrayidx1, %if.end]
+  call void @dummy(i32 %trunc)
+  br label  %return
+
+return:
+  %retval.0 = phi i8** [ %p1, %loopexit0 ], [%p2, %loopexit1], [ null, %entry ]
+  ret i8** %retval.0
+}
+
+declare void @dummy(i32)
author	Jun Bum Lim <junbuml@codeaurora.org>
	Fri, 15 Dec 2017 16:09:54 +0000 (16:09 +0000)
committer	Jun Bum Lim <junbuml@codeaurora.org>
	Fri, 15 Dec 2017 16:09:54 +0000 (16:09 +0000)
include/llvm/Transforms/Utils/LoopUtils.h		patch \| blob \| history
lib/Transforms/Scalar/LICM.cpp		patch \| blob \| history
test/Transforms/LICM/sink-foldable.ll	[new file with mode: 0644]	patch \| blob