class Function;
class GlobalVariable;
class Instruction;
+class ProfileSummaryInfo;
class TargetTransformInfo;
/// A private "module" namespace for types and utilities used by
// Glue for old PM.
bool runImpl(Function &F, TargetTransformInfo &TTI, DominatorTree &DT,
- BlockFrequencyInfo *BFI, BasicBlock &Entry);
+ BlockFrequencyInfo *BFI, BasicBlock &Entry,
+ ProfileSummaryInfo *PSI);
void cleanup() {
ClonedCastMap.clear();
LLVMContext *Ctx;
const DataLayout *DL;
BasicBlock *Entry;
+ ProfileSummaryInfo *PSI;
/// Keeps track of constant candidates found in the function.
using ConstCandVecType = std::vector<consthoist::ConstantCandidate>;
class BasicBlock;
class Function;
class OptimizationRemarkEmitter;
+class BlockFrequencyInfo;
+class ProfileSummaryInfo;
/// This class implements simplifications for calls to fortified library
/// functions (__st*cpy_chk, __memcpy_chk, __memmove_chk, __memset_chk), to,
const DataLayout &DL;
const TargetLibraryInfo *TLI;
OptimizationRemarkEmitter &ORE;
+ BlockFrequencyInfo *BFI;
+ ProfileSummaryInfo *PSI;
bool UnsafeFPShrink;
function_ref<void(Instruction *, Value *)> Replacer;
function_ref<void(Instruction *)> Eraser;
LibCallSimplifier(
const DataLayout &DL, const TargetLibraryInfo *TLI,
OptimizationRemarkEmitter &ORE,
+ BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
function_ref<void(Instruction *, Value *)> Replacer =
&replaceAllUsesWithDefault,
function_ref<void(Instruction *)> Eraser = &eraseFromParentDefault);
--- /dev/null
+//===- llvm/Transforms/Utils/SizeOpts.h - size optimization -----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains some shared code size optimization related code.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_UTILS_SIZEOPTS_H
+#define LLVM_TRANSFORMS_UTILS_SiZEOPTS_H
+
+namespace llvm {
+
+class BasicBlock;
+class BlockFrequencyInfo;
+class Function;
+class ProfileSummaryInfo;
+
+/// Returns true if function \p F is suggested to be size-optimized base on the
+/// profile.
+bool shouldOptimizeForSize(Function *F, ProfileSummaryInfo *PSI,
+ BlockFrequencyInfo *BFI);
+/// Returns true if basic block \p BB is suggested to be size-optimized base
+/// on the profile.
+bool shouldOptimizeForSize(BasicBlock *BB, ProfileSummaryInfo *PSI,
+ BlockFrequencyInfo *BFI);
+
+} // end namespace llvm
+
+#endif // LLVM_TRANSFORMS_UTILS_SiZEOPTS_H
class AssumptionCache;
class BasicBlock;
+class BlockFrequencyInfo;
class DependenceInfo;
class DominatorTree;
class Loop;
class LoopInfo;
class MDNode;
+class ProfileSummaryInfo;
class OptimizationRemarkEmitter;
class ScalarEvolution;
MDNode *GetUnrollMetadata(MDNode *LoopID, StringRef Name);
TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(
- Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, int OptLevel,
+ Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI,
+ BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, int OptLevel,
Optional<unsigned> UserThreshold, Optional<unsigned> UserCount,
Optional<bool> UserAllowPartial, Optional<bool> UserRuntime,
Optional<bool> UserUpperBound, Optional<bool> UserAllowPeeling);
class LoopAccessInfo;
class LoopInfo;
class OptimizationRemarkEmitter;
+class ProfileSummaryInfo;
class ScalarEvolution;
class TargetLibraryInfo;
class TargetTransformInfo;
AssumptionCache *AC;
std::function<const LoopAccessInfo &(Loop &)> *GetLAA;
OptimizationRemarkEmitter *ORE;
+ ProfileSummaryInfo *PSI;
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_,
std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
- OptimizationRemarkEmitter &ORE);
+ OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_);
bool processLoop(Loop *L);
};
Options.DoCounterPromotion = true;
Options.UseBFIInPromotion = IsCS;
MPM.addPass(InstrProfiling(Options, IsCS));
- } else if (!ProfileFile.empty())
+ } else if (!ProfileFile.empty()) {
MPM.addPass(PGOInstrumentationUse(ProfileFile, ProfileRemappingFile, IsCS));
+ // Cache ProfileSummaryAnalysis once to avoid the potential need to insert
+ // RequireAnalysisPass for PSI before subsequent non-module passes.
+ MPM.addPass(RequireAnalysisPass<ProfileSummaryAnalysis, Module>());
+ }
}
static InlineParams
MPM.addPass(SampleProfileLoaderPass(PGOOpt->ProfileFile,
PGOOpt->ProfileRemappingFile,
Phase == ThinLTOPhase::PreLink));
+ // Cache ProfileSummaryAnalysis once to avoid the potential need to insert
+ // RequireAnalysisPass for PSI before subsequent non-module passes.
+ MPM.addPass(RequireAnalysisPass<ProfileSummaryAnalysis, Module>());
// Do not invoke ICP in the ThinLTOPrelink phase as it makes it hard
// for the profile annotation to be accurate in the ThinLTO backend.
if (Phase != ThinLTOPhase::PreLink)
MPM.addPass(SampleProfileLoaderPass(PGOOpt->ProfileFile,
PGOOpt->ProfileRemappingFile,
false /* ThinLTOPhase::PreLink */));
+ // Cache ProfileSummaryAnalysis once to avoid the potential need to insert
+ // RequireAnalysisPass for PSI before subsequent non-module passes.
+ MPM.addPass(RequireAnalysisPass<ProfileSummaryAnalysis, Module>());
}
// Remove unused virtual tables to improve the quality of code generated by
auto InstCombineErase = [this](Instruction *I) {
eraseInstFromFunction(*I);
};
- LibCallSimplifier Simplifier(DL, &TLI, ORE, InstCombineRAUW,
+ LibCallSimplifier Simplifier(DL, &TLI, ORE, BFI, PSI, InstCombineRAUW,
InstCombineErase);
if (Value *With = Simplifier.optimizeCall(CI)) {
++NumSimplified;
class APInt;
class AssumptionCache;
+class BlockFrequencyInfo;
class DataLayout;
class DominatorTree;
class GEPOperator;
class GlobalVariable;
class LoopInfo;
class OptimizationRemarkEmitter;
+class ProfileSummaryInfo;
class TargetLibraryInfo;
class User;
const DataLayout &DL;
const SimplifyQuery SQ;
OptimizationRemarkEmitter &ORE;
+ BlockFrequencyInfo *BFI;
+ ProfileSummaryInfo *PSI;
// Optional analyses. When non-null, these can both be used to do better
// combining and will be updated to reflect any changes.
InstCombiner(InstCombineWorklist &Worklist, BuilderTy &Builder,
bool MinimizeSize, bool ExpensiveCombines, AliasAnalysis *AA,
AssumptionCache &AC, TargetLibraryInfo &TLI, DominatorTree &DT,
- OptimizationRemarkEmitter &ORE, const DataLayout &DL,
- LoopInfo *LI)
+ OptimizationRemarkEmitter &ORE, BlockFrequencyInfo *BFI,
+ ProfileSummaryInfo *PSI, const DataLayout &DL, LoopInfo *LI)
: Worklist(Worklist), Builder(Builder), MinimizeSize(MinimizeSize),
ExpensiveCombines(ExpensiveCombines), AA(AA), AC(AC), TLI(TLI), DT(DT),
- DL(DL), SQ(DL, &TLI, &DT, &AC), ORE(ORE), LI(LI) {}
+ DL(DL), SQ(DL, &TLI, &DT, &AC), ORE(ORE), BFI(BFI), PSI(PSI), LI(LI) {}
/// Run the combiner over the entire worklist until it is empty.
///
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/EHPersonalities.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LazyBlockFrequencyInfo.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/TargetFolder.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/ValueTracking.h"
static bool combineInstructionsOverFunction(
Function &F, InstCombineWorklist &Worklist, AliasAnalysis *AA,
AssumptionCache &AC, TargetLibraryInfo &TLI, DominatorTree &DT,
- OptimizationRemarkEmitter &ORE, bool ExpensiveCombines = true,
+ OptimizationRemarkEmitter &ORE, BlockFrequencyInfo *BFI,
+ ProfileSummaryInfo *PSI, bool ExpensiveCombines = true,
LoopInfo *LI = nullptr) {
auto &DL = F.getParent()->getDataLayout();
ExpensiveCombines |= EnableExpensiveCombines;
MadeIRChange |= prepareICWorklistFromFunction(F, DL, &TLI, Worklist);
InstCombiner IC(Worklist, Builder, F.hasMinSize(), ExpensiveCombines, AA,
- AC, TLI, DT, ORE, DL, LI);
+ AC, TLI, DT, ORE, BFI, PSI, DL, LI);
IC.MaxArraySizeForCombine = MaxArraySize;
if (!IC.run())
auto *LI = AM.getCachedResult<LoopAnalysis>(F);
auto *AA = &AM.getResult<AAManager>(F);
+ const ModuleAnalysisManager &MAM =
+ AM.getResult<ModuleAnalysisManagerFunctionProxy>(F).getManager();
+ ProfileSummaryInfo *PSI =
+ MAM.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
+ auto *BFI = (PSI && PSI->hasProfileSummary()) ?
+ &AM.getResult<BlockFrequencyAnalysis>(F) : nullptr;
+
if (!combineInstructionsOverFunction(F, Worklist, AA, AC, TLI, DT, ORE,
- ExpensiveCombines, LI))
+ BFI, PSI, ExpensiveCombines, LI))
// No changes, all analyses are preserved.
return PreservedAnalyses::all();
AU.addPreserved<AAResultsWrapperPass>();
AU.addPreserved<BasicAAWrapperPass>();
AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addRequired<ProfileSummaryInfoWrapperPass>();
+ LazyBlockFrequencyInfoPass::getLazyBFIAnalysisUsage(AU);
}
bool InstructionCombiningPass::runOnFunction(Function &F) {
// Optional analyses.
auto *LIWP = getAnalysisIfAvailable<LoopInfoWrapperPass>();
auto *LI = LIWP ? &LIWP->getLoopInfo() : nullptr;
+ ProfileSummaryInfo *PSI =
+ &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+ BlockFrequencyInfo *BFI =
+ (PSI && PSI->hasProfileSummary()) ?
+ &getAnalysis<LazyBlockFrequencyInfoPass>().getBFI() :
+ nullptr;
return combineInstructionsOverFunction(F, Worklist, AA, AC, TLI, DT, ORE,
- ExpensiveCombines, LI);
+ BFI, PSI, ExpensiveCombines, LI);
}
char InstructionCombiningPass::ID = 0;
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LazyBlockFrequencyInfoPass)
+INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
INITIALIZE_PASS_END(InstructionCombiningPass, "instcombine",
"Combine redundant instructions", false, false)
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/SizeOpts.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
if (ConstHoistWithBlockFrequency)
AU.addRequired<BlockFrequencyInfoWrapperPass>();
AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<ProfileSummaryInfoWrapperPass>();
AU.addRequired<TargetTransformInfoWrapperPass>();
}
"Constant Hoisting", false, false)
INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(ConstantHoistingLegacyPass, "consthoist",
"Constant Hoisting", false, false)
ConstHoistWithBlockFrequency
? &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI()
: nullptr,
- Fn.getEntryBlock());
+ Fn.getEntryBlock(),
+ &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI());
if (MadeChange) {
LLVM_DEBUG(dbgs() << "********** Function after Constant Hoisting: "
ConstCandVecType::iterator &MaxCostItr) {
unsigned NumUses = 0;
- if(!Entry->getParent()->hasOptSize() || std::distance(S,E) > 100) {
+ bool OptForSize = Entry->getParent()->hasOptSize() ||
+ llvm::shouldOptimizeForSize(Entry->getParent(), PSI, BFI);
+ if (!OptForSize || std::distance(S,E) > 100) {
for (auto ConstCand = S; ConstCand != E; ++ConstCand) {
NumUses += ConstCand->Uses.size();
if (ConstCand->CumulativeCost > MaxCostItr->CumulativeCost)
/// Optimize expensive integer constants in the given function.
bool ConstantHoistingPass::runImpl(Function &Fn, TargetTransformInfo &TTI,
DominatorTree &DT, BlockFrequencyInfo *BFI,
- BasicBlock &Entry) {
+ BasicBlock &Entry, ProfileSummaryInfo *PSI) {
this->TTI = &TTI;
this->DT = &DT;
this->BFI = BFI;
this->DL = &Fn.getParent()->getDataLayout();
this->Ctx = &Fn.getContext();
this->Entry = &Entry;
+ this->PSI = PSI;
// Collect all constant candidates.
collectConstantCandidates(Fn);
auto BFI = ConstHoistWithBlockFrequency
? &AM.getResult<BlockFrequencyAnalysis>(F)
: nullptr;
- if (!runImpl(F, TTI, DT, BFI, F.getEntryBlock()))
+ auto &MAM = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F).getManager();
+ auto *PSI = MAM.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
+ if (!runImpl(F, TTI, DT, BFI, F.getEntryBlock(), PSI))
return PreservedAnalyses::all();
PreservedAnalyses PA;
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/LazyBlockFrequencyInfo.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
+#include "llvm/Transforms/Utils/SizeOpts.h"
#include <algorithm>
#include <cassert>
#include <forward_list>
class LoadEliminationForLoop {
public:
LoadEliminationForLoop(Loop *L, LoopInfo *LI, const LoopAccessInfo &LAI,
- DominatorTree *DT)
- : L(L), LI(LI), LAI(LAI), DT(DT), PSE(LAI.getPSE()) {}
+ DominatorTree *DT, BlockFrequencyInfo *BFI,
+ ProfileSummaryInfo* PSI)
+ : L(L), LI(LI), LAI(LAI), DT(DT), BFI(BFI), PSI(PSI), PSE(LAI.getPSE()) {}
/// Look through the loop-carried and loop-independent dependences in
/// this loop and find store->load dependences.
}
if (!Checks.empty() || !LAI.getPSE().getUnionPredicate().isAlwaysTrue()) {
- if (L->getHeader()->getParent()->hasOptSize()) {
+ auto *HeaderBB = L->getHeader();
+ auto *F = HeaderBB->getParent();
+ bool OptForSize = F->hasOptSize() ||
+ llvm::shouldOptimizeForSize(HeaderBB, PSI, BFI);
+ if (OptForSize) {
LLVM_DEBUG(
dbgs() << "Versioning is needed but not allowed when optimizing "
"for size.\n");
LoopInfo *LI;
const LoopAccessInfo &LAI;
DominatorTree *DT;
+ BlockFrequencyInfo *BFI;
+ ProfileSummaryInfo *PSI;
PredicatedScalarEvolution PSE;
};
static bool
eliminateLoadsAcrossLoops(Function &F, LoopInfo &LI, DominatorTree &DT,
+ BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
function_ref<const LoopAccessInfo &(Loop &)> GetLAI) {
// Build up a worklist of inner-loops to transform to avoid iterator
// invalidation.
bool Changed = false;
for (Loop *L : Worklist) {
// The actual work is performed by LoadEliminationForLoop.
- LoadEliminationForLoop LEL(L, &LI, GetLAI(*L), &DT);
+ LoadEliminationForLoop LEL(L, &LI, GetLAI(*L), &DT, BFI, PSI);
Changed |= LEL.processLoop();
}
return Changed;
auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
auto &LAA = getAnalysis<LoopAccessLegacyAnalysis>();
auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+ auto *BFI = (PSI && PSI->hasProfileSummary()) ?
+ &getAnalysis<LazyBlockFrequencyInfoPass>().getBFI() :
+ nullptr;
// Process each loop nest in the function.
return eliminateLoadsAcrossLoops(
- F, LI, DT,
+ F, LI, DT, BFI, PSI,
[&LAA](Loop &L) -> const LoopAccessInfo & { return LAA.getInfo(&L); });
}
AU.addRequired<DominatorTreeWrapperPass>();
AU.addPreserved<DominatorTreeWrapperPass>();
AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addRequired<ProfileSummaryInfoWrapperPass>();
+ LazyBlockFrequencyInfoPass::getLazyBFIAnalysisUsage(AU);
}
};
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
+INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LazyBlockFrequencyInfoPass)
INITIALIZE_PASS_END(LoopLoadElimination, LLE_OPTION, LLE_name, false, false)
FunctionPass *llvm::createLoopLoadEliminationPass() {
auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
auto &AA = AM.getResult<AAManager>(F);
auto &AC = AM.getResult<AssumptionAnalysis>(F);
+ auto &MAM = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F).getManager();
+ auto *PSI = MAM.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
+ auto *BFI = (PSI && PSI->hasProfileSummary()) ?
+ &AM.getResult<BlockFrequencyAnalysis>(F) : nullptr;
MemorySSA *MSSA = EnableMSSALoopDependency
? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
: nullptr;
auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
bool Changed = eliminateLoadsAcrossLoops(
- F, LI, DT, [&](Loop &L) -> const LoopAccessInfo & {
+ F, LI, DT, BFI, PSI, [&](Loop &L) -> const LoopAccessInfo & {
LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA};
return LAM.getResult<LoopAccessAnalysis>(L, AR);
});
return LoopUnrollResult::Unmodified;
TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences(
- L, SE, TTI, OptLevel, None, None, None, None, None, None);
+ L, SE, TTI, nullptr, nullptr, OptLevel,
+ None, None, None, None, None, None);
if (AllowUnrollAndJam.getNumOccurrences() > 0)
UP.UnrollAndJam = AllowUnrollAndJam;
if (UnrollAndJamThreshold.getNumOccurrences() > 0)
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/LazyBlockFrequencyInfo.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Utils/UnrollLoop.h"
#include <algorithm>
#include <cassert>
/// Gather the various unrolling parameters based on the defaults, compiler
/// flags, TTI overrides and user specified parameters.
TargetTransformInfo::UnrollingPreferences llvm::gatherUnrollingPreferences(
- Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, int OptLevel,
+ Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI,
+ BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, int OptLevel,
Optional<unsigned> UserThreshold, Optional<unsigned> UserCount,
Optional<bool> UserAllowPartial, Optional<bool> UserRuntime,
Optional<bool> UserUpperBound, Optional<bool> UserAllowPeeling) {
TTI.getUnrollingPreferences(L, SE, UP);
// Apply size attributes
- if (L->getHeader()->getParent()->hasOptSize()) {
+ bool OptForSize = L->getHeader()->getParent()->hasOptSize() ||
+ llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI);
+ if (OptForSize) {
UP.Threshold = UP.OptSizeThreshold;
UP.PartialThreshold = UP.PartialOptSizeThreshold;
}
static LoopUnrollResult tryToUnrollLoop(
Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
const TargetTransformInfo &TTI, AssumptionCache &AC,
- OptimizationRemarkEmitter &ORE, bool PreserveLCSSA, int OptLevel,
+ OptimizationRemarkEmitter &ORE,
+ BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
+ bool PreserveLCSSA, int OptLevel,
bool OnlyWhenForced, bool ForgetAllSCEV, Optional<unsigned> ProvidedCount,
Optional<unsigned> ProvidedThreshold, Optional<bool> ProvidedAllowPartial,
Optional<bool> ProvidedRuntime, Optional<bool> ProvidedUpperBound,
bool NotDuplicatable;
bool Convergent;
TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences(
- L, SE, TTI, OptLevel, ProvidedThreshold, ProvidedCount,
+ L, SE, TTI, BFI, PSI, OptLevel, ProvidedThreshold, ProvidedCount,
ProvidedAllowPartial, ProvidedRuntime, ProvidedUpperBound,
ProvidedAllowPeeling);
// Exit early if unrolling is disabled.
bool PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
LoopUnrollResult Result = tryToUnrollLoop(
- L, DT, LI, SE, TTI, AC, ORE, PreserveLCSSA, OptLevel, OnlyWhenForced,
+ L, DT, LI, SE, TTI, AC, ORE, nullptr, nullptr,
+ PreserveLCSSA, OptLevel, OnlyWhenForced,
ForgetAllSCEV, ProvidedCount, ProvidedThreshold, ProvidedAllowPartial,
ProvidedRuntime, ProvidedUpperBound, ProvidedAllowPeeling);
bool Changed =
tryToUnrollLoop(&L, AR.DT, &AR.LI, AR.SE, AR.TTI, AR.AC, *ORE,
+ /*BFI*/ nullptr, /*PSI*/ nullptr,
/*PreserveLCSSA*/ true, OptLevel, OnlyWhenForced,
/*ForgetAllSCEV*/ false, /*Count*/ None,
/*Threshold*/ None, /*AllowPartial*/ false,
AM.getResult<ModuleAnalysisManagerFunctionProxy>(F).getManager();
ProfileSummaryInfo *PSI =
MAM.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
+ auto *BFI = (PSI && PSI->hasProfileSummary()) ?
+ &AM.getResult<BlockFrequencyAnalysis>(F) : nullptr;
bool Changed = false;
// The API here is quite complex to call and we allow to select some
// flavors of unrolling during construction time (by setting UnrollOpts).
LoopUnrollResult Result = tryToUnrollLoop(
- &L, DT, &LI, SE, TTI, AC, ORE,
+ &L, DT, &LI, SE, TTI, AC, ORE, BFI, PSI,
/*PreserveLCSSA*/ true, UnrollOpts.OptLevel, UnrollOpts.OnlyWhenForced,
/*ForgetAllSCEV*/ false, /*Count*/ None,
/*Threshold*/ None, UnrollOpts.AllowPartial, UnrollOpts.AllowRuntime,
SimplifyCFG.cpp
SimplifyIndVar.cpp
SimplifyLibCalls.cpp
+ SizeOpts.cpp
SplitModule.cpp
StripNonLineTableDebugInfo.cpp
SymbolRewriter.cpp
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/Triple.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/Utils/BuildLibCalls.h"
+#include "llvm/Transforms/Utils/SizeOpts.h"
using namespace llvm;
using namespace PatternMatch;
// Don't rewrite fputs to fwrite when optimising for size because fwrite
// requires more arguments and thus extra MOVs are required.
- if (CI->getFunction()->hasOptSize())
+ bool OptForSize = CI->getFunction()->hasOptSize() ||
+ llvm::shouldOptimizeForSize(CI->getParent(), PSI, BFI);
+ if (OptForSize)
return nullptr;
// Check if has any use
LibCallSimplifier::LibCallSimplifier(
const DataLayout &DL, const TargetLibraryInfo *TLI,
OptimizationRemarkEmitter &ORE,
+ BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
function_ref<void(Instruction *, Value *)> Replacer,
function_ref<void(Instruction *)> Eraser)
- : FortifiedSimplifier(TLI), DL(DL), TLI(TLI), ORE(ORE),
+ : FortifiedSimplifier(TLI), DL(DL), TLI(TLI), ORE(ORE), BFI(BFI), PSI(PSI),
UnsafeFPShrink(false), Replacer(Replacer), Eraser(Eraser) {}
void LibCallSimplifier::replaceAllUsesWith(Instruction *I, Value *With) {
--- /dev/null
+//===-- SizeOpts.cpp - code size optimization related code ----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains some shared code size optimization related code.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Transforms/Utils/SizeOpts.h"
+using namespace llvm;
+
+static cl::opt<bool> ProfileGuidedSizeOpt(
+ "pgso", cl::Hidden, cl::init(true),
+ cl::desc("Enable the profile guided size optimization. "));
+
+bool llvm::shouldOptimizeForSize(Function *F, ProfileSummaryInfo *PSI,
+ BlockFrequencyInfo *BFI) {
+ assert(F);
+ if (!PSI || !BFI || !PSI->hasProfileSummary())
+ return false;
+ return ProfileGuidedSizeOpt && PSI->isFunctionColdInCallGraph(F, *BFI);
+}
+
+bool llvm::shouldOptimizeForSize(BasicBlock *BB, ProfileSummaryInfo *PSI,
+ BlockFrequencyInfo *BFI) {
+ assert(BB);
+ if (!PSI || !BFI || !PSI->hasProfileSummary())
+ return false;
+ return ProfileGuidedSizeOpt && PSI->isColdBlock(BB, BFI);
+}
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
+#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
+ auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
std::function<const LoopAccessInfo &(Loop &)> GetLAA =
[&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
- GetLAA, *ORE);
+ GetLAA, *ORE, PSI);
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addPreserved<BasicAAWrapperPass>();
AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addRequired<ProfileSummaryInfoWrapperPass>();
}
};
INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
namespace llvm {
Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
- OptimizationRemarkEmitter *ORE, LoopVectorizeHints &Hints) {
+ OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
+ ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {
assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
Function *F = L->getHeader()->getParent();
// Get user vectorization factor.
const unsigned UserVF = Hints.getWidth();
- // Check the function attributes to find out if this function should be
- // optimized for size.
+ // Check the function attributes and profiles to find out if this function
+ // should be optimized for size.
bool OptForSize =
- Hints.getForce() != LoopVectorizeHints::FK_Enabled && F->hasOptSize();
+ Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
+ (F->hasOptSize() ||
+ llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI));
// Plan how to best vectorize, return the best VF and its cost.
const VectorizationFactor VF = LVP.planInVPlanNativePath(OptForSize, UserVF);
return false;
}
- // Check the function attributes to find out if this function should be
- // optimized for size.
+ // Check the function attributes and profiles to find out if this function
+ // should be optimized for size.
bool OptForSize =
- Hints.getForce() != LoopVectorizeHints::FK_Enabled && F->hasOptSize();
+ Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
+ (F->hasOptSize() ||
+ llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI));
// Entrance to the VPlan-native vectorization path. Outer loops are processed
// here. They may require CFG and instruction level transformations before
// pipeline.
if (!L->empty())
return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
- ORE, Hints);
+ ORE, BFI, PSI, Hints);
assert(L->empty() && "Inner loop expected.");
// Check the loop for a trip count threshold: vectorize loops with a tiny trip
DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_,
std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
- OptimizationRemarkEmitter &ORE_) {
+ OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
SE = &SE_;
LI = &LI_;
TTI = &TTI_;
GetLAA = &GetLAA_;
DB = &DB_;
ORE = &ORE_;
+ PSI = PSI_;
// Don't attempt if
// 1. the target claims to have no vector registers, and
LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA};
return LAM.getResult<LoopAccessAnalysis>(L, AR);
};
+ const ModuleAnalysisManager &MAM =
+ AM.getResult<ModuleAnalysisManagerFunctionProxy>(F).getManager();
+ ProfileSummaryInfo *PSI =
+ MAM.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
bool Changed =
- runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE);
+ runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
if (!Changed)
return PreservedAnalyses::all();
PreservedAnalyses PA;
; CHECK-O-NEXT: Running pass: InstCombinePass
; CHECK-O-NEXT: Running analysis: OptimizationRemarkEmitterAnalysis
; CHECK-O-NEXT: Running analysis: AAManager
+; CHECK-O-NEXT: Running analysis: OuterAnalysisManagerProxy
; CHECK-EP-PEEPHOLE-NEXT: Running pass: NoOpFunctionPass
; CHECK-O-NEXT: Running pass: SimplifyCFGPass
; CHECK-O-NEXT: Finished llvm::Function pass manager run.
; CHECK-O-NEXT: Running pass: SLPVectorizerPass
; CHECK-O-NEXT: Running pass: InstCombinePass
; CHECK-O-NEXT: Running pass: LoopUnrollPass
-; CHECK-O-NEXT: Running analysis: OuterAnalysisManagerProxy
; CHECK-O-NEXT: Running pass: WarnMissedTransformationsPass
; CHECK-O-NEXT: Running pass: InstCombinePass
; CHECK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}OptimizationRemarkEmitterAnalysis
; CHECK-O2-NEXT: Starting llvm::Function pass manager run.
; CHECK-O3-NEXT: Running pass: AggressiveInstCombinePass
; CHECK-O2-NEXT: Running pass: InstCombinePass
+; CHECK-O2-NEXT: Running analysis: OuterAnalysisManagerProxy
; CHECK-EP-Peephole-NEXT: Running pass: NoOpFunctionPass
; CHECK-O2-NEXT: Finished llvm::Function pass manager run.
; CHECK-O2-NEXT: Running pass: ModuleToPostOrderCGSCCPassAdaptor<{{.*}}InlinerPass>
; CHECK-O-NEXT: Running pass: InstCombinePass
; CHECK-PRELINK-O-NEXT: Running analysis: OptimizationRemarkEmitterAnalysis
; CHECK-O-NEXT: Running analysis: AAManager
+; CHECK-O-NEXT: Running analysis: OuterAnalysisManagerProxy
; CHECK-O-NEXT: Running pass: SimplifyCFGPass
; CHECK-O-NEXT: Finished llvm::Function pass manager run.
; CHECK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}GlobalsAA
; CHECK-POSTLINK-O-NEXT: Running pass: SLPVectorizerPass
; CHECK-POSTLINK-O-NEXT: Running pass: InstCombinePass
; CHECK-POSTLINK-O-NEXT: Running pass: LoopUnrollPass
-; CHECK-POSTLINK-O-NEXT: Running analysis: OuterAnalysisManagerProxy
; CHECK-POSTLINK-O-NEXT: Running pass: WarnMissedTransformationsPass
; CHECK-POSTLINK-O-NEXT: Running pass: InstCombinePass
; CHECK-POSTLINK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}OptimizationRemarkEmitterAnalysis
; CHECK-NEXT: Scalar Evolution Analysis
; CHECK-NEXT: Function Alias Analysis Results
; CHECK-NEXT: Loop Access Analysis
+; CHECK-NEXT: Lazy Branch Probability Analysis
+; CHECK-NEXT: Lazy Block Frequency Analysis
; CHECK-NEXT: Loop Load Elimination
; CHECK-NEXT: Basic Alias Analysis (stateless AA impl)
; CHECK-NEXT: Function Alias Analysis Results
; CHECK-NEXT: Scalar Evolution Analysis
; CHECK-NEXT: Function Alias Analysis Results
; CHECK-NEXT: Loop Access Analysis
+; CHECK-NEXT: Lazy Branch Probability Analysis
+; CHECK-NEXT: Lazy Block Frequency Analysis
; CHECK-NEXT: Loop Load Elimination
; CHECK-NEXT: Basic Alias Analysis (stateless AA impl)
; CHECK-NEXT: Function Alias Analysis Results
; CHECK-NEXT: Scalar Evolution Analysis
; CHECK-NEXT: Function Alias Analysis Results
; CHECK-NEXT: Loop Access Analysis
+; CHECK-NEXT: Lazy Branch Probability Analysis
+; CHECK-NEXT: Lazy Block Frequency Analysis
; CHECK-NEXT: Loop Load Elimination
; CHECK-NEXT: Basic Alias Analysis (stateless AA impl)
; CHECK-NEXT: Function Alias Analysis Results
; RUN: opt -mtriple=arm-arm-none-eabi -consthoist -S < %s | FileCheck %s
+; RUN: opt -mtriple=arm-arm-none-eabi -consthoist -pgso -S < %s | FileCheck %s -check-prefix=PGSO
+; RUN: opt -mtriple=arm-arm-none-eabi -consthoist -pgso=false -S < %s | FileCheck %s -check-prefix=NPGSO
; There are different candidates here for the base constant: 1073876992 and
; 1073876996. But we don't want to see the latter because it results in
entry:
; CHECK-LABEL: @foo
; CHECK-NOT: [[CONST1:%const_mat[0-9]*]] = add i32 %const, -4
+; CHECK-LABEL: @foo_pgso
%0 = load volatile i32, i32* inttoptr (i32 1073876992 to i32*), align 4096
%or = or i32 %0, 1
store volatile i32 %or, i32* inttoptr (i32 1073876992 to i32*), align 4096
}
attributes #0 = { minsize norecurse nounwind optsize readnone uwtable }
+
+define void @foo_pgso() #1 !prof !14 {
+entry:
+; PGSO-LABEL: @foo_pgso
+; PGSO-NOT: [[CONST2:%const_mat[0-9]*]] = add i32 %const, -4
+; NPGSO-LABEL: @foo_pgso
+; NPGSO: [[CONST2:%const_mat[0-9]*]] = add i32 %const, -4
+ %0 = load volatile i32, i32* inttoptr (i32 1073876992 to i32*), align 4096
+ %or = or i32 %0, 1
+ store volatile i32 %or, i32* inttoptr (i32 1073876992 to i32*), align 4096
+ %1 = load volatile i32, i32* inttoptr (i32 1073876996 to i32*), align 4
+ %and = and i32 %1, -117506048
+ store volatile i32 %and, i32* inttoptr (i32 1073876996 to i32*), align 4
+ %2 = load volatile i32, i32* inttoptr (i32 1073876992 to i32*), align 4096
+ %and1 = and i32 %2, -17367041
+ store volatile i32 %and1, i32* inttoptr (i32 1073876996 to i32*), align 4096
+ %3 = load volatile i32, i32* inttoptr (i32 1073876992 to i32*), align 4096
+ %and2 = and i32 %3, -262145
+ store volatile i32 %and2, i32* inttoptr (i32 1073876992 to i32*), align 4096
+ %4 = load volatile i32, i32* inttoptr (i32 1073876996 to i32*), align 4
+ %and3 = and i32 %4, -8323073
+ store volatile i32 %and3, i32* inttoptr (i32 1073876996 to i32*), align 4
+ store volatile i32 10420224, i32* inttoptr (i32 1073877000 to i32*), align 8
+ %5 = load volatile i32, i32* inttoptr (i32 1073876996 to i32*), align 4096
+ %or4 = or i32 %5, 65536
+ store volatile i32 %or4, i32* inttoptr (i32 1073876996 to i32*), align 4096
+ %6 = load volatile i32, i32* inttoptr (i32 1073881088 to i32*), align 8192
+ %or6.i.i = or i32 %6, 16
+ store volatile i32 %or6.i.i, i32* inttoptr (i32 1073881088 to i32*), align 8192
+ %7 = load volatile i32, i32* inttoptr (i32 1073881088 to i32*), align 8192
+ %and7.i.i = and i32 %7, -4
+ store volatile i32 %and7.i.i, i32* inttoptr (i32 1073881088 to i32*), align 8192
+ %8 = load volatile i32, i32* inttoptr (i32 1073881088 to i32*), align 8192
+ %or8.i.i = or i32 %8, 2
+ store volatile i32 %or8.i.i, i32* inttoptr (i32 1073881088 to i32*), align 8192
+ ret void
+}
+
+attributes #1 = { norecurse nounwind readnone uwtable } ; no optsize or minsize
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 10000}
+!4 = !{!"MaxCount", i64 10}
+!5 = !{!"MaxInternalCount", i64 1}
+!6 = !{!"MaxFunctionCount", i64 1000}
+!7 = !{!"NumCounts", i64 3}
+!8 = !{!"NumFunctions", i64 3}
+!9 = !{!"DetailedSummary", !10}
+!10 = !{!11, !12, !13}
+!11 = !{i32 10000, i64 100, i32 1}
+!12 = !{i32 999000, i64 100, i32 1}
+!13 = !{i32 999999, i64 1, i32 2}
+!14 = !{!"function_entry_count", i64 0}
; because it requires more arguments and thus extra MOVs are required.
;
; RUN: opt < %s -instcombine -S | FileCheck %s
+; RUN: opt < %s -instcombine -pgso -S | FileCheck %s -check-prefix=PGSO
+; RUN: opt < %s -instcombine -pgso=false -S | FileCheck %s -check-prefix=NPGSO
%struct._IO_FILE = type { i32, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, %struct._IO_marker*, %struct._IO_FILE*, i32, i32, i32, i16, i8, [1 x i8], i8*, i64, i8*, i8*, i8*, i8*, i32, i32, [40 x i8] }
%struct._IO_marker = type { %struct._IO_marker*, %struct._IO_FILE*, i32 }
attributes #0 = { nounwind optsize }
attributes #1 = { nounwind optsize }
+
+define i32 @main_pgso() local_unnamed_addr !prof !14 {
+entry:
+; PGSO-LABEL: @main_pgso(
+; PGSO-NOT: call i64 @fwrite
+; PGSO: call i32 @fputs
+; NPGSO-LABEL: @main_pgso(
+; NPGSO: call i64 @fwrite
+; NPGSO-NOT: call i32 @fputs
+
+ %call = tail call %struct._IO_FILE* @fopen(i8* getelementptr inbounds ([10 x i8], [10 x i8]* @.str, i32 0, i32 0), i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str.1, i32 0, i32 0)) #2
+ %call1 = tail call i32 @fputs(i8* getelementptr inbounds ([27 x i8], [27 x i8]* @.str.2, i32 0, i32 0), %struct._IO_FILE* %call) #2
+ ret i32 0
+}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 10000}
+!4 = !{!"MaxCount", i64 10}
+!5 = !{!"MaxInternalCount", i64 1}
+!6 = !{!"MaxFunctionCount", i64 1000}
+!7 = !{!"NumCounts", i64 3}
+!8 = !{!"NumFunctions", i64 3}
+!9 = !{!"DetailedSummary", !10}
+!10 = !{!11, !12, !13}
+!11 = !{i32 10000, i64 100, i32 1}
+!12 = !{i32 999000, i64 100, i32 1}
+!13 = !{i32 999999, i64 1, i32 2}
+!14 = !{!"function_entry_count", i64 0}
; RUN: opt -basicaa -loop-load-elim -S < %s | FileCheck %s
+; RUN: opt -basicaa -loop-load-elim -pgso -S < %s | FileCheck %s -check-prefix=PGSO
+; RUN: opt -basicaa -loop-load-elim -pgso=false -S < %s | FileCheck %s -check-prefix=NPGSO
; When optimizing for size don't eliminate in this loop because the loop would
; have to be versioned first because A and C may alias.
for.end: ; preds = %for.body
ret void
}
+
+
+; PGSO-LABEL: @f_pgso(
+; NPGSO-LABEL: @f_pgso(
+define void @f_pgso(i32* %A, i32* %B, i32* %C, i64 %N) !prof !14 {
+
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+
+ %Aidx_next = getelementptr inbounds i32, i32* %A, i64 %indvars.iv.next
+ %Bidx = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
+ %Cidx = getelementptr inbounds i32, i32* %C, i64 %indvars.iv
+ %Aidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+
+ %b = load i32, i32* %Bidx, align 4
+ %a_p1 = add i32 %b, 2
+ store i32 %a_p1, i32* %Aidx_next, align 4
+
+ %a = load i32, i32* %Aidx, align 4
+; PGSO: %c = mul i32 %a, 2
+; NPGSO-NOT: %c = mul i32 %a, 2
+ %c = mul i32 %a, 2
+ store i32 %c, i32* %Cidx, align 4
+
+ %exitcond = icmp eq i64 %indvars.iv.next, %N
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 10000}
+!4 = !{!"MaxCount", i64 10}
+!5 = !{!"MaxInternalCount", i64 1}
+!6 = !{!"MaxFunctionCount", i64 1000}
+!7 = !{!"NumCounts", i64 3}
+!8 = !{!"NumFunctions", i64 3}
+!9 = !{!"DetailedSummary", !10}
+!10 = !{!11, !12, !13}
+!11 = !{i32 10000, i64 100, i32 1}
+!12 = !{i32 999000, i64 100, i32 1}
+!13 = !{i32 999999, i64 1, i32 2}
+!14 = !{!"function_entry_count", i64 0}
; RUN: opt < %s -S -loop-unroll -unroll-count=4 | FileCheck -check-prefix=CHECK_COUNT4 %s\r
; RUN: opt < %s -S -loop-unroll | FileCheck -check-prefix=CHECK_NOCOUNT %s\r
+; RUN: opt < %s -S -passes='require<profile-summary>,function(unroll)' -pgso | FileCheck -check-prefix=PGSO %s\r
+; RUN: opt < %s -S -passes='require<profile-summary>,function(unroll)' -pgso=false | FileCheck -check-prefix=NPGSO %s\r
\r
\r
;///////////////////// TEST 1 //////////////////////////////\r
; CHECK_NOCOUNT-LABEL: @Test4\r
; CHECK_NOCOUNT: phi\r
; CHECK_NOCOUNT: icmp\r
+\r
+;///////////////////// TEST 5 //////////////////////////////\r
+\r
+; This test shows that with PGO, this loop is cold and not unrolled.\r
+\r
+define i32 @Test5() !prof !14 {\r
+entry:\r
+ br label %for.body\r
+\r
+for.body: ; preds = %for.body, %entry\r
+ %i.05 = phi i32 [ 0, %entry ], [ %inc, %for.body ]\r
+ %arrayidx = getelementptr inbounds [24 x i32], [24 x i32]* @tab, i32 0, i32 %i.05\r
+ store i32 %i.05, i32* %arrayidx, align 4\r
+ %inc = add nuw nsw i32 %i.05, 1\r
+ %exitcond = icmp eq i32 %inc, 24\r
+ br i1 %exitcond, label %for.end, label %for.body\r
+\r
+for.end: ; preds = %for.body\r
+ ret i32 42\r
+}\r
+\r
+; PGSO-LABEL: @Test5\r
+; PGSO: phi\r
+; PGSO: icmp\r
+; NPGSO-LABEL: @Test5\r
+; NPGSO-NOT: phi\r
+; NPGSO-NOT: icmp\r
+\r
+!llvm.module.flags = !{!0}\r
+!0 = !{i32 1, !"ProfileSummary", !1}\r
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}\r
+!2 = !{!"ProfileFormat", !"InstrProf"}\r
+!3 = !{!"TotalCount", i64 10000}\r
+!4 = !{!"MaxCount", i64 10}\r
+!5 = !{!"MaxInternalCount", i64 1}\r
+!6 = !{!"MaxFunctionCount", i64 1000}\r
+!7 = !{!"NumCounts", i64 3}\r
+!8 = !{!"NumFunctions", i64 3}\r
+!9 = !{!"DetailedSummary", !10}\r
+!10 = !{!11, !12, !13}\r
+!11 = !{i32 10000, i64 100, i32 1}\r
+!12 = !{i32 999000, i64 100, i32 1}\r
+!13 = !{i32 999999, i64 1, i32 2}\r
+!14 = !{!"function_entry_count", i64 0}\r
; loop with the optimize for size or the minimize size attributes.
; REQUIRES: asserts
; RUN: opt < %s -loop-vectorize -S | FileCheck %s
+; RUN: opt < %s -loop-vectorize -pgso -S | FileCheck %s -check-prefix=PGSO
+; RUN: opt < %s -loop-vectorize -pgso=false -S | FileCheck %s -check-prefix=NPGSO
target datalayout = "E-m:e-p:32:32-i64:32-f64:32:64-a:0:32-n32-S128"
; CHECK-LABEL: @foo_minsize(
; CHECK-NOT: <2 x i8>
; CHECK-NOT: <4 x i8>
+; CHECK-LABEL: @foo_pgso(
entry:
br label %for.body
attributes #1 = { minsize }
+define i32 @foo_pgso() !prof !14 {
+; PGSO-LABEL: @foo_pgso(
+; PGSO-NOT: <{{[0-9]+}} x i8>
+; NPGSO-LABEL: @foo_pgso(
+; NPGSO: <{{[0-9]+}} x i8>
+
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+ %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08
+ %0 = load i8, i8* %arrayidx, align 1
+ %cmp1 = icmp eq i8 %0, 0
+ %. = select i1 %cmp1, i8 2, i8 1
+ store i8 %., i8* %arrayidx, align 1
+ %inc = add nsw i32 %i.08, 1
+ %exitcond = icmp eq i32 %i.08, 202
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret i32 0
+}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 10000}
+!4 = !{!"MaxCount", i64 10}
+!5 = !{!"MaxInternalCount", i64 1}
+!6 = !{!"MaxFunctionCount", i64 1000}
+!7 = !{!"NumCounts", i64 3}
+!8 = !{!"NumFunctions", i64 3}
+!9 = !{!"DetailedSummary", !10}
+!10 = !{!11, !12, !13}
+!11 = !{i32 10000, i64 100, i32 1}
+!12 = !{i32 999000, i64 100, i32 1}
+!13 = !{i32 999999, i64 1, i32 2}
+!14 = !{!"function_entry_count", i64 0}