63#define DEBUG_TYPE "partial-inlining"
66 "Number of callsites functions partially inlined into.");
67STATISTIC(NumColdOutlinePartialInlined,
"Number of times functions with "
68 "cold outlined regions were partially "
69 "inlined into its caller(s).");
71 "Number of cold single entry/exit regions found.");
73 "Number of cold single entry/exit regions outlined.");
83 cl::desc(
"Disable multi-region partial inlining"));
89 cl::desc(
"Force outline regions with live exits"));
95 cl::desc(
"Mark outline function calls with ColdCC"));
108 cl::desc(
"Minimum ratio comparing relative sizes of each "
109 "outline candidate and original function"));
114 cl::desc(
"Minimum block executions to consider "
115 "its BranchProbabilityInfo valid"));
120 cl::desc(
"Minimum BranchProbability to consider a region cold."));
124 cl::desc(
"Max number of blocks to be partially inlined"));
130 cl::desc(
"Max number of partial inlining. The default is unlimited"));
138 cl::desc(
"Relative frequency of outline region to "
143 cl::desc(
"A debug option to add additional penalty to the computed one."));
147struct FunctionOutliningInfo {
148 FunctionOutliningInfo() =
default;
152 unsigned getNumInlinedBlocks()
const {
return Entries.size() + 1; }
168struct FunctionOutliningMultiRegionInfo {
169 FunctionOutliningMultiRegionInfo() =
default;
172 struct OutlineRegionInfo {
174 BasicBlock *ExitBlock, BasicBlock *ReturnBlock)
175 : Region(Region), EntryBlock(EntryBlock), ExitBlock(ExitBlock),
176 ReturnBlock(ReturnBlock) {}
177 SmallVector<BasicBlock *, 8> Region;
186struct PartialInlinerImpl {
189 function_ref<AssumptionCache &(Function &)> GetAC,
190 function_ref<AssumptionCache *(Function &)> LookupAC,
191 function_ref<TargetTransformInfo &(Function &)> GTTI,
192 function_ref<
const TargetLibraryInfo &(Function &)> GTLI,
193 ProfileSummaryInfo &ProfSI,
194 function_ref<BlockFrequencyInfo &(Function &)> GBFI =
nullptr)
195 : GetAssumptionCache(GetAC), LookupAssumptionCache(LookupAC),
196 GetTTI(GTTI), GetBFI(GBFI), GetTLI(GTLI), PSI(ProfSI) {}
206 std::pair<bool, Function *> unswitchFunction(Function &
F);
212 struct FunctionCloner {
215 FunctionCloner(Function *
F, FunctionOutliningInfo *OI,
216 OptimizationRemarkEmitter &ORE,
217 function_ref<AssumptionCache *(Function &)> LookupAC,
218 function_ref<TargetTransformInfo &(Function &)> GetTTI);
219 FunctionCloner(Function *
F, FunctionOutliningMultiRegionInfo *OMRI,
220 OptimizationRemarkEmitter &ORE,
221 function_ref<AssumptionCache *(Function &)> LookupAC,
222 function_ref<TargetTransformInfo &(Function &)> GetTTI);
229 void normalizeReturnBlock()
const;
232 bool doMultiRegionFunctionOutlining();
239 Function *doSingleRegionFunctionOutlining();
244 typedef std::pair<Function *, BasicBlock *> FuncBodyCallerPair;
250 bool IsFunctionInlined =
false;
254 std::unique_ptr<FunctionOutliningInfo> ClonedOI =
nullptr;
256 std::unique_ptr<FunctionOutliningMultiRegionInfo> ClonedOMRI =
nullptr;
257 std::unique_ptr<BlockFrequencyInfo> ClonedFuncBFI =
nullptr;
258 OptimizationRemarkEmitter &ORE;
259 function_ref<AssumptionCache *(
Function &)> LookupAC;
260 function_ref<TargetTransformInfo &(
Function &)> GetTTI;
264 int NumPartialInlining = 0;
265 function_ref<AssumptionCache &(
Function &)> GetAssumptionCache;
266 function_ref<AssumptionCache *(
Function &)> LookupAssumptionCache;
267 function_ref<TargetTransformInfo &(
Function &)> GetTTI;
268 function_ref<BlockFrequencyInfo &(
Function &)> GetBFI;
269 function_ref<
const TargetLibraryInfo &(
Function &)> GetTLI;
270 ProfileSummaryInfo &PSI;
277 getOutliningCallBBRelativeFreq(FunctionCloner &Cloner)
const;
281 bool shouldPartialInline(CallBase &CB, FunctionCloner &Cloner,
282 BlockFrequency WeightedOutliningRcost,
283 OptimizationRemarkEmitter &ORE)
const;
288 bool tryPartialInline(FunctionCloner &Cloner);
293 computeCallsiteToProfCountMap(Function *DuplicateFunction,
294 DenseMap<User *, uint64_t> &SiteCountMap)
const;
296 bool isLimitReached()
const {
301 static CallBase *getSupportedCallBase(User *U) {
308 static CallBase *getOneCallSiteTo(Function &
F) {
310 return getSupportedCallBase(User);
313 std::tuple<DebugLoc, BasicBlock *> getOneDebugLoc(Function &
F)
const {
314 CallBase *CB = getOneCallSiteTo(
F);
317 return std::make_tuple(DLoc,
Block);
326 std::tuple<InstructionCost, InstructionCost>
327 computeOutliningCosts(FunctionCloner &Cloner)
const;
333 TargetTransformInfo *
TTI);
335 std::unique_ptr<FunctionOutliningInfo>
336 computeOutliningInfo(Function &
F)
const;
338 std::unique_ptr<FunctionOutliningMultiRegionInfo>
339 computeOutliningColdRegionsInfo(Function &
F,
340 OptimizationRemarkEmitter &ORE)
const;
345std::unique_ptr<FunctionOutliningMultiRegionInfo>
346PartialInlinerImpl::computeOutliningColdRegionsInfo(
352 BranchProbabilityInfo BPI(
F, LI);
353 std::unique_ptr<BlockFrequencyInfo> ScopedBFI;
354 BlockFrequencyInfo *
BFI;
356 ScopedBFI.reset(
new BlockFrequencyInfo(
F, BPI, LI));
357 BFI = ScopedBFI.get();
362 if (!PSI.hasInstrumentationProfile())
363 return std::unique_ptr<FunctionOutliningMultiRegionInfo>();
365 std::unique_ptr<FunctionOutliningMultiRegionInfo> OutliningInfo =
366 std::make_unique<FunctionOutliningMultiRegionInfo>();
369 [&ORE](SmallVectorImpl<BasicBlock *> &BlockList) -> BasicBlock * {
371 for (
auto *
Block : BlockList) {
376 return OptimizationRemarkMissed(
DEBUG_TYPE,
"MultiExitRegion",
378 <<
"Region dominated by "
379 <<
ore::NV(
"Block", BlockList.front()->getName())
380 <<
" has more than one region exit edge.";
393 return BFI->getBlockProfileCount(BB).value_or(0);
398 TargetTransformInfo *FTTI = &GetTTI(
F);
401 OverallFunctionCost += computeBBInlineCost(&BB, FTTI);
403 LLVM_DEBUG(
dbgs() <<
"OverallFunctionCost = " << OverallFunctionCost
409 BranchProbability MinBranchProbability(
412 bool ColdCandidateFound =
false;
414 std::vector<BasicBlock *> DFS;
415 SmallPtrSet<BasicBlock *, 8> VisitedSet;
416 DFS.push_back(CurrEntry);
417 VisitedSet.
insert(CurrEntry);
425 while (!DFS.empty()) {
426 auto *ThisBB = DFS.back();
431 if (PSI.isColdBlock(ThisBB, BFI) ||
435 if (!VisitedSet.
insert(*SI).second)
439 BranchProbability SuccProb = BPI.getEdgeProbability(ThisBB, *SI);
440 if (SuccProb > MinBranchProbability)
443 LLVM_DEBUG(
dbgs() <<
"Found cold edge: " << ThisBB->getName() <<
"->"
445 <<
"\nBranch Probability = " << SuccProb <<
"\n";);
447 SmallVector<BasicBlock *, 8> DominateVector;
448 DT.getDescendants(*SI, DominateVector);
450 "SI should be reachable and have at least itself as descendant");
453 if (!DominateVector.
front()->hasNPredecessors(1)) {
455 <<
" doesn't have a single predecessor in the "
456 "dominator tree\n";);
462 if (!(ExitBlock = IsSingleExit(DominateVector))) {
464 <<
" doesn't have a unique successor\n";);
469 for (
auto *BB : DominateVector)
470 OutlineRegionCost += computeBBInlineCost(BB, &GetTTI(*BB->getParent()));
477 return OptimizationRemarkAnalysis(
DEBUG_TYPE,
"TooCostly",
480 <<
" inline cost-savings smaller than "
481 <<
ore::NV(
"Cost", MinOutlineRegionCost);
484 LLVM_DEBUG(
dbgs() <<
"ABORT: Outline region cost is smaller than "
485 << MinOutlineRegionCost <<
"\n";);
497 FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegInfo(
498 DominateVector, DominateVector.front(), ExitBlock, ReturnBlock);
499 OutliningInfo->ORI.push_back(RegInfo);
501 << DominateVector.front()->getName() <<
"\n";);
502 ColdCandidateFound =
true;
503 NumColdRegionsFound++;
507 if (ColdCandidateFound)
508 return OutliningInfo;
510 return std::unique_ptr<FunctionOutliningMultiRegionInfo>();
513std::unique_ptr<FunctionOutliningInfo>
514PartialInlinerImpl::computeOutliningInfo(Function &
F)
const {
518 return std::unique_ptr<FunctionOutliningInfo>();
531 if (IsReturnBlock(Succ1))
532 return std::make_tuple(Succ1, Succ2);
533 if (IsReturnBlock(Succ2))
534 return std::make_tuple(Succ2, Succ1);
536 return std::make_tuple<BasicBlock *, BasicBlock *>(
nullptr,
nullptr);
541 if (IsSuccessor(Succ1, Succ2))
542 return std::make_tuple(Succ1, Succ2);
543 if (IsSuccessor(Succ2, Succ1))
544 return std::make_tuple(Succ2, Succ1);
546 return std::make_tuple<BasicBlock *, BasicBlock *>(
nullptr,
nullptr);
549 std::unique_ptr<FunctionOutliningInfo> OutliningInfo =
550 std::make_unique<FunctionOutliningInfo>();
553 bool CandidateFound =
false;
568 std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);
571 OutliningInfo->Entries.push_back(CurrEntry);
572 OutliningInfo->ReturnBlock = ReturnBlock;
573 OutliningInfo->NonReturnBlock = NonReturnBlock;
574 CandidateFound =
true;
579 std::tie(CommSucc,
OtherSucc) = GetCommonSucc(Succ1, Succ2);
584 OutliningInfo->Entries.push_back(CurrEntry);
589 return std::unique_ptr<FunctionOutliningInfo>();
593 assert(OutliningInfo->Entries[0] == &
F.front() &&
594 "Function Entry must be the first in Entries vector");
599 auto HasNonEntryPred = [Entries](
BasicBlock *BB) {
601 if (!Entries.count(Pred))
606 auto CheckAndNormalizeCandidate =
607 [Entries, HasNonEntryPred](FunctionOutliningInfo *OutliningInfo) {
608 for (BasicBlock *
E : OutliningInfo->Entries) {
610 if (Entries.count(Succ))
612 if (Succ == OutliningInfo->ReturnBlock)
613 OutliningInfo->ReturnBlockPreds.push_back(
E);
614 else if (Succ != OutliningInfo->NonReturnBlock)
618 if (HasNonEntryPred(
E))
624 if (!CheckAndNormalizeCandidate(OutliningInfo.get()))
625 return std::unique_ptr<FunctionOutliningInfo>();
630 BasicBlock *Cand = OutliningInfo->NonReturnBlock;
634 if (HasNonEntryPred(Cand))
641 std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);
642 if (!ReturnBlock || ReturnBlock != OutliningInfo->ReturnBlock)
649 OutliningInfo->Entries.push_back(Cand);
650 OutliningInfo->NonReturnBlock = NonReturnBlock;
651 OutliningInfo->ReturnBlockPreds.push_back(Cand);
652 Entries.insert(Cand);
655 return OutliningInfo;
660 if (
F.hasProfileData())
663 for (
auto *
E : OI.Entries) {
673BranchProbability PartialInlinerImpl::getOutliningCallBBRelativeFreq(
674 FunctionCloner &Cloner)
const {
675 BasicBlock *OutliningCallBB = Cloner.OutlinedFunctions.back().second;
677 Cloner.ClonedFuncBFI->getBlockFreq(&Cloner.ClonedFunc->getEntryBlock());
678 auto OutliningCallFreq =
679 Cloner.ClonedFuncBFI->getBlockFreq(OutliningCallBB);
683 if (OutliningCallFreq.getFrequency() > EntryFreq.getFrequency())
684 OutliningCallFreq = EntryFreq;
687 OutliningCallFreq.getFrequency(), EntryFreq.getFrequency());
690 return OutlineRegionRelFreq;
704 if (OutlineRegionRelFreq < BranchProbability(45, 100))
705 return OutlineRegionRelFreq;
707 OutlineRegionRelFreq = std::max(
710 return OutlineRegionRelFreq;
713bool PartialInlinerImpl::shouldPartialInline(
714 CallBase &CB, FunctionCloner &Cloner, BlockFrequency WeightedOutliningRcost,
715 OptimizationRemarkEmitter &ORE)
const {
719 assert(Callee == Cloner.ClonedFunc);
725 auto &CalleeTTI = GetTTI(*Callee);
726 bool RemarksEnabled =
727 Callee->getContext().getDiagHandlerPtr()->isMissedOptRemarkEnabled(
731 GetTLI, GetBFI, &PSI, RemarksEnabled ? &ORE :
nullptr);
735 return OptimizationRemarkAnalysis(
DEBUG_TYPE,
"AlwaysInline", &CB)
736 <<
NV(
"Callee", Cloner.OrigFunc)
737 <<
" should always be fully inlined, not partially";
744 return OptimizationRemarkMissed(
DEBUG_TYPE,
"NeverInline", &CB)
745 <<
NV(
"Callee", Cloner.OrigFunc) <<
" not partially inlined into "
746 <<
NV(
"Caller", Caller)
747 <<
" because it should never be inlined (cost=never)";
754 return OptimizationRemarkAnalysis(
DEBUG_TYPE,
"TooCostly", &CB)
755 <<
NV(
"Callee", Cloner.OrigFunc) <<
" not partially inlined into "
756 <<
NV(
"Caller", Caller) <<
" because too costly to inline (cost="
757 <<
NV(
"Cost", IC.
getCost()) <<
", threshold="
762 const DataLayout &
DL =
Caller->getDataLayout();
766 BlockFrequency NormWeightedSavings(NonWeightedSavings);
769 if (NormWeightedSavings < WeightedOutliningRcost) {
771 return OptimizationRemarkAnalysis(
DEBUG_TYPE,
"OutliningCallcostTooHigh",
773 <<
NV(
"Callee", Cloner.OrigFunc) <<
" not partially inlined into "
774 <<
NV(
"Caller", Caller) <<
" runtime overhead (overhead="
775 <<
NV(
"Overhead", (
unsigned)WeightedOutliningRcost.
getFrequency())
777 <<
NV(
"Savings", (
unsigned)NormWeightedSavings.getFrequency())
779 <<
" of making the outlined call is too high";
786 return OptimizationRemarkAnalysis(
DEBUG_TYPE,
"CanBePartiallyInlined", &CB)
787 <<
NV(
"Callee", Cloner.OrigFunc) <<
" can be partially inlined into "
788 <<
NV(
"Caller", Caller) <<
" with cost=" <<
NV(
"Cost", IC.
getCost())
799PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB,
800 TargetTransformInfo *
TTI) {
806 switch (
I.getOpcode()) {
807 case Instruction::BitCast:
808 case Instruction::PtrToInt:
809 case Instruction::IntToPtr:
810 case Instruction::Alloca:
811 case Instruction::PHI:
813 case Instruction::GetElementPtr:
821 if (
I.isLifetimeStartOrEnd())
832 FMF = FPMO->getFastMathFlags();
834 IntrinsicCostAttributes ICA(IID,
II->getType(), Tys, FMF);
859std::tuple<InstructionCost, InstructionCost>
860PartialInlinerImpl::computeOutliningCosts(FunctionCloner &Cloner)
const {
862 for (
auto FuncBBPair : Cloner.OutlinedFunctions) {
863 Function *OutlinedFunc = FuncBBPair.first;
864 BasicBlock* OutliningCallBB = FuncBBPair.second;
867 auto *OutlinedFuncTTI = &GetTTI(*OutlinedFunc);
868 OutliningFuncCallCost +=
869 computeBBInlineCost(OutliningCallBB, OutlinedFuncTTI);
872 for (BasicBlock &BB : *OutlinedFunc)
873 OutlinedFunctionCost += computeBBInlineCost(&BB, OutlinedFuncTTI);
875 assert(OutlinedFunctionCost >= Cloner.OutlinedRegionCost &&
876 "Outlined function cost should be no less than the outlined region");
881 OutlinedFunctionCost -=
885 OutliningFuncCallCost +
886 (OutlinedFunctionCost - Cloner.OutlinedRegionCost) +
889 return std::make_tuple(OutliningFuncCallCost, OutliningRuntimeOverhead);
895void PartialInlinerImpl::computeCallsiteToProfCountMap(
896 Function *DuplicateFunction,
897 DenseMap<User *, uint64_t> &CallSiteToProfCountMap)
const {
901 std::unique_ptr<BlockFrequencyInfo> TempBFI;
902 BlockFrequencyInfo *CurrentCallerBFI =
nullptr;
907 DominatorTree DT(*Caller);
909 BranchProbabilityInfo BPI(*Caller, LI);
910 TempBFI.reset(
new BlockFrequencyInfo(*Caller, BPI, LI));
911 CurrentCallerBFI = TempBFI.get();
914 CurrentCallerBFI = &(GetBFI(*Caller));
918 for (User *User :
Users) {
919 CallBase *CB = getSupportedCallBase(User);
921 if (CurrentCaller != Caller) {
923 ComputeCurrBFI(Caller);
925 assert(CurrentCallerBFI &&
"CallerBFI is not set");
932 CallSiteToProfCountMap[
User] = 0;
936PartialInlinerImpl::FunctionCloner::FunctionCloner(
937 Function *
F, FunctionOutliningInfo *OI, OptimizationRemarkEmitter &ORE,
938 function_ref<AssumptionCache *(Function &)> LookupAC,
939 function_ref<TargetTransformInfo &(Function &)> GetTTI)
940 : OrigFunc(
F), ORE(ORE), LookupAC(LookupAC), GetTTI(GetTTI) {
941 ClonedOI = std::make_unique<FunctionOutliningInfo>();
954 ClonedOI->ReturnBlockPreds.push_back(NewE);
958 F->replaceAllUsesWith(ClonedFunc);
961PartialInlinerImpl::FunctionCloner::FunctionCloner(
962 Function *
F, FunctionOutliningMultiRegionInfo *OI,
966 : OrigFunc(
F), ORE(ORE), LookupAC(LookupAC), GetTTI(GetTTI) {
967 ClonedOMRI = std::make_unique<FunctionOutliningMultiRegionInfo>();
975 for (
const FunctionOutliningMultiRegionInfo::OutlineRegionInfo &
RegionInfo :
986 FunctionOutliningMultiRegionInfo::OutlineRegionInfo MappedRegionInfo(
987 Region, NewEntryBlock, NewExitBlock, NewReturnBlock);
988 ClonedOMRI->ORI.push_back(MappedRegionInfo);
992 F->replaceAllUsesWith(ClonedFunc);
995void PartialInlinerImpl::FunctionCloner::normalizeReturnBlock()
const {
998 PHINode *FirstPhi =
nullptr;
999 while (
I != BB->end()) {
1020 BasicBlock *PreReturn = ClonedOI->ReturnBlock;
1022 PHINode *FirstPhi = GetFirstPHI(PreReturn);
1023 unsigned NumPredsFromEntries = ClonedOI->ReturnBlockPreds.size();
1028 auto IsTrivialPhi = [](PHINode *PN) ->
Value * {
1030 return PN->getIncomingValue(0);
1034 ClonedOI->ReturnBlock = ClonedOI->ReturnBlock->splitBasicBlock(
1035 ClonedOI->ReturnBlock->getFirstNonPHIIt());
1038 SmallVector<Instruction *, 4> DeadPhis;
1039 while (
I != PreReturn->
end()) {
1048 Ins = ClonedOI->ReturnBlock->getFirstNonPHIIt();
1051 for (BasicBlock *
E : ClonedOI->ReturnBlockPreds) {
1060 if (
auto *OldPhiVal = IsTrivialPhi(OldPhi)) {
1066 for (
auto *DP : DeadPhis)
1067 DP->eraseFromParent();
1069 for (
auto *
E : ClonedOI->ReturnBlockPreds)
1070 E->getTerminator()->replaceUsesOfWith(PreReturn, ClonedOI->ReturnBlock);
1073bool PartialInlinerImpl::FunctionCloner::doMultiRegionFunctionOutlining() {
1075 auto ComputeRegionCost =
1078 for (BasicBlock* BB : Region)
1079 Cost += computeBBInlineCost(BB, &GetTTI(*BB->getParent()));
1083 assert(ClonedOMRI &&
"Expecting OutlineInfo for multi region outline");
1085 if (ClonedOMRI->ORI.empty())
1094 BranchProbabilityInfo BPI(*ClonedFunc, LI);
1095 ClonedFuncBFI.reset(
new BlockFrequencyInfo(*ClonedFunc, BPI, LI));
1098 CodeExtractorAnalysisCache CEAC(*ClonedFunc);
1100 SetVector<Value *> Inputs, Outputs, Sinks;
1101 for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo :
1104 ComputeRegionCost(RegionInfo.Region);
1106 CodeExtractor
CE(RegionInfo.Region, &DT,
false,
1107 ClonedFuncBFI.get(), &BPI,
1108 LookupAC(*RegionInfo.EntryBlock->
getParent()),
1111 CE.findInputsOutputs(Inputs, Outputs, Sinks);
1114 dbgs() <<
"inputs: " << Inputs.
size() <<
"\n";
1115 dbgs() <<
"outputs: " << Outputs.
size() <<
"\n";
1116 for (
Value *value : Inputs)
1117 dbgs() <<
"value used in func: " << *value <<
"\n";
1118 for (
Value *output : Outputs)
1119 dbgs() <<
"instr used in func: " << *output <<
"\n";
1126 if (Function *OutlinedFunc =
CE.extractCodeRegion(CEAC)) {
1127 CallBase *OCS = PartialInlinerImpl::getOneCallSiteTo(*OutlinedFunc);
1130 OutlinedFunctions.push_back(std::make_pair(OutlinedFunc,OutliningCallBB));
1131 NumColdRegionsOutlined++;
1132 OutlinedRegionCost += CurrentOutlinedRegionCost;
1140 return OptimizationRemarkMissed(
DEBUG_TYPE,
"ExtractFailed",
1141 &RegionInfo.Region.
front()->front())
1142 <<
"Failed to extract region at block "
1147 return !OutlinedFunctions.empty();
1151PartialInlinerImpl::FunctionCloner::doSingleRegionFunctionOutlining() {
1154 auto ToBeInlined = [&,
this](
BasicBlock *BB) {
1155 return BB == ClonedOI->ReturnBlock ||
1159 assert(ClonedOI &&
"Expecting OutlineInfo for single region outline");
1166 BranchProbabilityInfo BPI(*ClonedFunc, LI);
1167 ClonedFuncBFI.reset(
new BlockFrequencyInfo(*ClonedFunc, BPI, LI));
1170 std::vector<BasicBlock *> ToExtract;
1171 auto *ClonedFuncTTI = &GetTTI(*ClonedFunc);
1172 ToExtract.push_back(ClonedOI->NonReturnBlock);
1173 OutlinedRegionCost += PartialInlinerImpl::computeBBInlineCost(
1174 ClonedOI->NonReturnBlock, ClonedFuncTTI);
1175 for (BasicBlock *BB :
depth_first(&ClonedFunc->getEntryBlock()))
1176 if (!ToBeInlined(BB) && BB != ClonedOI->NonReturnBlock) {
1177 ToExtract.push_back(BB);
1182 OutlinedRegionCost += computeBBInlineCost(BB, ClonedFuncTTI);
1186 CodeExtractorAnalysisCache CEAC(*ClonedFunc);
1188 CodeExtractor(ToExtract, &DT,
false,
1189 ClonedFuncBFI.get(), &BPI, LookupAC(*ClonedFunc),
1191 .extractCodeRegion(CEAC);
1195 PartialInlinerImpl::getOneCallSiteTo(*OutlinedFunc)->
getParent();
1197 OutlinedFunctions.push_back(std::make_pair(OutlinedFunc, OutliningCallBB));
1200 return OptimizationRemarkMissed(
DEBUG_TYPE,
"ExtractFailed",
1201 &ToExtract.front()->front())
1202 <<
"Failed to extract region at block "
1203 <<
ore::NV(
"Block", ToExtract.front());
1206 return OutlinedFunc;
1209PartialInlinerImpl::FunctionCloner::~FunctionCloner() {
1213 ClonedFunc->eraseFromParent();
1214 if (!IsFunctionInlined) {
1217 for (
auto FuncBBPair : OutlinedFunctions) {
1219 Func->eraseFromParent();
1224std::pair<bool, Function *> PartialInlinerImpl::unswitchFunction(Function &
F) {
1225 if (
F.hasAddressTaken())
1226 return {
false,
nullptr};
1229 if (
F.hasFnAttribute(Attribute::AlwaysInline))
1230 return {
false,
nullptr};
1232 if (
F.hasFnAttribute(Attribute::NoInline))
1233 return {
false,
nullptr};
1235 if (PSI.isFunctionEntryCold(&
F))
1236 return {
false,
nullptr};
1238 if (
F.users().empty())
1239 return {
false,
nullptr};
1241 OptimizationRemarkEmitter ORE(&
F);
1245 if (PSI.hasProfileSummary() &&
F.hasProfileData() &&
1247 std::unique_ptr<FunctionOutliningMultiRegionInfo> OMRI =
1248 computeOutliningColdRegionsInfo(
F, ORE);
1250 FunctionCloner Cloner(&
F, OMRI.get(), ORE, LookupAssumptionCache, GetTTI);
1253 dbgs() <<
"HotCountThreshold = " << PSI.getHotCountThreshold() <<
"\n";
1254 dbgs() <<
"ColdCountThreshold = " << PSI.getColdCountThreshold()
1258 bool DidOutline = Cloner.doMultiRegionFunctionOutlining();
1262 dbgs() <<
">>>>>> Outlined (Cloned) Function >>>>>>\n";
1263 Cloner.ClonedFunc->print(
dbgs());
1264 dbgs() <<
"<<<<<< Outlined (Cloned) Function <<<<<<\n";
1267 if (tryPartialInline(Cloner))
1268 return {
true,
nullptr};
1276 std::unique_ptr<FunctionOutliningInfo> OI = computeOutliningInfo(
F);
1278 return {
false,
nullptr};
1280 FunctionCloner Cloner(&
F, OI.get(), ORE, LookupAssumptionCache, GetTTI);
1281 Cloner.normalizeReturnBlock();
1283 Function *OutlinedFunction = Cloner.doSingleRegionFunctionOutlining();
1285 if (!OutlinedFunction)
1286 return {
false,
nullptr};
1288 if (tryPartialInline(Cloner))
1289 return {
true, OutlinedFunction};
1291 return {
false,
nullptr};
1294bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) {
1295 if (Cloner.OutlinedFunctions.empty())
1298 auto OutliningCosts = computeOutliningCosts(Cloner);
1304 "Expected valid costs");
1308 BranchProbability RelativeToEntryFreq;
1309 if (Cloner.ClonedOI)
1310 RelativeToEntryFreq = getOutliningCallBBRelativeFreq(Cloner);
1317 RelativeToEntryFreq = BranchProbability(0, 1);
1319 BlockFrequency WeightedRcost =
1320 BlockFrequency(NonWeightedRcost.
getValue()) * RelativeToEntryFreq;
1327 OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc);
1330 std::tie(DLoc,
Block) = getOneDebugLoc(*Cloner.ClonedFunc);
1331 OrigFuncORE.emit([&]() {
1332 return OptimizationRemarkAnalysis(
DEBUG_TYPE,
"OutlineRegionTooSmall",
1334 <<
ore::NV(
"Function", Cloner.OrigFunc)
1335 <<
" not partially inlined into callers (Original Size = "
1336 <<
ore::NV(
"OutlinedRegionOriginalSize", Cloner.OutlinedRegionCost)
1337 <<
", Size of call sequence to outlined function = "
1338 <<
ore::NV(
"NewSize", SizeCost) <<
")";
1343 assert(Cloner.OrigFunc->users().empty() &&
1344 "F's users should all be replaced!");
1346 std::vector<User *>
Users(Cloner.ClonedFunc->user_begin(),
1347 Cloner.ClonedFunc->user_end());
1349 DenseMap<User *, uint64_t> CallSiteToProfCountMap;
1350 auto CalleeEntryCount = Cloner.OrigFunc->getEntryCount();
1351 if (CalleeEntryCount)
1352 computeCallsiteToProfCountMap(Cloner.ClonedFunc, CallSiteToProfCountMap);
1354 uint64_t CalleeEntryCountV =
1355 (CalleeEntryCount ? CalleeEntryCount->getCount() : 0);
1357 bool AnyInline =
false;
1358 for (User *User :
Users) {
1359 CallBase *CB = getSupportedCallBase(User);
1361 if (isLimitReached())
1364 OptimizationRemarkEmitter CallerORE(CB->
getCaller());
1365 if (!shouldPartialInline(*CB, Cloner, WeightedRcost, CallerORE))
1370 OptimizationRemark
OR(
DEBUG_TYPE,
"PartiallyInlined", CB);
1371 OR <<
ore::NV(
"Callee", Cloner.OrigFunc) <<
" partially inlined into "
1374 InlineFunctionInfo IFI(GetAssumptionCache, &PSI);
1378 (Cloner.ClonedOI ? Cloner.OutlinedFunctions.back().first
1386 if (CalleeEntryCountV) {
1387 if (
auto It = CallSiteToProfCountMap.
find(User);
1388 It != CallSiteToProfCountMap.
end()) {
1389 uint64_t CallSiteCount = It->second;
1390 CalleeEntryCountV -= std::min(CalleeEntryCountV, CallSiteCount);
1395 NumPartialInlining++;
1397 if (Cloner.ClonedOI)
1398 NumPartialInlined++;
1400 NumColdOutlinePartialInlined++;
1404 Cloner.IsFunctionInlined =
true;
1405 if (CalleeEntryCount)
1406 Cloner.OrigFunc->setEntryCount(Function::ProfileCount(
1407 CalleeEntryCountV, CalleeEntryCount->getType()));
1408 OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc);
1409 OrigFuncORE.emit([&]() {
1410 return OptimizationRemark(
DEBUG_TYPE,
"PartiallyInlined", Cloner.OrigFunc)
1411 <<
"Partially inlined into at least one caller";
1418bool PartialInlinerImpl::run(
Module &M) {
1422 std::vector<Function *> Worklist;
1423 Worklist.reserve(
M.size());
1424 for (Function &
F : M)
1425 if (!
F.use_empty() && !
F.isDeclaration())
1426 Worklist.push_back(&
F);
1429 while (!Worklist.empty()) {
1430 Function *CurrFunc = Worklist.back();
1431 Worklist.pop_back();
1436 std::pair<bool, Function *>
Result = unswitchFunction(*CurrFunc);
1438 Worklist.push_back(
Result.second);
1471 if (PartialInlinerImpl(GetAssumptionCache, LookupAssumptionCache, GetTTI,
1472 GetTLI, PSI, GetBFI)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static MachineBasicBlock * OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ)
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
This file contains the simple types necessary to represent the attributes associated with functions a...
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
This file builds on the ADT/GraphTraits.h file to build generic depth first graph iterator.
This file provides various utilities for inspecting and working with the control flow graph in LLVM I...
Module.h This file contains the declarations for the Module class.
iv Induction Variable Users
static cl::opt< int > InstrCost("inline-instr-cost", cl::Hidden, cl::init(5), cl::desc("Cost of a single instruction when inlining"))
Machine Check Debug Module
uint64_t IntrinsicInst * II
static cl::opt< unsigned > MaxNumInlineBlocks("max-num-inline-blocks", cl::init(5), cl::Hidden, cl::desc("Max number of blocks to be partially inlined"))
static cl::opt< int > OutlineRegionFreqPercent("outline-region-freq-percent", cl::init(75), cl::Hidden, cl::desc("Relative frequency of outline region to " "the entry block"))
static cl::opt< bool > MarkOutlinedColdCC("pi-mark-coldcc", cl::init(false), cl::Hidden, cl::desc("Mark outline function calls with ColdCC"))
static cl::opt< float > MinRegionSizeRatio("min-region-size-ratio", cl::init(0.1), cl::Hidden, cl::desc("Minimum ratio comparing relative sizes of each " "outline candidate and original function"))
static cl::opt< bool > DisableMultiRegionPartialInline("disable-mr-partial-inlining", cl::init(false), cl::Hidden, cl::desc("Disable multi-region partial inlining"))
static cl::opt< unsigned > MinBlockCounterExecution("min-block-execution", cl::init(100), cl::Hidden, cl::desc("Minimum block executions to consider " "its BranchProbabilityInfo valid"))
static cl::opt< int > MaxNumPartialInlining("max-partial-inlining", cl::init(-1), cl::Hidden, cl::desc("Max number of partial inlining. The default is unlimited"))
static cl::opt< bool > DisablePartialInlining("disable-partial-inlining", cl::init(false), cl::Hidden, cl::desc("Disable partial inlining"))
static bool hasProfileData(const Function &F, const FunctionOutliningInfo &OI)
static cl::opt< float > ColdBranchRatio("cold-branch-ratio", cl::init(0.1), cl::Hidden, cl::desc("Minimum BranchProbability to consider a region cold."))
static cl::opt< bool > ForceLiveExit("pi-force-live-exit-outline", cl::init(false), cl::Hidden, cl::desc("Force outline regions with live exits"))
static cl::opt< unsigned > ExtraOutliningPenalty("partial-inlining-extra-penalty", cl::init(0), cl::Hidden, cl::desc("A debug option to add additional penalty to the computed one."))
static cl::opt< bool > SkipCostAnalysis("skip-partial-inlining-cost-analysis", cl::ReallyHidden, cl::desc("Skip Cost Analysis"))
FunctionAnalysisManager FAM
This file contains the declarations for profiling metadata utility functions.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
LLVM Basic Block Representation.
iterator begin()
Instruction iterator methods.
const Function * getParent() const
Return the enclosing method, or null if none.
LLVM_ABI iterator_range< filter_iterator< BasicBlock::const_iterator, std::function< bool(const Instruction &)> > > instructionsWithoutDebug(bool SkipPseudoOp=true) const
Return a const iterator range over the instructions in the block, skipping any debug instructions.
LLVM_ABI const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
const Instruction & front() const
LLVM_ABI const BasicBlock * getSingleSuccessor() const
Return the successor of this block if it has a single successor.
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this basic block belongs to.
InstListType::iterator iterator
Instruction iterators...
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Analysis pass which computes BlockFrequencyInfo.
BlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate IR basic block frequen...
LLVM_ABI std::optional< uint64_t > getBlockProfileCount(const BasicBlock *BB, bool AllowSynthetic=false) const
Returns the estimated profile count of BB.
uint64_t getFrequency() const
Returns the frequency as a fixpoint number scaled by the entry frequency.
Conditional or Unconditional Branch instruction.
bool isUnconditional() const
static LLVM_ABI BranchProbability getBranchProbability(uint64_t Numerator, uint64_t Denominator)
void setCallingConv(CallingConv::ID CC)
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
LLVM_ABI Function * getCaller()
Helper to get the caller (the parent function).
iterator find(const_arg_type_t< KeyT > Val)
void recalculate(ParentType &Func)
recalculate - compute a dominator tree for the given function
void setCallingConv(CallingConv::ID CC)
int getCost() const
Get the inline cost estimate.
int getCostDelta() const
Get the cost delta from the threshold for inlining.
auto map(const Function &F) const -> InstructionCost
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
LLVM_ABI void insertBefore(InstListType::iterator InsertPos)
Insert an unlinked instruction into a basic block immediately before the specified position.
A Module instance is used to store all the information related to an LLVM module.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
LLVM_ABI Value * removeIncomingValue(unsigned Idx, bool DeletePHIIfEmpty=true)
Remove an incoming value.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
unsigned getNumIncomingValues() const
Return the number of incoming edges.
static PHINode * Create(Type *Ty, unsigned NumReservedValues, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
Constructors - NumReservedValues is a hint for the number of incoming edges that this phi node will h...
PreservedAnalyses run(Module &M, ModuleAnalysisManager &)
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses none()
Convenience factory function for the empty preserved set.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
An analysis pass based on the new PM to deliver ProfileSummaryInfo.
Analysis providing profile information.
size_type size() const
Determine the number of elements in the SetVector.
void insert_range(Range &&R)
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
Type * getType() const
All values are typed, get the type of this value.
user_iterator user_begin()
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ BasicBlock
Various leaf nodes.
LLVM_ABI int getInstrCost()
@ CE
Windows NT (Windows on ARM)
initializer< Ty > init(const Ty &Val)
PointerTypeMap run(const Module &M)
Compute the PointerTypeMap for the module M.
@ User
could "use" a pointer
DiagnosticInfoOptimizationBase::Argument NV
NodeAddr< PhiNode * > Phi
NodeAddr< FuncNode * > Func
friend class Instruction
Iterator for Instructions in a `BasicBlock.
This is an optimization pass for GlobalISel generic memory operations.
FunctionAddr VTableAddr Value
LLVM_ABI InlineResult InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, bool MergeAttributes=false, AAResults *CalleeAAR=nullptr, bool InsertLifetime=true, Function *ForwardVarArgsTo=nullptr, OptimizationRemarkEmitter *ORE=nullptr)
This function inlines the called function into the basic block of the caller.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
auto successors(const MachineBasicBlock *BB)
constexpr from_range_t from_range
InnerAnalysisManagerProxy< FunctionAnalysisManager, Module > FunctionAnalysisManagerModuleProxy
Provide the FunctionAnalysisManager to Module proxy.
LLVM_ABI InlineResult isInlineViable(Function &Callee)
Check if it is mechanically possible to inline the function Callee, based on the contents of the func...
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
FunctionAddr VTableAddr Count
auto succ_size(const MachineBasicBlock *BB)
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
RNSuccIterator< NodeRef, BlockT, RegionT > succ_begin(NodeRef Node)
LLVM_ABI InlineCost getInlineCost(CallBase &Call, const InlineParams &Params, TargetTransformInfo &CalleeTTI, function_ref< AssumptionCache &(Function &)> GetAssumptionCache, function_ref< const TargetLibraryInfo &(Function &)> GetTLI, function_ref< BlockFrequencyInfo &(Function &)> GetBFI=nullptr, ProfileSummaryInfo *PSI=nullptr, OptimizationRemarkEmitter *ORE=nullptr, function_ref< EphemeralValuesCache &(Function &)> GetEphValuesCache=nullptr)
Get an InlineCost object representing the cost of inlining this callsite.
RNSuccIterator< NodeRef, BlockT, RegionT > succ_end(NodeRef Node)
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI InlineParams getInlineParams()
Generate the parameters to tune the inline cost analysis based only on the commandline options.
LLVM_ABI int getCallsiteCost(const TargetTransformInfo &TTI, const CallBase &Call, const DataLayout &DL)
Return the cost associated with a callsite, including parameter passing and the call/return instructi...
ValueMap< const Value *, WeakTrackingVH > ValueToValueMapTy
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
auto predecessors(const MachineBasicBlock *BB)
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
iterator_range< df_iterator< T > > depth_first(const T &G)
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list.
LLVM_ABI bool hasBranchWeightMD(const Instruction &I)
Checks if an instructions has Branch Weight Metadata.
LLVM_ABI Function * CloneFunction(Function *F, ValueToValueMapTy &VMap, ClonedCodeInfo *CodeInfo=nullptr)
Return a copy of the specified function and add it to that function's module.
AnalysisManager< Module > ModuleAnalysisManager
Convenience typedef for the Module analysis manager.