34#define DEBUG_TYPE "amdgpu-perf-hint"
38 cl::desc(
"Function mem bound threshold in %"));
42 cl::desc(
"Kernel limit wave threshold in %"));
46 cl::desc(
"Indirect access memory instruction weight"));
50 cl::desc(
"Large stride memory access weight"));
54 cl::desc(
"Large stride memory access threshold"));
56STATISTIC(NumMemBound,
"Number of functions marked as memory bound");
57STATISTIC(NumLimitWave,
"Number of functions marked as needing limit wave");
61struct AMDGPUPerfHint {
67 : FIM(FIM_), TLI(TLI_) {}
72 struct MemAccessInfo {
73 const Value *V =
nullptr;
74 const Value *Base =
nullptr;
76 MemAccessInfo() =
default;
77 bool isLargeStride(MemAccessInfo &Reference)
const;
78#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
79 Printable
print()
const {
80 return Printable([
this](raw_ostream &OS) {
81 OS <<
"Value: " << *V <<
'\n'
82 <<
"Base: " << *Base <<
" Offset: " << Offset <<
'\n';
88 MemAccessInfo makeMemAccessInfo(Instruction *)
const;
90 MemAccessInfo LastAccess;
94 const DataLayout *DL =
nullptr;
96 const SITargetLowering *TLI;
98 AMDGPUPerfHintAnalysis::FuncInfo *
visit(
const Function &
F);
99 static bool isMemBound(
const AMDGPUPerfHintAnalysis::FuncInfo &
F);
100 static bool needLimitWave(
const AMDGPUPerfHintAnalysis::FuncInfo &
F);
102 bool isIndirectAccess(
const Instruction *Inst)
const;
111 bool isLargeStride(
const Instruction *Inst);
113 bool isGlobalAddr(
const Value *V)
const;
114 bool isLocalAddr(
const Value *V)
const;
115 bool isGlobalLoadUsedInBB(
const Instruction &)
const;
118static std::pair<const Value *, const Type *> getMemoryInstrPtrAndType(
121 return {LI->getPointerOperand(), LI->getType()};
123 return {
SI->getPointerOperand(),
SI->getValueOperand()->getType()};
125 return {AI->getPointerOperand(), AI->getCompareOperand()->getType()};
127 return {AI->getPointerOperand(), AI->getValOperand()->getType()};
131 return {
nullptr,
nullptr};
134bool AMDGPUPerfHint::isIndirectAccess(
const Instruction *Inst)
const {
136 SmallPtrSet<const Value *, 32> WorkSet;
137 SmallPtrSet<const Value *, 32> Visited;
138 if (
const Value *MO = getMemoryInstrPtrAndType(Inst).first) {
139 if (isGlobalAddr(MO))
143 while (!WorkSet.
empty()) {
146 if (!Visited.
insert(V).second)
151 const auto *
M =
LD->getPointerOperand();
152 if (isGlobalAddr(M)) {
160 const auto *
P =
GEP->getPointerOperand();
162 for (
unsigned I = 1,
E =
GEP->getNumIndices() + 1;
I !=
E; ++
I)
168 WorkSet.
insert(
U->getOperand(0));
173 WorkSet.
insert(BO->getOperand(0));
174 WorkSet.
insert(BO->getOperand(1));
179 WorkSet.
insert(S->getFalseValue());
180 WorkSet.
insert(S->getTrueValue());
185 WorkSet.
insert(
E->getVectorOperand());
197bool AMDGPUPerfHint::isGlobalLoadUsedInBB(
const Instruction &
I)
const {
201 if (!isGlobalAddr(Ld->getPointerOperand()))
204 for (
const User *Usr : Ld->users()) {
206 if (UsrInst->getParent() ==
I.getParent())
214AMDGPUPerfHintAnalysis::FuncInfo *AMDGPUPerfHint::visit(
const Function &
F) {
215 AMDGPUPerfHintAnalysis::FuncInfo &FI = FIM[&
F];
217 LLVM_DEBUG(
dbgs() <<
"[AMDGPUPerfHint] process " <<
F.getName() <<
'\n');
220 LastAccess = MemAccessInfo();
221 unsigned UsedGlobalLoadsInBB = 0;
223 if (
const Type *Ty = getMemoryInstrPtrAndType(&
I).second) {
227 if (isGlobalLoadUsedInBB(
I))
228 UsedGlobalLoadsInBB +=
Size;
229 if (isIndirectAccess(&
I))
231 if (isLargeStride(&
I))
239 if (!Callee ||
Callee->isDeclaration()) {
246 auto Loc = FIM.
find(Callee);
247 if (Loc == FIM.
end())
251 FI.
InstCost += Loc->second.InstCost;
255 TargetLoweringBase::AddrMode AM;
260 GEP->getPointerAddressSpace()))
270 unsigned GlobalMemAccPercentage = UsedGlobalLoadsInBB * 100 /
B.size();
271 if (GlobalMemAccPercentage > 50) {
273 <<
B.getName() <<
" has " << GlobalMemAccPercentage
274 <<
"% global memory access\n");
283bool AMDGPUPerfHint::runOnFunction(Function &
F) {
285 DL = &
M.getDataLayout();
287 if (
F.hasFnAttribute(
"amdgpu-wave-limiter") &&
288 F.hasFnAttribute(
"amdgpu-memory-bound"))
291 const AMDGPUPerfHintAnalysis::FuncInfo *
Info =
visit(
F);
295 <<
" IAMInst cost: " <<
Info->IAMInstCost <<
'\n'
296 <<
" LSMInst cost: " <<
Info->LSMInstCost <<
'\n'
297 <<
" TotalInst cost: " <<
Info->InstCost <<
'\n');
301 if (isMemBound(*
Info)) {
304 F.addFnAttr(
"amdgpu-memory-bound",
"true");
311 F.addFnAttr(
"amdgpu-wave-limiter",
"true");
318bool AMDGPUPerfHint::isMemBound(
const AMDGPUPerfHintAnalysis::FuncInfo &FI) {
327bool AMDGPUPerfHint::needLimitWave(
const AMDGPUPerfHintAnalysis::FuncInfo &FI) {
332bool AMDGPUPerfHint::isGlobalAddr(
const Value *V)
const {
334 unsigned As = PT->getAddressSpace();
341bool AMDGPUPerfHint::isLocalAddr(
const Value *V)
const {
347bool AMDGPUPerfHint::isLargeStride(
const Instruction *Inst) {
350 MemAccessInfo MAI = makeMemAccessInfo(
const_cast<Instruction *
>(Inst));
351 bool IsLargeStride = MAI.isLargeStride(LastAccess);
353 LastAccess = std::move(MAI);
355 return IsLargeStride;
358AMDGPUPerfHint::MemAccessInfo
359AMDGPUPerfHint::makeMemAccessInfo(Instruction *Inst)
const {
361 const Value *MO = getMemoryInstrPtrAndType(Inst).first;
373bool AMDGPUPerfHint::MemAccessInfo::isLargeStride(
374 MemAccessInfo &Reference)
const {
383 <<
print() <<
"<=>\n"
384 <<
Reference.print() <<
"Result:" << Result <<
'\n');
388class AMDGPUPerfHintAnalysisLegacy :
public CallGraphSCCPass {
391 AMDGPUPerfHintAnalysis Impl;
396 AMDGPUPerfHintAnalysisLegacy() : CallGraphSCCPass(
ID) {}
398 bool runOnSCC(CallGraphSCC &SCC)
override;
400 void getAnalysisUsage(AnalysisUsage &AU)
const override {
408 auto FI = FIM.find(
F);
412 return AMDGPUPerfHint::isMemBound(FI->second);
416 auto FI = FIM.find(
F);
420 return AMDGPUPerfHint::needLimitWave(FI->second);
428 if (!
F ||
F->isDeclaration())
432 AMDGPUPerfHint Analyzer(FIM, ST.getTargetLowering());
434 if (Analyzer.runOnFunction(*
F))
451 Function &
F = SCC.begin()->getFunction();
453 if (
F.isDeclaration())
457 AMDGPUPerfHint Analyzer(FIM, ST.getTargetLowering());
458 if (Analyzer.runOnFunction(
F))
466char AMDGPUPerfHintAnalysisLegacy::ID = 0;
470 "Analysis if a function is memory bound",
true,
true)
473 auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
478 return Impl.runOnSCC(TM, SCC);
static cl::opt< unsigned > LargeStrideThresh("amdgpu-large-stride-threshold", cl::init(64), cl::Hidden, cl::desc("Large stride memory access threshold"))
static cl::opt< unsigned > IAWeight("amdgpu-indirect-access-weight", cl::init(1000), cl::Hidden, cl::desc("Indirect access memory instruction weight"))
static cl::opt< unsigned > LimitWaveThresh("amdgpu-limit-wave-threshold", cl::init(50), cl::Hidden, cl::desc("Kernel limit wave threshold in %"))
static cl::opt< unsigned > LSWeight("amdgpu-large-stride-weight", cl::init(1000), cl::Hidden, cl::desc("Large stride memory access weight"))
static cl::opt< unsigned > MemBoundThresh("amdgpu-membound-threshold", cl::init(50), cl::Hidden, cl::desc("Function mem bound threshold in %"))
Analyzes if a function is potentially memory bound and if a kernel may benefit from limiting numb...
The AMDGPU TargetMachine interface definition for hw codegen targets.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static void print(raw_ostream &Out, object::Archive::Kind Kind, T Val)
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
Analysis containing CSE Info
This file provides interfaces used to build and manipulate a call graph, which is a very useful tool ...
static bool runOnFunction(Function &F, bool PostInlining)
Implements a lazy call graph analysis and related passes for the new pass manager.
Machine Check Debug Module
#define INITIALIZE_PASS(passName, arg, name, cfg, analysis)
void visit(MachineFunction &MF, MachineBasicBlock &Start, std::function< void(MachineBasicBlock *)> op)
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
This file describes how to lower LLVM code to machine code.
Target-Independent Code Generator Pass Configuration Options pass.
bool isMemoryBound(const Function *F) const
bool needsWaveLimiter(const Function *F) const
bool run(const GCNTargetMachine &TM, LazyCallGraph &CG)
bool runOnSCC(const GCNTargetMachine &TM, CallGraphSCC &SCC)
ValueMap< const Function *, FuncInfo > FuncInfoMap
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
void setPreservesAll()
Set by analyses that do not transform their input at all.
A node in the call graph for a module.
CallGraphSCC - This is a single SCC that a CallGraphSCCPass is run on.
An analysis pass which computes the call graph for a module.
A RefSCC of the call graph.
An SCC of the call graph.
A lazily constructed view of the call graph of a module.
LLVM_ABI void buildRefSCCs()
iterator_range< postorder_ref_scc_iterator > postorder_ref_sccs()
A Module instance is used to store all the information related to an LLVM module.
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
PreservedAnalyses & preserve()
Mark an analysis as preserved.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
bool erase(PtrType Ptr)
Remove pointer from the set.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
iterator find(const KeyT &Val)
@ LOCAL_ADDRESS
Address space for local memory.
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
initializer< Ty > init(const Ty &Val)
friend class Instruction
Iterator for Instructions in a `BasicBlock`.
This is an optimization pass for GlobalISel generic memory operations.
FunctionAddr VTableAddr Value
Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST=nullptr, unsigned DynamicVGPRBlockSize=0)
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Value * GetPointerBaseWithConstantOffset(Value *Ptr, int64_t &Offset, const DataLayout &DL, bool AllowNonInbounds=true)
Analyze the specified pointer to see if it can be expressed as a base pointer plus a constant offset.
auto dyn_cast_or_null(const Y &Val)
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
AnalysisManager< Module > ModuleAnalysisManager
Convenience typedef for the Module analysis manager.
char & AMDGPUPerfHintAnalysisLegacyID
std::unique_ptr< AMDGPUPerfHintAnalysis > Impl
const GCNTargetMachine & TM
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM)
bool HasDenseGlobalMemAcc