AMDGPUPromoteAlloca.cpp
1//===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// Eliminates allocas by either converting them into vectors or by migrating
10// them to local address space.
11//
12// Two passes are exposed by this file:
13// - "promote-alloca-to-vector", which runs early in the pipeline and only
14// promotes to vector. Promotion to vector is almost always profitable
15// except when the alloca is too big and the promotion would result in
16// very high register pressure. (An illustrative IR sketch follows this header.)
17// - "promote-alloca", which does both promotion to vector and LDS and runs
18// much later in the pipeline. This runs after SROA because promoting to
19// LDS is of course less profitable than getting rid of the alloca or
20// vectorizing it, thus we only want to do it when the only alternative is
21// lowering the alloca to stack.
22//
23// Note that both of them exist for the old and new PMs. The new PM passes are
24// declared in AMDGPU.h and the legacy PM ones are declared here.
25//
26//===----------------------------------------------------------------------===//
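//
// As a rough illustration (not taken from a real test case), promotion to
// vector turns a pattern like
//
//   %stack = alloca [4 x float], align 4, addrspace(5)
//   %gep = getelementptr [4 x float], ptr addrspace(5) %stack, i32 0, i32 %i
//   store float %v, ptr addrspace(5) %gep
//   %r = load float, ptr addrspace(5) %gep
//
// into pure SSA vector operations with no scratch allocation, roughly:
//
//   %vec.new = insertelement <4 x float> %vec, float %v, i32 %i
//   %r = extractelement <4 x float> %vec.new, i32 %i
//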
27
28#include "AMDGPU.h"
29#include "GCNSubtarget.h"
31#include "llvm/ADT/STLExtras.h"
38#include "llvm/IR/IRBuilder.h"
40#include "llvm/IR/IntrinsicsAMDGPU.h"
41#include "llvm/IR/IntrinsicsR600.h"
44#include "llvm/Pass.h"
47
48#define DEBUG_TYPE "amdgpu-promote-alloca"
49
50using namespace llvm;
51
52namespace {
53
54static cl::opt<bool>
55 DisablePromoteAllocaToVector("disable-promote-alloca-to-vector",
56 cl::desc("Disable promote alloca to vector"),
57 cl::init(false));
58
59static cl::opt<bool>
60 DisablePromoteAllocaToLDS("disable-promote-alloca-to-lds",
61 cl::desc("Disable promote alloca to LDS"),
62 cl::init(false));
63
64static cl::opt<unsigned> PromoteAllocaToVectorLimit(
65 "amdgpu-promote-alloca-to-vector-limit",
66 cl::desc("Maximum byte size to consider promote alloca to vector"),
67 cl::init(0));
68
69static cl::opt<unsigned> PromoteAllocaToVectorMaxRegs(
70 "amdgpu-promote-alloca-to-vector-max-regs",
72 "Maximum vector size (in 32b registers) to use when promoting alloca"),
73 cl::init(32));
74
75// Use up to 1/4 of available register budget for vectorization.
76// FIXME: Increase the limit for whole function budgets? Perhaps x2?
77static cl::opt<unsigned> PromoteAllocaToVectorVGPRRatio(
78 "amdgpu-promote-alloca-to-vector-vgpr-ratio",
79 cl::desc("Ratio of VGPRs to budget for promoting alloca to vectors"),
80 cl::init(4));
81
82static cl::opt<unsigned>
83 LoopUserWeight("promote-alloca-vector-loop-user-weight",
84 cl::desc("The bonus weight of users of allocas within loop "
85 "when sorting profitable allocas"),
86 cl::init(4));
87
88// Shared implementation which can do both promotion to vector and to LDS.
89class AMDGPUPromoteAllocaImpl {
90private:
91 const TargetMachine &TM;
92 LoopInfo &LI;
93 Module *Mod = nullptr;
94 const DataLayout *DL = nullptr;
95
96 // FIXME: This should be per-kernel.
97 uint32_t LocalMemLimit = 0;
98 uint32_t CurrentLocalMemUsage = 0;
99 unsigned MaxVGPRs;
100 unsigned VGPRBudgetRatio;
101 unsigned MaxVectorRegs;
102
103 bool IsAMDGCN = false;
104 bool IsAMDHSA = false;
105
106 std::pair<Value *, Value *> getLocalSizeYZ(IRBuilder<> &Builder);
107 Value *getWorkitemID(IRBuilder<> &Builder, unsigned N);
108
109 /// BaseAlloca is the alloca root the search started from.
110 /// Val may be that alloca or a recursive user of it.
111 bool collectUsesWithPtrTypes(Value *BaseAlloca, Value *Val,
112 std::vector<Value *> &WorkList) const;
113
114 /// Val is a derived pointer from Alloca. OpIdx0/OpIdx1 are the operand
115 /// indices to an instruction with 2 pointer inputs (e.g. select, icmp).
116 /// Returns true if both operands are derived from the same alloca. Val should
117 /// be the same value as one of the input operands of UseInst.
118 bool binaryOpIsDerivedFromSameAlloca(Value *Alloca, Value *Val,
119 Instruction *UseInst, int OpIdx0,
120 int OpIdx1) const;
121
122 /// Check whether we have enough local memory for promotion.
123 bool hasSufficientLocalMem(const Function &F);
124
125 bool tryPromoteAllocaToVector(AllocaInst &I);
126 bool tryPromoteAllocaToLDS(AllocaInst &I, bool SufficientLDS);
127
128 void sortAllocasToPromote(SmallVectorImpl<AllocaInst *> &Allocas);
129
130 void setFunctionLimits(const Function &F);
131
132public:
133 AMDGPUPromoteAllocaImpl(TargetMachine &TM, LoopInfo &LI) : TM(TM), LI(LI) {
134
135 const Triple &TT = TM.getTargetTriple();
136 IsAMDGCN = TT.isAMDGCN();
137 IsAMDHSA = TT.getOS() == Triple::AMDHSA;
138 }
139
140 bool run(Function &F, bool PromoteToLDS);
141};
142
143// FIXME: This can create globals so should be a module pass.
144class AMDGPUPromoteAlloca : public FunctionPass {
145public:
146 static char ID;
147
148 AMDGPUPromoteAlloca() : FunctionPass(ID) {}
149
150 bool runOnFunction(Function &F) override {
151 if (skipFunction(F))
152 return false;
153 if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
154 return AMDGPUPromoteAllocaImpl(
155 TPC->getTM<TargetMachine>(),
156 getAnalysis<LoopInfoWrapperPass>().getLoopInfo())
157 .run(F, /*PromoteToLDS*/ true);
158 return false;
159 }
160
161 StringRef getPassName() const override { return "AMDGPU Promote Alloca"; }
162
163 void getAnalysisUsage(AnalysisUsage &AU) const override {
164 AU.setPreservesCFG();
165 AU.addRequired<LoopInfoWrapperPass>();
166 FunctionPass::getAnalysisUsage(AU);
167 }
168};
169
170static unsigned getMaxVGPRs(unsigned LDSBytes, const TargetMachine &TM,
171 const Function &F) {
172 if (!TM.getTargetTriple().isAMDGCN())
173 return 128;
174
175 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
176
177 unsigned DynamicVGPRBlockSize = AMDGPU::getDynamicVGPRBlockSize(F);
178 // Temporarily check both the attribute and the subtarget feature, until the
179 // latter is removed.
180 if (DynamicVGPRBlockSize == 0 && ST.isDynamicVGPREnabled())
181 DynamicVGPRBlockSize = ST.getDynamicVGPRBlockSize();
182
183 unsigned MaxVGPRs = ST.getMaxNumVGPRs(
184 ST.getWavesPerEU(ST.getFlatWorkGroupSizes(F), LDSBytes, F).first,
185 DynamicVGPRBlockSize);
186
187 // A non-entry function has only 32 caller preserved registers.
188 // Do not promote an alloca that would force spilling unless we know the
189 // function will be inlined.
190 if (!F.hasFnAttribute(Attribute::AlwaysInline) &&
191 !AMDGPU::isEntryFunctionCC(F.getCallingConv()))
192 MaxVGPRs = std::min(MaxVGPRs, 32u);
193 return MaxVGPRs;
194}
195
196} // end anonymous namespace
197
198char AMDGPUPromoteAlloca::ID = 0;
199
201 "AMDGPU promote alloca to vector or LDS", false, false)
202// Move LDS uses from functions to kernels before promote alloca for accurate
203// estimation of LDS available
204INITIALIZE_PASS_DEPENDENCY(AMDGPULowerModuleLDSLegacy)
206INITIALIZE_PASS_END(AMDGPUPromoteAlloca, DEBUG_TYPE,
207 "AMDGPU promote alloca to vector or LDS", false, false)
208
209char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID;
210
211PreservedAnalyses AMDGPUPromoteAllocaPass::run(Function &F,
212 FunctionAnalysisManager &AM) {
213 auto &LI = AM.getResult<LoopAnalysis>(F);
214 bool Changed = AMDGPUPromoteAllocaImpl(TM, LI).run(F, /*PromoteToLDS=*/true);
215 if (Changed) {
216 PreservedAnalyses PA;
217 PA.preserveSet<CFGAnalyses>();
218 return PA;
219 }
220 return PreservedAnalyses::all();
221}
222
223PreservedAnalyses
224AMDGPUPromoteAllocaToVectorPass::run(Function &F, FunctionAnalysisManager &AM) {
225 auto &LI = AM.getResult<LoopAnalysis>(F);
226 bool Changed = AMDGPUPromoteAllocaImpl(TM, LI).run(F, /*PromoteToLDS=*/false);
227 if (Changed) {
228 PreservedAnalyses PA;
229 PA.preserveSet<CFGAnalyses>();
230 return PA;
231 }
232 return PreservedAnalyses::all();
233}
234
235FunctionPass *llvm::createAMDGPUPromoteAlloca() {
236 return new AMDGPUPromoteAlloca();
237}
238
239static void collectAllocaUses(AllocaInst &Alloca,
240 SmallVectorImpl<Use *> &Uses) {
241 SmallVector<Instruction *, 4> WorkList({&Alloca});
242 while (!WorkList.empty()) {
243 auto *Cur = WorkList.pop_back_val();
244 for (auto &U : Cur->uses()) {
245 Uses.push_back(&U);
246
247 if (isa<GetElementPtrInst>(U.getUser()))
248 WorkList.push_back(cast<Instruction>(U.getUser()));
249 }
250 }
251}
252
253void AMDGPUPromoteAllocaImpl::sortAllocasToPromote(
254 SmallVectorImpl<AllocaInst *> &Allocas) {
255 DenseMap<AllocaInst *, unsigned> Scores;
256
257 for (auto *Alloca : Allocas) {
258 LLVM_DEBUG(dbgs() << "Scoring: " << *Alloca << "\n");
259 unsigned &Score = Scores[Alloca];
260 // Increment score by one for each user + a bonus for users within loops.
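// For instance, with the default LoopUserWeight of 4, an alloca with two
// users outside any loop and one user at loop depth 2 would score
// (1 + 4*0) + (1 + 4*0) + (1 + 4*2) = 11.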
261 SmallVector<Use *, 8> Uses;
262 collectAllocaUses(*Alloca, Uses);
263 for (auto *U : Uses) {
264 Instruction *Inst = cast<Instruction>(U->getUser());
265 if (isa<GetElementPtrInst>(Inst))
266 continue;
267 unsigned UserScore =
268 1 + (LoopUserWeight * LI.getLoopDepth(Inst->getParent()));
269 LLVM_DEBUG(dbgs() << " [+" << UserScore << "]:\t" << *Inst << "\n");
270 Score += UserScore;
271 }
272 LLVM_DEBUG(dbgs() << " => Final Score:" << Score << "\n");
273 }
274
275 stable_sort(Allocas, [&](AllocaInst *A, AllocaInst *B) {
276 return Scores.at(A) > Scores.at(B);
277 });
278
279 // clang-format off
280 LLVM_DEBUG(
281 dbgs() << "Sorted Worklist:\n";
282 for (auto *A: Allocas)
283 dbgs() << " " << *A << "\n";
284 );
285 // clang-format on
286}
287
288void AMDGPUPromoteAllocaImpl::setFunctionLimits(const Function &F) {
289 // Load per function limits, overriding with global options where appropriate.
290 // R600 register tuples/aliasing are fragile with large vector promotions, so
291 // apply an architecture-specific limit here.
292 const int R600MaxVectorRegs = 16;
293 MaxVectorRegs = F.getFnAttributeAsParsedInteger(
294 "amdgpu-promote-alloca-to-vector-max-regs",
295 IsAMDGCN ? PromoteAllocaToVectorMaxRegs : R600MaxVectorRegs);
296 if (PromoteAllocaToVectorMaxRegs.getNumOccurrences())
297 MaxVectorRegs = PromoteAllocaToVectorMaxRegs;
298 VGPRBudgetRatio = F.getFnAttributeAsParsedInteger(
299 "amdgpu-promote-alloca-to-vector-vgpr-ratio",
300 PromoteAllocaToVectorVGPRRatio);
301 if (PromoteAllocaToVectorVGPRRatio.getNumOccurrences())
302 VGPRBudgetRatio = PromoteAllocaToVectorVGPRRatio;
303}
304
305bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
306 Mod = F.getParent();
307 DL = &Mod->getDataLayout();
308
309 const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
310 if (!ST.isPromoteAllocaEnabled())
311 return false;
312
313 bool SufficientLDS = PromoteToLDS && hasSufficientLocalMem(F);
314 MaxVGPRs = getMaxVGPRs(CurrentLocalMemUsage, TM, F);
315 setFunctionLimits(F);
316
317 unsigned VectorizationBudget =
318 (PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8
319 : (MaxVGPRs * 32)) /
320 VGPRBudgetRatio;
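// The budget is expressed in bits of promoted storage; e.g. with MaxVGPRs = 256
// and the default VGPR ratio of 4, the budget is (256 * 32) / 4 = 2048 bits,
// enough for a 64 element vector of i32.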
321
322 SmallVector<AllocaInst *, 8> Allocas;
323 for (Instruction &I : F.getEntryBlock()) {
324 if (AllocaInst *AI = dyn_cast<AllocaInst>(&I)) {
325 // Array allocations are probably not worth handling, since an allocation
326 // of the array type is the canonical form.
327 if (!AI->isStaticAlloca() || AI->isArrayAllocation())
328 continue;
329 Allocas.push_back(AI);
330 }
331 }
332
333 sortAllocasToPromote(Allocas);
334
335 bool Changed = false;
336 for (AllocaInst *AI : Allocas) {
337 const unsigned AllocaCost = DL->getTypeSizeInBits(AI->getAllocatedType());
338 // First, check if we have enough budget to vectorize this alloca.
339 if (AllocaCost <= VectorizationBudget) {
340 // If we do, attempt vectorization, otherwise, fall through and try
341 // promoting to LDS instead.
342 if (tryPromoteAllocaToVector(*AI)) {
343 Changed = true;
344 assert((VectorizationBudget - AllocaCost) < VectorizationBudget &&
345 "Underflow!");
346 VectorizationBudget -= AllocaCost;
347 LLVM_DEBUG(dbgs() << " Remaining vectorization budget:"
348 << VectorizationBudget << "\n");
349 continue;
350 }
351 } else {
352 LLVM_DEBUG(dbgs() << "Alloca too big for vectorization (size:"
353 << AllocaCost << ", budget:" << VectorizationBudget
354 << "): " << *AI << "\n");
355 }
356
357 if (PromoteToLDS && tryPromoteAllocaToLDS(*AI, SufficientLDS))
358 Changed = true;
359 }
360
361 // NOTE: tryPromoteAllocaToVector removes the alloca, so Allocas contains
362 // dangling pointers. If we want to reuse it past this point, the loop above
363 // would need to be updated to remove successfully promoted allocas.
364
365 return Changed;
366}
367
368struct MemTransferInfo {
369 ConstantInt *SrcIndex = nullptr;
370 ConstantInt *DestIndex = nullptr;
371};
372
373// Checks if the instruction I is a memset user of the alloca AI that we can
374// deal with. Currently, only non-volatile memsets that affect the whole alloca
375// are handled.
376static bool isSupportedMemset(MemSetInst *I, AllocaInst *AI,
377 const DataLayout &DL) {
378 using namespace PatternMatch;
379 // For now we only care about non-volatile memsets that affect the whole type
380 // (start at index 0 and fill the whole alloca).
381 //
382 // TODO: Now that we moved to PromoteAlloca we could handle any memsets
383 // (except maybe volatile ones?) - we just need to use shufflevector if it
384 // only affects a subset of the vector.
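// For example, for "%a = alloca [8 x i32], addrspace(5)" (32 bytes of storage),
// a "llvm.memset.p5.i64(ptr addrspace(5) %a, i8 0, i64 32, i1 false)" call is
// supported, whereas a memset covering only the first 16 bytes is rejected.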
385 const unsigned Size = DL.getTypeStoreSize(AI->getAllocatedType());
386 return I->getOperand(0) == AI &&
387 match(I->getOperand(2), m_SpecificInt(Size)) && !I->isVolatile();
388}
389
390static Value *calculateVectorIndex(
391 Value *Ptr, const std::map<GetElementPtrInst *, WeakTrackingVH> &GEPIdx) {
392 auto *GEP = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts());
393 if (!GEP)
394 return ConstantInt::getNullValue(Type::getInt32Ty(Ptr->getContext()));
395
396 auto I = GEPIdx.find(GEP);
397 assert(I != GEPIdx.end() && "Must have entry for GEP!");
398
399 Value *IndexValue = I->second;
400 assert(IndexValue && "index value missing from GEP index map");
401 return IndexValue;
402}
403
404static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
405 Type *VecElemTy, const DataLayout &DL,
406 SmallVector<Instruction *> &NewInsts) {
407 // TODO: Extracting a "multiple of X" from a GEP might be a useful generic
408 // helper.
409 LLVMContext &Ctx = GEP->getContext();
410 unsigned BW = DL.getIndexTypeSizeInBits(GEP->getType());
411 SmallMapVector<Value *, APInt, 4> VarOffsets;
412 APInt ConstOffset(BW, 0);
413
414 // Walk backwards through nested GEPs to collect both constant and variable
415 // offsets, so that nested vector GEP chains can be lowered in one step.
416 //
417 // Given this IR fragment as input:
418 //
419 // %0 = alloca [10 x <2 x i32>], align 8, addrspace(5)
420 // %1 = getelementptr [10 x <2 x i32>], ptr addrspace(5) %0, i32 0, i32 %j
421 // %2 = getelementptr i8, ptr addrspace(5) %1, i32 4
422 // %3 = load i32, ptr addrspace(5) %2, align 4
423 //
424 // Combine both GEP operations in a single pass, producing:
425 // BasePtr = %0
426 // ConstOffset = 4
427 // VarOffsets = { %j -> element_size(<2 x i32>) }
428 //
429 // That lets us emit a single buffer_load directly into a VGPR, without ever
430 // allocating scratch memory for the intermediate pointer.
431 Value *CurPtr = GEP;
432 while (auto *CurGEP = dyn_cast<GetElementPtrInst>(CurPtr)) {
433 if (!CurGEP->collectOffset(DL, BW, VarOffsets, ConstOffset))
434 return nullptr;
435
436 // Move to the next outer pointer.
437 CurPtr = CurGEP->getPointerOperand();
438 }
439
440 assert(CurPtr == Alloca && "GEP not based on alloca");
441
442 int64_t VecElemSize = DL.getTypeAllocSize(VecElemTy);
443 if (VarOffsets.size() > 1)
444 return nullptr;
445
446 APInt IndexQuot;
447 int64_t Rem;
448 APInt::sdivrem(ConstOffset, VecElemSize, IndexQuot, Rem);
449 if (Rem != 0)
450 return nullptr;
451 if (VarOffsets.size() == 0)
452 return ConstantInt::get(Ctx, IndexQuot);
453
454 IRBuilder<> Builder(GEP);
455
456 const auto &VarOffset = VarOffsets.front();
457 APInt OffsetQuot;
458 APInt::sdivrem(VarOffset.second, VecElemSize, OffsetQuot, Rem);
459 if (Rem != 0 || OffsetQuot.isZero())
460 return nullptr;
461
462 Value *Offset = VarOffset.first;
463 auto *OffsetType = dyn_cast<IntegerType>(Offset->getType());
464 if (!OffsetType)
465 return nullptr;
466
467 if (!OffsetQuot.isOne()) {
468 ConstantInt *ConstMul =
469 ConstantInt::get(Ctx, OffsetQuot.sext(OffsetType->getBitWidth()));
470 Offset = Builder.CreateMul(Offset, ConstMul);
471 if (Instruction *NewInst = dyn_cast<Instruction>(Offset))
472 NewInsts.push_back(NewInst);
473 }
474 if (ConstOffset.isZero())
475 return Offset;
476
477 ConstantInt *ConstIndex =
478 ConstantInt::get(Ctx, IndexQuot.sext(OffsetType->getBitWidth()));
479 Value *IndexAdd = Builder.CreateAdd(Offset, ConstIndex);
480 if (Instruction *NewInst = dyn_cast<Instruction>(IndexAdd))
481 NewInsts.push_back(NewInst);
482 return IndexAdd;
483}
484
485/// Promotes a single user of the alloca to a vector form.
486///
487/// \param Inst Instruction to be promoted.
488/// \param DL Module Data Layout.
489/// \param VectorTy Vectorized Type.
490/// \param VecStoreSize Size of \p VectorTy in bytes.
491/// \param ElementSize Size of \p VectorTy element type in bytes.
492/// \param TransferInfo MemTransferInst info map.
493/// \param GEPVectorIdx GEP -> VectorIdx cache.
494/// \param CurVal Current value of the vector (e.g. last stored value)
495/// \param[out] DeferredLoads \p Inst is added to this vector if it can't
496/// be promoted now. This happens when promoting requires \p
497/// CurVal, but \p CurVal is nullptr.
498/// \return the stored value if \p Inst would have written to the alloca, or
499/// nullptr otherwise.
500static Value *promoteAllocaUserToVector(
501 Instruction *Inst, const DataLayout &DL, FixedVectorType *VectorTy,
502 unsigned VecStoreSize, unsigned ElementSize,
503 DenseMap<MemTransferInst *, MemTransferInfo> &TransferInfo,
504 std::map<GetElementPtrInst *, WeakTrackingVH> &GEPVectorIdx, Value *CurVal,
505 SmallVectorImpl<LoadInst *> &DeferredLoads) {
506 // Note: we use InstSimplifyFolder because it can leverage the DataLayout
507 // to do more folding, especially in the case of vector splats.
508 IRBuilder<InstSimplifyFolder> Builder(Inst->getContext(),
509 InstSimplifyFolder(DL));
510 Builder.SetInsertPoint(Inst);
511
512 const auto GetOrLoadCurrentVectorValue = [&]() -> Value * {
513 if (CurVal)
514 return CurVal;
515
516 // If the current value is not known, insert a dummy load and lower it on
517 // the second pass.
518 LoadInst *Dummy =
519 Builder.CreateLoad(VectorTy, PoisonValue::get(Builder.getPtrTy()),
520 "promotealloca.dummyload");
521 DeferredLoads.push_back(Dummy);
522 return Dummy;
523 };
524
525 const auto CreateTempPtrIntCast = [&Builder, DL](Value *Val,
526 Type *PtrTy) -> Value * {
527 assert(DL.getTypeStoreSize(Val->getType()) == DL.getTypeStoreSize(PtrTy));
528 const unsigned Size = DL.getTypeStoreSizeInBits(PtrTy);
529 if (!PtrTy->isVectorTy())
530 return Builder.CreateBitOrPointerCast(Val, Builder.getIntNTy(Size));
531 const unsigned NumPtrElts = cast<FixedVectorType>(PtrTy)->getNumElements();
532 // If we want to cast, e.g. a <2 x ptr> into a <4 x i32>, we need to
533 // first cast the ptr vector to <2 x i64>.
534 assert((Size % NumPtrElts == 0) && "Vector size not divisible");
535 Type *EltTy = Builder.getIntNTy(Size / NumPtrElts);
536 return Builder.CreateBitOrPointerCast(
537 Val, FixedVectorType::get(EltTy, NumPtrElts));
538 };
539
540 Type *VecEltTy = VectorTy->getElementType();
541
542 switch (Inst->getOpcode()) {
543 case Instruction::Load: {
544 // Loads can only be lowered if the value is known.
545 if (!CurVal) {
546 DeferredLoads.push_back(cast<LoadInst>(Inst));
547 return nullptr;
548 }
549
550 Value *Index = calculateVectorIndex(
551 cast<LoadInst>(Inst)->getPointerOperand(), GEPVectorIdx);
552
553 // We're loading the full vector.
554 Type *AccessTy = Inst->getType();
555 TypeSize AccessSize = DL.getTypeStoreSize(AccessTy);
556 if (Constant *CI = dyn_cast<Constant>(Index)) {
557 if (CI->isZeroValue() && AccessSize == VecStoreSize) {
558 if (AccessTy->isPtrOrPtrVectorTy())
559 CurVal = CreateTempPtrIntCast(CurVal, AccessTy);
560 else if (CurVal->getType()->isPtrOrPtrVectorTy())
561 CurVal = CreateTempPtrIntCast(CurVal, CurVal->getType());
562 Value *NewVal = Builder.CreateBitOrPointerCast(CurVal, AccessTy);
563 Inst->replaceAllUsesWith(NewVal);
564 return nullptr;
565 }
566 }
567
568 // Loading a subvector.
569 if (isa<FixedVectorType>(AccessTy)) {
570 assert(AccessSize.isKnownMultipleOf(DL.getTypeStoreSize(VecEltTy)));
571 const unsigned NumLoadedElts = AccessSize / DL.getTypeStoreSize(VecEltTy);
572 auto *SubVecTy = FixedVectorType::get(VecEltTy, NumLoadedElts);
573 assert(DL.getTypeStoreSize(SubVecTy) == DL.getTypeStoreSize(AccessTy));
574
575 Value *SubVec = PoisonValue::get(SubVecTy);
576 for (unsigned K = 0; K < NumLoadedElts; ++K) {
577 Value *CurIdx =
578 Builder.CreateAdd(Index, ConstantInt::get(Index->getType(), K));
579 SubVec = Builder.CreateInsertElement(
580 SubVec, Builder.CreateExtractElement(CurVal, CurIdx), K);
581 }
582
583 if (AccessTy->isPtrOrPtrVectorTy())
584 SubVec = CreateTempPtrIntCast(SubVec, AccessTy);
585 else if (SubVecTy->isPtrOrPtrVectorTy())
586 SubVec = CreateTempPtrIntCast(SubVec, SubVecTy);
587
588 SubVec = Builder.CreateBitOrPointerCast(SubVec, AccessTy);
589 Inst->replaceAllUsesWith(SubVec);
590 return nullptr;
591 }
592
593 // We're loading one element.
594 Value *ExtractElement = Builder.CreateExtractElement(CurVal, Index);
595 if (AccessTy != VecEltTy)
596 ExtractElement = Builder.CreateBitOrPointerCast(ExtractElement, AccessTy);
597
598 Inst->replaceAllUsesWith(ExtractElement);
599 return nullptr;
600 }
601 case Instruction::Store: {
602 // For stores, it's a bit trickier and it depends on whether we're storing
603 // the full vector or not. If we're storing the full vector, we don't need
604 // to know the current value. If this is a store of a single element, we
605 // need to know the value.
606 StoreInst *SI = cast<StoreInst>(Inst);
607 Value *Index = calculateVectorIndex(SI->getPointerOperand(), GEPVectorIdx);
608 Value *Val = SI->getValueOperand();
609
610 // We're storing the full vector, we can handle this without knowing CurVal.
611 Type *AccessTy = Val->getType();
612 TypeSize AccessSize = DL.getTypeStoreSize(AccessTy);
613 if (Constant *CI = dyn_cast<Constant>(Index)) {
614 if (CI->isZeroValue() && AccessSize == VecStoreSize) {
615 if (AccessTy->isPtrOrPtrVectorTy())
616 Val = CreateTempPtrIntCast(Val, AccessTy);
617 else if (VectorTy->isPtrOrPtrVectorTy())
618 Val = CreateTempPtrIntCast(Val, VectorTy);
619 return Builder.CreateBitOrPointerCast(Val, VectorTy);
620 }
621 }
622
623 // Storing a subvector.
624 if (isa<FixedVectorType>(AccessTy)) {
625 assert(AccessSize.isKnownMultipleOf(DL.getTypeStoreSize(VecEltTy)));
626 const unsigned NumWrittenElts =
627 AccessSize / DL.getTypeStoreSize(VecEltTy);
628 const unsigned NumVecElts = VectorTy->getNumElements();
629 auto *SubVecTy = FixedVectorType::get(VecEltTy, NumWrittenElts);
630 assert(DL.getTypeStoreSize(SubVecTy) == DL.getTypeStoreSize(AccessTy));
631
632 if (SubVecTy->isPtrOrPtrVectorTy())
633 Val = CreateTempPtrIntCast(Val, SubVecTy);
634 else if (AccessTy->isPtrOrPtrVectorTy())
635 Val = CreateTempPtrIntCast(Val, AccessTy);
636
637 Val = Builder.CreateBitOrPointerCast(Val, SubVecTy);
638
639 Value *CurVec = GetOrLoadCurrentVectorValue();
640 for (unsigned K = 0, NumElts = std::min(NumWrittenElts, NumVecElts);
641 K < NumElts; ++K) {
642 Value *CurIdx =
643 Builder.CreateAdd(Index, ConstantInt::get(Index->getType(), K));
644 CurVec = Builder.CreateInsertElement(
645 CurVec, Builder.CreateExtractElement(Val, K), CurIdx);
646 }
647 return CurVec;
648 }
649
650 if (Val->getType() != VecEltTy)
651 Val = Builder.CreateBitOrPointerCast(Val, VecEltTy);
652 return Builder.CreateInsertElement(GetOrLoadCurrentVectorValue(), Val,
653 Index);
654 }
655 case Instruction::Call: {
656 if (auto *MTI = dyn_cast<MemTransferInst>(Inst)) {
657 // For memcpy, we need to know CurVal.
658 ConstantInt *Length = cast<ConstantInt>(MTI->getLength());
659 unsigned NumCopied = Length->getZExtValue() / ElementSize;
660 MemTransferInfo *TI = &TransferInfo[MTI];
661 unsigned SrcBegin = TI->SrcIndex->getZExtValue();
662 unsigned DestBegin = TI->DestIndex->getZExtValue();
663
664 SmallVector<int> Mask;
665 for (unsigned Idx = 0; Idx < VectorTy->getNumElements(); ++Idx) {
666 if (Idx >= DestBegin && Idx < DestBegin + NumCopied) {
667 Mask.push_back(SrcBegin < VectorTy->getNumElements()
668 ? SrcBegin++
669 : PoisonMaskElem);
670 } else {
671 Mask.push_back(Idx);
672 }
673 }
674
675 return Builder.CreateShuffleVector(GetOrLoadCurrentVectorValue(), Mask);
676 }
677
678 if (auto *MSI = dyn_cast<MemSetInst>(Inst)) {
679 // For memset, we don't need to know the previous value because we
680 // currently only allow memsets that cover the whole alloca.
681 Value *Elt = MSI->getOperand(1);
682 const unsigned BytesPerElt = DL.getTypeStoreSize(VecEltTy);
683 if (BytesPerElt > 1) {
684 Value *EltBytes = Builder.CreateVectorSplat(BytesPerElt, Elt);
685
686 // If the element type of the vector is a pointer, we need to first cast
687 // to an integer, then use a PtrCast.
688 if (VecEltTy->isPointerTy()) {
689 Type *PtrInt = Builder.getIntNTy(BytesPerElt * 8);
690 Elt = Builder.CreateBitCast(EltBytes, PtrInt);
691 Elt = Builder.CreateIntToPtr(Elt, VecEltTy);
692 } else
693 Elt = Builder.CreateBitCast(EltBytes, VecEltTy);
694 }
695
696 return Builder.CreateVectorSplat(VectorTy->getElementCount(), Elt);
697 }
698
699 if (auto *Intr = dyn_cast<IntrinsicInst>(Inst)) {
700 if (Intr->getIntrinsicID() == Intrinsic::objectsize) {
701 Intr->replaceAllUsesWith(
702 Builder.getIntN(Intr->getType()->getIntegerBitWidth(),
703 DL.getTypeAllocSize(VectorTy)));
704 return nullptr;
705 }
706 }
707
708 llvm_unreachable("Unsupported call when promoting alloca to vector");
709 }
710
711 default:
712 llvm_unreachable("Inconsistency in instructions promotable to vector");
713 }
714
715 llvm_unreachable("Did not return after promoting instruction!");
716}
717
718static bool isSupportedAccessType(FixedVectorType *VecTy, Type *AccessTy,
719 const DataLayout &DL) {
720 // Access as a vector type can work if the size of the access vector is a
721 // multiple of the size of the alloca's vector element type.
722 //
723 // Examples:
724 // - VecTy = <8 x float>, AccessTy = <4 x float> -> OK
725 // - VecTy = <4 x double>, AccessTy = <2 x float> -> OK
726 // - VecTy = <4 x double>, AccessTy = <3 x float> -> NOT OK
727 // - 3*32 is not a multiple of 64
728 //
729 // We could handle more complicated cases, but it'd make things a lot more
730 // complicated.
731 if (isa<FixedVectorType>(AccessTy)) {
732 TypeSize AccTS = DL.getTypeStoreSize(AccessTy);
733 // If the type size and the store size don't match, we would need to do more
734 // than just bitcast to translate between extracted/insertable subvectors
735 // and the accessed value.
736 if (AccTS * 8 != DL.getTypeSizeInBits(AccessTy))
737 return false;
738 TypeSize VecTS = DL.getTypeStoreSize(VecTy->getElementType());
739 return AccTS.isKnownMultipleOf(VecTS);
740 }
741
742 return CastInst::isBitOrNoopPointerCastable(VecTy->getElementType(), AccessTy,
743 DL);
744}
745
746/// Iterates over an instruction worklist that may contain multiple instructions
747/// from the same basic block, in an order that may differ from program order.
748template <typename InstContainer>
749static void forEachWorkListItem(const InstContainer &WorkList,
750 std::function<void(Instruction *)> Fn) {
751 // Bucket up uses of the alloca by the block they occur in.
752 // This is important because we have to handle multiple defs/uses in a block
753 // ourselves: SSAUpdater is purely for cross-block references.
754 SmallDenseMap<BasicBlock *, SmallDenseSet<Instruction *>> UsesByBlock;
755 for (Instruction *User : WorkList)
756 UsesByBlock[User->getParent()].insert(User);
757
758 for (Instruction *User : WorkList) {
759 BasicBlock *BB = User->getParent();
760 auto &BlockUses = UsesByBlock[BB];
761
762 // Already processed, skip.
763 if (BlockUses.empty())
764 continue;
765
766 // Only user in the block, directly process it.
767 if (BlockUses.size() == 1) {
768 Fn(User);
769 continue;
770 }
771
772 // Multiple users in the block, do a linear scan to see users in order.
773 for (Instruction &Inst : *BB) {
774 if (!BlockUses.contains(&Inst))
775 continue;
776
777 Fn(&Inst);
778 }
779
780 // Clear the block so we know it's been processed.
781 BlockUses.clear();
782 }
783}
784
785/// Find an insert point after an alloca, after all other allocas clustered at
786/// the start of the block.
789 for (BasicBlock::iterator E = BB.end(); I != E && isa<AllocaInst>(*I); ++I)
790 ;
791 return I;
792}
793
794// FIXME: Should try to pick the most likely to be profitable allocas first.
795bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
796 LLVM_DEBUG(dbgs() << "Trying to promote to vector: " << Alloca << '\n');
797
798 if (DisablePromoteAllocaToVector) {
799 LLVM_DEBUG(dbgs() << " Promote alloca to vector is disabled\n");
800 return false;
801 }
802
803 Type *AllocaTy = Alloca.getAllocatedType();
804 auto *VectorTy = dyn_cast<FixedVectorType>(AllocaTy);
805 if (auto *ArrayTy = dyn_cast<ArrayType>(AllocaTy)) {
806 uint64_t NumElems = 1;
807 Type *ElemTy;
808 do {
809 NumElems *= ArrayTy->getNumElements();
810 ElemTy = ArrayTy->getElementType();
811 } while ((ArrayTy = dyn_cast<ArrayType>(ElemTy)));
812
813 // Check for array of vectors
814 auto *InnerVectorTy = dyn_cast<FixedVectorType>(ElemTy);
815 if (InnerVectorTy) {
816 NumElems *= InnerVectorTy->getNumElements();
817 ElemTy = InnerVectorTy->getElementType();
818 }
819
820 if (VectorType::isValidElementType(ElemTy) && NumElems > 0) {
821 unsigned ElementSize = DL->getTypeSizeInBits(ElemTy) / 8;
822 if (ElementSize > 0) {
823 unsigned AllocaSize = DL->getTypeStoreSize(AllocaTy);
824 // Expand vector if required to match padding of inner type,
825 // i.e. odd size subvectors.
826 // Storage size of new vector must match that of alloca for correct
827 // behaviour of byte offsets and GEP computation.
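// For example, with a typical data layout an "alloca [4 x <3 x i32>]" occupies
// 64 bytes because each <3 x i32> element is padded to 16 bytes; the initial
// guess of 12 elements is therefore widened to a <16 x i32> so that byte
// offsets keep lining up with the original GEPs.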
828 if (NumElems * ElementSize != AllocaSize)
829 NumElems = AllocaSize / ElementSize;
830 if (NumElems > 0 && (AllocaSize % ElementSize) == 0)
831 VectorTy = FixedVectorType::get(ElemTy, NumElems);
832 }
833 }
834 }
835
836 if (!VectorTy) {
837 LLVM_DEBUG(dbgs() << " Cannot convert type to vector\n");
838 return false;
839 }
840
841 const unsigned MaxElements =
842 (MaxVectorRegs * 32) / DL->getTypeSizeInBits(VectorTy->getElementType());
843
844 if (VectorTy->getNumElements() > MaxElements ||
845 VectorTy->getNumElements() < 2) {
846 LLVM_DEBUG(dbgs() << " " << *VectorTy
847 << " has an unsupported number of elements\n");
848 return false;
849 }
850
851 std::map<GetElementPtrInst *, WeakTrackingVH> GEPVectorIdx;
852 SmallVector<Instruction *> WorkList;
853 SmallVector<Instruction *> UsersToRemove;
854 SmallVector<Instruction *> DeferredInsts;
855 SmallVector<Instruction *> NewGEPInsts;
856 DenseMap<MemTransferInst *, MemTransferInfo> TransferInfo;
857
858 const auto RejectUser = [&](Instruction *Inst, Twine Msg) {
859 LLVM_DEBUG(dbgs() << " Cannot promote alloca to vector: " << Msg << "\n"
860 << " " << *Inst << "\n");
861 for (auto *Inst : reverse(NewGEPInsts))
862 Inst->eraseFromParent();
863 return false;
864 };
865
866 SmallVector<Use *, 8> Uses;
867 collectAllocaUses(Alloca, Uses);
868
869 LLVM_DEBUG(dbgs() << " Attempting promotion to: " << *VectorTy << "\n");
870
871 Type *VecEltTy = VectorTy->getElementType();
872 unsigned ElementSizeInBits = DL->getTypeSizeInBits(VecEltTy);
873 if (ElementSizeInBits != DL->getTypeAllocSizeInBits(VecEltTy)) {
874 LLVM_DEBUG(dbgs() << " Cannot convert to vector if the allocation size "
875 "does not match the type's size\n");
876 return false;
877 }
878 unsigned ElementSize = ElementSizeInBits / 8;
879 assert(ElementSize > 0);
880 for (auto *U : Uses) {
881 Instruction *Inst = cast<Instruction>(U->getUser());
882
883 if (Value *Ptr = getLoadStorePointerOperand(Inst)) {
884 // This is a store of the pointer, not to the pointer.
885 if (isa<StoreInst>(Inst) &&
886 U->getOperandNo() != StoreInst::getPointerOperandIndex())
887 return RejectUser(Inst, "pointer is being stored");
888
889 Type *AccessTy = getLoadStoreType(Inst);
890 if (AccessTy->isAggregateType())
891 return RejectUser(Inst, "unsupported load/store as aggregate");
892 assert(!AccessTy->isAggregateType() || AccessTy->isArrayTy());
893
894 // Check that this is a simple access of a vector element.
895 bool IsSimple = isa<LoadInst>(Inst) ? cast<LoadInst>(Inst)->isSimple()
896 : cast<StoreInst>(Inst)->isSimple();
897 if (!IsSimple)
898 return RejectUser(Inst, "not a simple load or store");
899
900 Ptr = Ptr->stripPointerCasts();
901
902 // Alloca already accessed as vector.
903 if (Ptr == &Alloca && DL->getTypeStoreSize(Alloca.getAllocatedType()) ==
904 DL->getTypeStoreSize(AccessTy)) {
905 WorkList.push_back(Inst);
906 continue;
907 }
908
909 if (!isSupportedAccessType(VectorTy, AccessTy, *DL))
910 return RejectUser(Inst, "not a supported access type");
911
912 WorkList.push_back(Inst);
913 continue;
914 }
915
916 if (auto *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
917 // If we can't compute a vector index from this GEP, then we can't
918 // promote this alloca to vector.
919 Value *Index = GEPToVectorIndex(GEP, &Alloca, VecEltTy, *DL, NewGEPInsts);
920 if (!Index)
921 return RejectUser(Inst, "cannot compute vector index for GEP");
922
923 GEPVectorIdx[GEP] = Index;
924 UsersToRemove.push_back(Inst);
925 continue;
926 }
927
928 if (MemSetInst *MSI = dyn_cast<MemSetInst>(Inst);
929 MSI && isSupportedMemset(MSI, &Alloca, *DL)) {
930 WorkList.push_back(Inst);
931 continue;
932 }
933
934 if (MemTransferInst *TransferInst = dyn_cast<MemTransferInst>(Inst)) {
935 if (TransferInst->isVolatile())
936 return RejectUser(Inst, "mem transfer inst is volatile");
937
938 ConstantInt *Len = dyn_cast<ConstantInt>(TransferInst->getLength());
939 if (!Len || (Len->getZExtValue() % ElementSize))
940 return RejectUser(Inst, "mem transfer inst length is non-constant or "
941 "not a multiple of the vector element size");
942
943 if (TransferInfo.try_emplace(TransferInst).second) {
944 DeferredInsts.push_back(Inst);
945 WorkList.push_back(Inst);
946 }
947
948 auto getPointerIndexOfAlloca = [&](Value *Ptr) -> ConstantInt * {
949 auto *GEP = dyn_cast<GetElementPtrInst>(Ptr);
950 if (Ptr != &Alloca && !GEPVectorIdx.count(GEP))
951 return nullptr;
952
953 return dyn_cast<ConstantInt>(calculateVectorIndex(Ptr, GEPVectorIdx));
954 };
955
956 unsigned OpNum = U->getOperandNo();
957 MemTransferInfo *TI = &TransferInfo[TransferInst];
958 if (OpNum == 0) {
959 Value *Dest = TransferInst->getDest();
960 ConstantInt *Index = getPointerIndexOfAlloca(Dest);
961 if (!Index)
962 return RejectUser(Inst, "could not calculate constant dest index");
963 TI->DestIndex = Index;
964 } else {
965 assert(OpNum == 1);
966 Value *Src = TransferInst->getSource();
967 ConstantInt *Index = getPointerIndexOfAlloca(Src);
968 if (!Index)
969 return RejectUser(Inst, "could not calculate constant src index");
970 TI->SrcIndex = Index;
971 }
972 continue;
973 }
974
975 if (auto *Intr = dyn_cast<IntrinsicInst>(Inst)) {
976 if (Intr->getIntrinsicID() == Intrinsic::objectsize) {
977 WorkList.push_back(Inst);
978 continue;
979 }
980 }
981
982 // Ignore assume-like intrinsics and comparisons used in assumes.
983 if (isAssumeLikeIntrinsic(Inst)) {
984 if (!Inst->use_empty())
985 return RejectUser(Inst, "assume-like intrinsic cannot have any users");
986 UsersToRemove.push_back(Inst);
987 continue;
988 }
989
990 if (isa<ICmpInst>(Inst) && all_of(Inst->users(), [](User *U) {
991 return isAssumeLikeIntrinsic(cast<Instruction>(U));
992 })) {
993 UsersToRemove.push_back(Inst);
994 continue;
995 }
996
997 return RejectUser(Inst, "unhandled alloca user");
998 }
999
1000 while (!DeferredInsts.empty()) {
1001 Instruction *Inst = DeferredInsts.pop_back_val();
1002 MemTransferInst *TransferInst = cast<MemTransferInst>(Inst);
1003 // TODO: Support the case if the pointers are from different alloca or
1004 // from different address spaces.
1005 MemTransferInfo &Info = TransferInfo[TransferInst];
1006 if (!Info.SrcIndex || !Info.DestIndex)
1007 return RejectUser(
1008 Inst, "mem transfer inst is missing constant src and/or dst index");
1009 }
1010
1011 LLVM_DEBUG(dbgs() << " Converting alloca to vector " << *AllocaTy << " -> "
1012 << *VectorTy << '\n');
1013 const unsigned VecStoreSize = DL->getTypeStoreSize(VectorTy);
1014
1015 // Alloca is uninitialized memory. Imitate that by making the first value
1016 // undef.
1017 SSAUpdater Updater;
1018 Updater.Initialize(VectorTy, "promotealloca");
1019
1020 BasicBlock *EntryBB = Alloca.getParent();
1021 BasicBlock::iterator InitInsertPos =
1022 skipToNonAllocaInsertPt(*EntryBB, Alloca.getIterator());
1023 // Alloca memory is undefined to begin, not poison.
1024 Value *AllocaInitValue =
1025 new FreezeInst(PoisonValue::get(VectorTy), "", InitInsertPos);
1026 AllocaInitValue->takeName(&Alloca);
1027
1028 Updater.AddAvailableValue(EntryBB, AllocaInitValue);
1029
1030 // First handle the initial worklist.
1031 SmallVector<LoadInst *, 4> DeferredLoads;
1032 forEachWorkListItem(WorkList, [&](Instruction *I) {
1033 BasicBlock *BB = I->getParent();
1034 // On the first pass, we only take values that are trivially known, i.e.
1035 // where AddAvailableValue was already called in this block.
1037 I, *DL, VectorTy, VecStoreSize, ElementSize, TransferInfo, GEPVectorIdx,
1038 Updater.FindValueForBlock(BB), DeferredLoads);
1039 if (Result)
1040 Updater.AddAvailableValue(BB, Result);
1041 });
1042
1043 // Then handle deferred loads.
1044 forEachWorkListItem(DeferredLoads, [&](Instruction *I) {
1045 SmallVector<LoadInst *, 4> NewDLs;
1046 BasicBlock *BB = I->getParent();
1047 // On the second pass, we use GetValueInMiddleOfBlock to guarantee we always
1048 // get a value, inserting PHIs as needed.
1049 Value *Result = promoteAllocaUserToVector(
1050 I, *DL, VectorTy, VecStoreSize, ElementSize, TransferInfo, GEPVectorIdx,
1051 Updater.GetValueInMiddleOfBlock(I->getParent()), NewDLs);
1052 if (Result)
1053 Updater.AddAvailableValue(BB, Result);
1054 assert(NewDLs.empty() && "No more deferred loads should be queued!");
1055 });
1056
1057 // Delete all instructions. On the first pass, new dummy loads may have been
1058 // added so we need to collect them too.
1059 DenseSet<Instruction *> InstsToDelete(WorkList.begin(), WorkList.end());
1060 InstsToDelete.insert_range(DeferredLoads);
1061 for (Instruction *I : InstsToDelete) {
1062 assert(I->use_empty());
1063 I->eraseFromParent();
1064 }
1065
1066 // Delete all the users that are known to be removeable.
1067 for (Instruction *I : reverse(UsersToRemove)) {
1068 I->dropDroppableUses();
1069 assert(I->use_empty());
1070 I->eraseFromParent();
1071 }
1072
1073 // Alloca should now be dead too.
1074 assert(Alloca.use_empty());
1075 Alloca.eraseFromParent();
1076 return true;
1077}
1078
1079std::pair<Value *, Value *>
1080AMDGPUPromoteAllocaImpl::getLocalSizeYZ(IRBuilder<> &Builder) {
1081 Function &F = *Builder.GetInsertBlock()->getParent();
1082 const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
1083
1084 if (!IsAMDHSA) {
1085 CallInst *LocalSizeY =
1086 Builder.CreateIntrinsic(Intrinsic::r600_read_local_size_y, {});
1087 CallInst *LocalSizeZ =
1088 Builder.CreateIntrinsic(Intrinsic::r600_read_local_size_z, {});
1089
1090 ST.makeLIDRangeMetadata(LocalSizeY);
1091 ST.makeLIDRangeMetadata(LocalSizeZ);
1092
1093 return std::pair(LocalSizeY, LocalSizeZ);
1094 }
1095
1096 // We must read the size out of the dispatch pointer.
1097 assert(IsAMDGCN);
1098
1099 // We are indexing into this struct, and want to extract the workgroup_size_*
1100 // fields.
1101 //
1102 // typedef struct hsa_kernel_dispatch_packet_s {
1103 // uint16_t header;
1104 // uint16_t setup;
1105 // uint16_t workgroup_size_x ;
1106 // uint16_t workgroup_size_y;
1107 // uint16_t workgroup_size_z;
1108 // uint16_t reserved0;
1109 // uint32_t grid_size_x ;
1110 // uint32_t grid_size_y ;
1111 // uint32_t grid_size_z;
1112 //
1113 // uint32_t private_segment_size;
1114 // uint32_t group_segment_size;
1115 // uint64_t kernel_object;
1116 //
1117 // #ifdef HSA_LARGE_MODEL
1118 // void *kernarg_address;
1119 // #elif defined HSA_LITTLE_ENDIAN
1120 // void *kernarg_address;
1121 // uint32_t reserved1;
1122 // #else
1123 // uint32_t reserved1;
1124 // void *kernarg_address;
1125 // #endif
1126 // uint64_t reserved2;
1127 // hsa_signal_t completion_signal; // uint64_t wrapper
1128 // } hsa_kernel_dispatch_packet_t
1129 //
1130 CallInst *DispatchPtr =
1131 Builder.CreateIntrinsic(Intrinsic::amdgcn_dispatch_ptr, {});
1132 DispatchPtr->addRetAttr(Attribute::NoAlias);
1133 DispatchPtr->addRetAttr(Attribute::NonNull);
1134 F.removeFnAttr("amdgpu-no-dispatch-ptr");
1135
1136 // Size of the dispatch packet struct.
1137 DispatchPtr->addDereferenceableRetAttr(64);
1138
1139 Type *I32Ty = Type::getInt32Ty(Mod->getContext());
1140
1141 // We could do a single 64-bit load here, but it's likely that the basic
1142 // 32-bit and extract sequence is already present, and it is probably easier
1143 // to CSE this. The loads should be mergeable later anyway.
1144 Value *GEPXY = Builder.CreateConstInBoundsGEP1_64(I32Ty, DispatchPtr, 1);
1145 LoadInst *LoadXY = Builder.CreateAlignedLoad(I32Ty, GEPXY, Align(4));
1146
1147 Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(I32Ty, DispatchPtr, 2);
1148 LoadInst *LoadZU = Builder.CreateAlignedLoad(I32Ty, GEPZU, Align(4));
1149
1150 MDNode *MD = MDNode::get(Mod->getContext(), {});
1151 LoadXY->setMetadata(LLVMContext::MD_invariant_load, MD);
1152 LoadZU->setMetadata(LLVMContext::MD_invariant_load, MD);
1153 ST.makeLIDRangeMetadata(LoadZU);
1154
1155 // Extract y component. Upper half of LoadZU should be zero already.
1156 Value *Y = Builder.CreateLShr(LoadXY, 16);
1157
1158 return std::pair(Y, LoadZU);
1159}
1160
1161Value *AMDGPUPromoteAllocaImpl::getWorkitemID(IRBuilder<> &Builder,
1162 unsigned N) {
1163 Function *F = Builder.GetInsertBlock()->getParent();
1164 const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, *F);
1165 Intrinsic::ID IntrID = Intrinsic::not_intrinsic;
1166 StringRef AttrName;
1167
1168 switch (N) {
1169 case 0:
1170 IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_x
1171 : (Intrinsic::ID)Intrinsic::r600_read_tidig_x;
1172 AttrName = "amdgpu-no-workitem-id-x";
1173 break;
1174 case 1:
1175 IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_y
1176 : (Intrinsic::ID)Intrinsic::r600_read_tidig_y;
1177 AttrName = "amdgpu-no-workitem-id-y";
1178 break;
1179
1180 case 2:
1181 IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_z
1182 : (Intrinsic::ID)Intrinsic::r600_read_tidig_z;
1183 AttrName = "amdgpu-no-workitem-id-z";
1184 break;
1185 default:
1186 llvm_unreachable("invalid dimension");
1187 }
1188
1189 Function *WorkitemIdFn = Intrinsic::getOrInsertDeclaration(Mod, IntrID);
1190 CallInst *CI = Builder.CreateCall(WorkitemIdFn);
1191 ST.makeLIDRangeMetadata(CI);
1192 F->removeFnAttr(AttrName);
1193
1194 return CI;
1195}
1196
1197static bool isCallPromotable(CallInst *CI) {
1198 IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
1199 if (!II)
1200 return false;
1201
1202 switch (II->getIntrinsicID()) {
1203 case Intrinsic::memcpy:
1204 case Intrinsic::memmove:
1205 case Intrinsic::memset:
1206 case Intrinsic::lifetime_start:
1207 case Intrinsic::lifetime_end:
1208 case Intrinsic::invariant_start:
1209 case Intrinsic::invariant_end:
1210 case Intrinsic::launder_invariant_group:
1211 case Intrinsic::strip_invariant_group:
1212 case Intrinsic::objectsize:
1213 return true;
1214 default:
1215 return false;
1216 }
1217}
1218
1219bool AMDGPUPromoteAllocaImpl::binaryOpIsDerivedFromSameAlloca(
1220 Value *BaseAlloca, Value *Val, Instruction *Inst, int OpIdx0,
1221 int OpIdx1) const {
1222 // Figure out which operand is the one we might not be promoting.
1223 Value *OtherOp = Inst->getOperand(OpIdx0);
1224 if (Val == OtherOp)
1225 OtherOp = Inst->getOperand(OpIdx1);
1226
1227 if (isa<ConstantPointerNull, ConstantAggregateZero>(OtherOp))
1228 return true;
1229
1230 // TODO: getUnderlyingObject will not work on a vector getelementptr
1231 Value *OtherObj = getUnderlyingObject(OtherOp);
1232 if (!isa<AllocaInst>(OtherObj))
1233 return false;
1234
1235 // TODO: We should be able to replace undefs with the right pointer type.
1236
1237 // TODO: If we know the other base object is another promotable
1238 // alloca, not necessarily this alloca, we can do this. The
1239 // important part is both must have the same address space at
1240 // the end.
1241 if (OtherObj != BaseAlloca) {
1242 LLVM_DEBUG(
1243 dbgs() << "Found a binary instruction with another alloca object\n");
1244 return false;
1245 }
1246
1247 return true;
1248}
1249
1250bool AMDGPUPromoteAllocaImpl::collectUsesWithPtrTypes(
1251 Value *BaseAlloca, Value *Val, std::vector<Value *> &WorkList) const {
1252
1253 for (User *User : Val->users()) {
1254 if (is_contained(WorkList, User))
1255 continue;
1256
1257 if (CallInst *CI = dyn_cast<CallInst>(User)) {
1258 if (!isCallPromotable(CI))
1259 return false;
1260
1261 WorkList.push_back(User);
1262 continue;
1263 }
1264
1265 Instruction *UseInst = cast<Instruction>(User);
1266 if (UseInst->getOpcode() == Instruction::PtrToInt)
1267 return false;
1268
1269 if (LoadInst *LI = dyn_cast<LoadInst>(UseInst)) {
1270 if (LI->isVolatile())
1271 return false;
1272 continue;
1273 }
1274
1275 if (StoreInst *SI = dyn_cast<StoreInst>(UseInst)) {
1276 if (SI->isVolatile())
1277 return false;
1278
1279 // Reject if the stored value is not the pointer operand.
1280 if (SI->getPointerOperand() != Val)
1281 return false;
1282 continue;
1283 }
1284
1285 if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(UseInst)) {
1286 if (RMW->isVolatile())
1287 return false;
1288 continue;
1289 }
1290
1291 if (AtomicCmpXchgInst *CAS = dyn_cast<AtomicCmpXchgInst>(UseInst)) {
1292 if (CAS->isVolatile())
1293 return false;
1294 continue;
1295 }
1296
1297 // Only promote a select if we know that the other select operand
1298 // is from another pointer that will also be promoted.
1299 if (ICmpInst *ICmp = dyn_cast<ICmpInst>(UseInst)) {
1300 if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, ICmp, 0, 1))
1301 return false;
1302
1303 // May need to rewrite constant operands.
1304 WorkList.push_back(ICmp);
1305 continue;
1306 }
1307
1308 if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(UseInst)) {
1309 // Be conservative if an address could be computed outside the bounds of
1310 // the alloca.
1311 if (!GEP->isInBounds())
1312 return false;
1313 } else if (SelectInst *SI = dyn_cast<SelectInst>(UseInst)) {
1314 // Only promote a select if we know that the other select operand is from
1315 // another pointer that will also be promoted.
1316 if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, SI, 1, 2))
1317 return false;
1318 } else if (PHINode *Phi = dyn_cast<PHINode>(UseInst)) {
1319 // Repeat for phis.
1320
1321 // TODO: Handle more complex cases. We should be able to replace loops
1322 // over arrays.
1323 switch (Phi->getNumIncomingValues()) {
1324 case 1:
1325 break;
1326 case 2:
1327 if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, Phi, 0, 1))
1328 return false;
1329 break;
1330 default:
1331 return false;
1332 }
1333 } else if (!isa<ExtractElementInst>(User)) {
1334 // Do not promote vector/aggregate type instructions. It is hard to track
1335 // their users.
1336
1337 // Do not promote addrspacecast.
1338 //
1339 // TODO: If we know the address is only observed through flat pointers, we
1340 // could still promote.
1341 return false;
1342 }
1343
1344 WorkList.push_back(User);
1345 if (!collectUsesWithPtrTypes(BaseAlloca, User, WorkList))
1346 return false;
1347 }
1348
1349 return true;
1350}
1351
1352bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) {
1353
1354 FunctionType *FTy = F.getFunctionType();
1355 const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
1356
1357 // If the function has any arguments in the local address space, then it's
1358 // possible these arguments require the entire local memory space, so
1359 // we cannot use local memory in the pass.
1360 for (Type *ParamTy : FTy->params()) {
1361 PointerType *PtrTy = dyn_cast<PointerType>(ParamTy);
1362 if (PtrTy && PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
1363 LocalMemLimit = 0;
1364 LLVM_DEBUG(dbgs() << "Function has local memory argument. Promoting to "
1365 "local memory disabled.\n");
1366 return false;
1367 }
1368 }
1369
1370 LocalMemLimit = ST.getAddressableLocalMemorySize();
1371 if (LocalMemLimit == 0)
1372 return false;
1373
1374 SmallPtrSet<const GlobalVariable *, 8> UsedLDS;
1375 SmallPtrSet<const Constant *, 8> VisitedConstants;
1376 SmallVector<const Constant *, 8> Stack;
1377
1378 auto visitUsers = [&](const GlobalVariable *GV, const Constant *Val) -> bool {
1379 for (const User *U : Val->users()) {
1380 if (const Instruction *Use = dyn_cast<Instruction>(U)) {
1381 if (Use->getParent()->getParent() == &F)
1382 return true;
1383 } else {
1384 const Constant *C = cast<Constant>(U);
1385 if (VisitedConstants.insert(C).second)
1386 Stack.push_back(C);
1387 }
1388 }
1389
1390 return false;
1391 };
1392
1393 for (GlobalVariable &GV : Mod->globals()) {
1394 if (GV.getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
1395 continue;
1396
1397 if (visitUsers(&GV, &GV)) {
1398 UsedLDS.insert(&GV);
1399 Stack.clear();
1400 continue;
1401 }
1402
1403 // For any ConstantExpr uses, we need to recursively search the users until
1404 // we see a function.
1405 while (!Stack.empty()) {
1406 const Constant *C = Stack.pop_back_val();
1407 if (visitUsers(&GV, C)) {
1408 UsedLDS.insert(&GV);
1409 Stack.clear();
1410 break;
1411 }
1412 }
1413 }
1414
1415 const DataLayout &DL = Mod->getDataLayout();
1416 SmallVector<std::pair<uint64_t, Align>, 16> AllocatedSizes;
1417 AllocatedSizes.reserve(UsedLDS.size());
1418
1419 for (const GlobalVariable *GV : UsedLDS) {
1420 Align Alignment =
1421 DL.getValueOrABITypeAlignment(GV->getAlign(), GV->getValueType());
1422 uint64_t AllocSize = DL.getTypeAllocSize(GV->getValueType());
1423
1424 // HIP uses an extern unsized array in local address space for dynamically
1425 // allocated shared memory. In that case, we have to disable the promotion.
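// E.g. HIP's "extern __shared__ int dynShared[];" typically lowers to an
// external, zero-sized LDS global such as
// "@dynShared = external addrspace(3) global [0 x i32]", whose size is only
// known at launch time.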
1426 if (GV->hasExternalLinkage() && AllocSize == 0) {
1427 LocalMemLimit = 0;
1428 LLVM_DEBUG(dbgs() << "Function has a reference to externally allocated "
1429 "local memory. Promoting to local memory "
1430 "disabled.\n");
1431 return false;
1432 }
1433
1434 AllocatedSizes.emplace_back(AllocSize, Alignment);
1435 }
1436
1437 // Sort to try to estimate the worst case alignment padding
1438 //
1439 // FIXME: We should really do something to fix the addresses to a more optimal
1440 // value instead
1441 llvm::sort(AllocatedSizes, llvm::less_second());
1442
1443 // Check how much local memory is being used by global objects
1444 CurrentLocalMemUsage = 0;
1445
1446 // FIXME: Try to account for padding here. The real padding and address are
1447 // currently determined from the inverse order of uses in the function when
1448 // legalizing, which could also potentially change. We try to estimate the
1449 // worst case here, but we probably should fix the addresses earlier.
1450 for (auto Alloc : AllocatedSizes) {
1451 CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Alloc.second);
1452 CurrentLocalMemUsage += Alloc.first;
1453 }
1454
1455 unsigned MaxOccupancy =
1456 ST.getWavesPerEU(ST.getFlatWorkGroupSizes(F), CurrentLocalMemUsage, F)
1457 .second;
1458
1459 // Round up to the next tier of usage.
1460 unsigned MaxSizeWithWaveCount =
1461 ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy, F);
1462
1463 // Program may already use more LDS than is usable at maximum occupancy.
1464 if (CurrentLocalMemUsage > MaxSizeWithWaveCount)
1465 return false;
1466
1467 LocalMemLimit = MaxSizeWithWaveCount;
1468
1469 LLVM_DEBUG(dbgs() << F.getName() << " uses " << CurrentLocalMemUsage
1470 << " bytes of LDS\n"
1471 << " Rounding size to " << MaxSizeWithWaveCount
1472 << " with a maximum occupancy of " << MaxOccupancy << '\n'
1473 << " and " << (LocalMemLimit - CurrentLocalMemUsage)
1474 << " available for promotion\n");
1475
1476 return true;
1477}
1478
1479// FIXME: Should try to pick the most likely to be profitable allocas first.
1480bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaInst &I,
1481 bool SufficientLDS) {
1482 LLVM_DEBUG(dbgs() << "Trying to promote to LDS: " << I << '\n');
1483
1484 if (DisablePromoteAllocaToLDS) {
1485 LLVM_DEBUG(dbgs() << " Promote alloca to LDS is disabled\n");
1486 return false;
1487 }
1488
1489 const DataLayout &DL = Mod->getDataLayout();
1490 IRBuilder<> Builder(&I);
1491
1492 const Function &ContainingFunction = *I.getParent()->getParent();
1493 CallingConv::ID CC = ContainingFunction.getCallingConv();
1494
1495 // Don't promote the alloca to LDS for shader calling conventions as the work
1496 // item ID intrinsics are not supported for these calling conventions.
1497 // Furthermore not all LDS is available for some of the stages.
1498 switch (CC) {
1499 case CallingConv::AMDGPU_KERNEL:
1500 case CallingConv::SPIR_KERNEL:
1501 break;
1502 default:
1503 LLVM_DEBUG(
1504 dbgs()
1505 << " promote alloca to LDS not supported with calling convention.\n");
1506 return false;
1507 }
1508
1509 // Not likely to have sufficient local memory for promotion.
1510 if (!SufficientLDS)
1511 return false;
1512
1513 const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, ContainingFunction);
1514 unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second;
1515
1516 Align Alignment =
1517 DL.getValueOrABITypeAlignment(I.getAlign(), I.getAllocatedType());
1518
1519 // FIXME: This computed padding is likely wrong since it depends on inverse
1520 // usage order.
1521 //
1522 // FIXME: It is also possible that if we're allowed to use all of the memory,
1523 // we could end up using more than the maximum due to alignment padding.
1524
1525 uint32_t NewSize = alignTo(CurrentLocalMemUsage, Alignment);
1526 uint32_t AllocSize =
1527 WorkGroupSize * DL.getTypeAllocSize(I.getAllocatedType());
1528 NewSize += AllocSize;
1529
1530 if (NewSize > LocalMemLimit) {
1531 LLVM_DEBUG(dbgs() << " " << AllocSize
1532 << " bytes of local memory not available to promote\n");
1533 return false;
1534 }
1535
1536 CurrentLocalMemUsage = NewSize;
1537
1538 std::vector<Value *> WorkList;
1539
1540 if (!collectUsesWithPtrTypes(&I, &I, WorkList)) {
1541 LLVM_DEBUG(dbgs() << " Do not know how to convert all uses\n");
1542 return false;
1543 }
1544
1545 LLVM_DEBUG(dbgs() << "Promoting alloca to local memory\n");
1546
1547 Function *F = I.getParent()->getParent();
1548
1549 Type *GVTy = ArrayType::get(I.getAllocatedType(), WorkGroupSize);
1550 GlobalVariable *GV = new GlobalVariable(
1551 *Mod, GVTy, false, GlobalValue::InternalLinkage, PoisonValue::get(GVTy),
1552 Twine(F->getName()) + Twine('.') + I.getName(), nullptr,
1553 GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS);
1554 GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
1555 GV->setAlignment(I.getAlign());
1556
1557 Value *TCntY, *TCntZ;
1558
1559 std::tie(TCntY, TCntZ) = getLocalSizeYZ(Builder);
1560 Value *TIdX = getWorkitemID(Builder, 0);
1561 Value *TIdY = getWorkitemID(Builder, 1);
1562 Value *TIdZ = getWorkitemID(Builder, 2);
1563
1564 Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ, "", true, true);
1565 Tmp0 = Builder.CreateMul(Tmp0, TIdX);
1566 Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ, "", true, true);
1567 Value *TID = Builder.CreateAdd(Tmp0, Tmp1);
1568 TID = Builder.CreateAdd(TID, TIdZ);
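// The flat id built above is TIdX * (TCntY * TCntZ) + TIdY * TCntZ + TIdZ.
// E.g. in an 8x4x2 workgroup the work item at (x, y, z) = (3, 2, 1) gets
// 3 * (4 * 2) + 2 * 2 + 1 = 29, so each work item indexes a distinct row of
// the per-workgroup LDS array created below.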
1569
1570 LLVMContext &Context = Mod->getContext();
1571 Value *Indices[] = {Constant::getNullValue(Type::getInt32Ty(Context)), TID};
1572
1573 Value *Offset = Builder.CreateInBoundsGEP(GVTy, GV, Indices);
1574 I.mutateType(Offset->getType());
1575 I.replaceAllUsesWith(Offset);
1576 I.eraseFromParent();
1577
1578 SmallVector<IntrinsicInst *> DeferredIntrs;
1579
1580 Type *NewPtrTy = PointerType::get(Context, AMDGPUAS::LOCAL_ADDRESS);
1581
1582 for (Value *V : WorkList) {
1583 CallInst *Call = dyn_cast<CallInst>(V);
1584 if (!Call) {
1585 if (ICmpInst *CI = dyn_cast<ICmpInst>(V)) {
1586 Value *LHS = CI->getOperand(0);
1587 Value *RHS = CI->getOperand(1);
1588
1589 Type *NewTy = LHS->getType()->getWithNewType(NewPtrTy);
1590 if (isa<ConstantPointerNull>(LHS))
1591 CI->setOperand(0, Constant::getNullValue(NewTy));
1592
1593 if (isa<ConstantPointerNull>(RHS))
1594 CI->setOperand(1, Constant::getNullValue(NewTy));
1595
1596 continue;
1597 }
1598
1599 // The operand's value should be corrected on its own and we don't want to
1600 // touch the users.
1601 if (isa<AddrSpaceCastInst>(V))
1602 continue;
1603
1604 assert(V->getType()->isPtrOrPtrVectorTy());
1605
1606 Type *NewTy = V->getType()->getWithNewType(NewPtrTy);
1607 V->mutateType(NewTy);
1608
1609 // Adjust the types of any constant operands.
1610 if (SelectInst *SI = dyn_cast<SelectInst>(V)) {
1611 if (isa<ConstantPointerNull>(SI->getOperand(1)))
1612 SI->setOperand(1, Constant::getNullValue(NewTy));
1613
1614 if (isa<ConstantPointerNull>(SI->getOperand(2)))
1615 SI->setOperand(2, Constant::getNullValue(NewTy));
1616 } else if (PHINode *Phi = dyn_cast<PHINode>(V)) {
1617 for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) {
1618 if (isa<ConstantPointerNull>(
1619 Phi->getIncomingValue(I)))
1620 Phi->setIncomingValue(I, Constant::getNullValue(NewTy));
1621 }
1622 }
1623
1624 continue;
1625 }
1626
1627 IntrinsicInst *Intr = cast<IntrinsicInst>(Call);
1628 Builder.SetInsertPoint(Intr);
1629 switch (Intr->getIntrinsicID()) {
1630 case Intrinsic::lifetime_start:
1631 case Intrinsic::lifetime_end:
1632 // These intrinsics are for address space 0 only
1633 Intr->eraseFromParent();
1634 continue;
1635 case Intrinsic::memcpy:
1636 case Intrinsic::memmove:
1637 // These have 2 pointer operands. In case the second pointer also needs
1638 // to be replaced, we defer processing of these intrinsics until all
1639 // other values are processed.
1640 DeferredIntrs.push_back(Intr);
1641 continue;
1642 case Intrinsic::memset: {
1643 MemSetInst *MemSet = cast<MemSetInst>(Intr);
1644 Builder.CreateMemSet(MemSet->getRawDest(), MemSet->getValue(),
1645 MemSet->getLength(), MemSet->getDestAlign(),
1646 MemSet->isVolatile());
1647 Intr->eraseFromParent();
1648 continue;
1649 }
1650 case Intrinsic::invariant_start:
1651 case Intrinsic::invariant_end:
1652 case Intrinsic::launder_invariant_group:
1653 case Intrinsic::strip_invariant_group: {
1654 SmallVector<Value *, 2> Args;
1655 if (Intr->getIntrinsicID() == Intrinsic::invariant_start) {
1656 Args.emplace_back(Intr->getArgOperand(0));
1657 } else if (Intr->getIntrinsicID() == Intrinsic::invariant_end) {
1658 Args.emplace_back(Intr->getArgOperand(0));
1659 Args.emplace_back(Intr->getArgOperand(1));
1660 }
1661 Args.emplace_back(Offset);
1662 Function *F = Intrinsic::getOrInsertDeclaration(
1663 Intr->getModule(), Intr->getIntrinsicID(), Offset->getType());
1664 CallInst *NewIntr =
1665 CallInst::Create(F, Args, Intr->getName(), Intr->getIterator());
1666 Intr->mutateType(NewIntr->getType());
1667 Intr->replaceAllUsesWith(NewIntr);
1668 Intr->eraseFromParent();
1669 continue;
1670 }
1671 case Intrinsic::objectsize: {
1672 Value *Src = Intr->getOperand(0);
1673
1674 CallInst *NewCall = Builder.CreateIntrinsic(
1675 Intrinsic::objectsize,
1677 {Src, Intr->getOperand(1), Intr->getOperand(2), Intr->getOperand(3)});
1678 Intr->replaceAllUsesWith(NewCall);
1679 Intr->eraseFromParent();
1680 continue;
1681 }
1682 default:
1683 Intr->print(errs());
1684 llvm_unreachable("Don't know how to promote alloca intrinsic use.");
1685 }
1686 }
1687
1688 for (IntrinsicInst *Intr : DeferredIntrs) {
1689 Builder.SetInsertPoint(Intr);
1691 assert(ID == Intrinsic::memcpy || ID == Intrinsic::memmove);
1692
1693 MemTransferInst *MI = cast<MemTransferInst>(Intr);
1694 auto *B = Builder.CreateMemTransferInst(
1695 ID, MI->getRawDest(), MI->getDestAlign(), MI->getRawSource(),
1696 MI->getSourceAlign(), MI->getLength(), MI->isVolatile());
1697
1698 for (unsigned I = 0; I != 2; ++I) {
1699 if (uint64_t Bytes = Intr->getParamDereferenceableBytes(I)) {
1700 B->addDereferenceableParamAttr(I, Bytes);
1701 }
1702 }
1703
1704 Intr->eraseFromParent();
1705 }
1706
1707 return true;
1708}