AMDGPU.cpp
1//===- AMDGPU.cpp ---------------------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#include "ABIInfoImpl.h"
10#include "TargetInfo.h"
11#include "llvm/ADT/StringExtras.h"
12#include "llvm/Support/AMDGPUAddrSpace.h"
13
14using namespace clang;
15using namespace clang::CodeGen;
16
17//===----------------------------------------------------------------------===//
18// AMDGPU ABI Implementation
19//===----------------------------------------------------------------------===//
20
21namespace {
22
23class AMDGPUABIInfo final : public DefaultABIInfo {
24private:
25 static const unsigned MaxNumRegsForArgsRet = 16;
26
27 unsigned numRegsForType(QualType Ty) const;
28
29 bool isHomogeneousAggregateBaseType(QualType Ty) const override;
30 bool isHomogeneousAggregateSmallEnough(const Type *Base,
31 uint64_t Members) const override;
32
33 // Coerce HIP scalar pointer arguments from generic pointers to global ones.
34 llvm::Type *coerceKernelArgumentType(llvm::Type *Ty, unsigned FromAS,
35 unsigned ToAS) const {
36 // Single value types.
37 auto *PtrTy = llvm::dyn_cast<llvm::PointerType>(Ty);
38 if (PtrTy && PtrTy->getAddressSpace() == FromAS)
39 return llvm::PointerType::get(Ty->getContext(), ToAS);
40 return Ty;
41 }
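// Illustrative sketch, not part of the upstream file: for a HIP kernel such as
//
//   __global__ void scale(float *p, float s) { p[0] *= s; }   // hypothetical
//
// the scalar pointer parameter starts out in the generic address space (0) and
// is rewritten by coerceKernelArgumentType to the global address space (1), so
// the emitted entry point looks roughly like
//
//   define amdgpu_kernel void @scale(ptr addrspace(1) %p, float %s)
//
// Non-pointer arguments and pointers already in a named address space pass
// through unchanged.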
42
43public:
44 explicit AMDGPUABIInfo(CodeGen::CodeGenTypes &CGT) :
45 DefaultABIInfo(CGT) {}
46
47 ABIArgInfo classifyReturnType(QualType RetTy) const;
48 ABIArgInfo classifyKernelArgumentType(QualType Ty) const;
49 ABIArgInfo classifyArgumentType(QualType Ty, bool Variadic,
50 unsigned &NumRegsLeft) const;
51
52 void computeInfo(CGFunctionInfo &FI) const override;
53 RValue EmitVAArg(CodeGenFunction &CGF, Address VAListAddr, QualType Ty,
54 AggValueSlot Slot) const override;
55
56 llvm::FixedVectorType *
57 getOptimalVectorMemoryType(llvm::FixedVectorType *T,
58 const LangOptions &Opt) const override {
59 // We have legal instructions for 96-bit so 3x32 can be supported.
60 // FIXME: This check should be a subtarget feature as technically SI doesn't
61 // support it.
62 if (T->getNumElements() == 3 && getDataLayout().getTypeSizeInBits(T) == 96)
63 return T;
64 return DefaultABIInfo::getOptimalVectorMemoryType(T, Opt);
65 }
66};
67
68bool AMDGPUABIInfo::isHomogeneousAggregateBaseType(QualType Ty) const {
69 return true;
70}
71
72bool AMDGPUABIInfo::isHomogeneousAggregateSmallEnough(
73 const Type *Base, uint64_t Members) const {
74 uint32_t NumRegs = (getContext().getTypeSize(Base) + 31) / 32;
75
76 // Homogeneous Aggregates may occupy at most 16 registers.
77 return Members * NumRegs <= MaxNumRegsForArgsRet;
78}
79
80/// Estimate number of registers the type will use when passed in registers.
81unsigned AMDGPUABIInfo::numRegsForType(QualType Ty) const {
82 unsigned NumRegs = 0;
83
84 if (const VectorType *VT = Ty->getAs<VectorType>()) {
85 // Compute from the number of elements. The reported size is based on the
86 // in-memory size, which includes the padding 4th element for 3-vectors.
87 QualType EltTy = VT->getElementType();
88 unsigned EltSize = getContext().getTypeSize(EltTy);
89
90 // 16-bit element vectors should be passed as packed.
91 if (EltSize == 16)
92 return (VT->getNumElements() + 1) / 2;
93
94 unsigned EltNumRegs = (EltSize + 31) / 32;
95 return EltNumRegs * VT->getNumElements();
96 }
97
98 if (const auto *RD = Ty->getAsRecordDecl()) {
99 assert(!RD->hasFlexibleArrayMember());
100
101 for (const FieldDecl *Field : RD->fields()) {
102 QualType FieldTy = Field->getType();
103 NumRegs += numRegsForType(FieldTy);
104 }
105
106 return NumRegs;
107 }
108
109 return (getContext().getTypeSize(Ty) + 31) / 32;
110}
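// Worked example, illustrative only, using the 32-bit register accounting
// implemented above:
//
//   float4 f4;                        // 4 x 32-bit elements  -> 4 registers
//   half4  h4;                        // 16-bit elements packed -> 2 registers
//   double d;                         // (64 + 31) / 32        -> 2 registers
//   struct S { float x; half2 y; };   // 1 + 1                 -> 2 registers
//
// Because records are charged per field, padding in the in-memory layout does
// not inflate the estimate.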
111
112void AMDGPUABIInfo::computeInfo(CGFunctionInfo &FI) const {
113 llvm::CallingConv::ID CC = FI.getCallingConvention();
114
115 if (!getCXXABI().classifyReturnType(FI))
116 FI.getReturnInfo() = classifyReturnType(FI.getReturnType());
117
118 unsigned ArgumentIndex = 0;
119 const unsigned numFixedArguments = FI.getNumRequiredArgs();
120
121 unsigned NumRegsLeft = MaxNumRegsForArgsRet;
122 for (auto &Arg : FI.arguments()) {
123 if (CC == llvm::CallingConv::AMDGPU_KERNEL) {
124 Arg.info = classifyKernelArgumentType(Arg.type);
125 } else {
126 bool FixedArgument = ArgumentIndex++ < numFixedArguments;
127 Arg.info = classifyArgumentType(Arg.type, !FixedArgument, NumRegsLeft);
128 }
129 }
130}
131
132RValue AMDGPUABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
133 QualType Ty, AggValueSlot Slot) const {
134 const bool IsIndirect = false;
135 const bool AllowHigherAlign = false;
136 return emitVoidPtrVAArg(CGF, VAListAddr, Ty, IsIndirect,
137 getContext().getTypeInfoInChars(Ty),
138 CharUnits::fromQuantity(4), AllowHigherAlign, Slot);
139}
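// Minimal sketch of the resulting va_arg lowering (illustrative; the function
// below is hypothetical user code): every variadic argument is fetched through
// the common void* va_list scheme with a 4-byte slot size and no re-alignment,
// so for
//
//   int sum(int n, ...);
//
// each va_arg(ap, int) loads from the current cursor and then advances it by
// the argument size rounded up to 4 bytes.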
140
141ABIArgInfo AMDGPUABIInfo::classifyReturnType(QualType RetTy) const {
142 if (isAggregateTypeForABI(RetTy)) {
143 // Records with non-trivial destructors/copy-constructors should not be
144 // returned by value.
145 if (!getRecordArgABI(RetTy, getCXXABI())) {
146 // Ignore empty structs/unions.
147 if (isEmptyRecord(getContext(), RetTy, true))
148 return ABIArgInfo::getIgnore();
149
150 // Lower single-element structs to just return a regular value.
151 if (const Type *SeltTy = isSingleElementStruct(RetTy, getContext()))
152 return ABIArgInfo::getDirect(CGT.ConvertType(QualType(SeltTy, 0)));
153
154 if (const auto *RD = RetTy->getAsRecordDecl();
155 RD && RD->hasFlexibleArrayMember())
156 return DefaultABIInfo::classifyReturnType(RetTy);
157
158 // Pack aggregates <= 8 bytes into single VGPR or pair.
159 uint64_t Size = getContext().getTypeSize(RetTy);
160 if (Size <= 16)
161 return ABIArgInfo::getDirect(llvm::Type::getInt16Ty(getVMContext()));
162
163 if (Size <= 32)
164 return ABIArgInfo::getDirect(llvm::Type::getInt32Ty(getVMContext()));
165
166 if (Size <= 64) {
167 llvm::Type *I32Ty = llvm::Type::getInt32Ty(getVMContext());
168 return ABIArgInfo::getDirect(llvm::ArrayType::get(I32Ty, 2));
169 }
170
171 if (numRegsForType(RetTy) <= MaxNumRegsForArgsRet)
172 return ABIArgInfo::getDirect();
173 }
174 }
175
176 // Otherwise just do the default thing.
177 return DefaultABIInfo::classifyReturnType(RetTy);
178}
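// Illustrative mapping of small aggregate returns (struct names are made up,
// not from the source):
//
//   struct A { char c; };            // <= 16 bits -> returned as i16
//   struct B { short x; short y; };  // <= 32 bits -> returned as i32
//   struct C { int x; float y; };    // <= 64 bits -> returned as [2 x i32]
//   struct D { float v[20]; };       // > 16 registers -> default (indirect) return
//
// Empty records and single-element structs are filtered out earlier and never
// reach the packing code.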
179
180/// For kernels all parameters are really passed in a special buffer. It doesn't
181/// make sense to pass anything byval, so everything must be direct.
182ABIArgInfo AMDGPUABIInfo::classifyKernelArgumentType(QualType Ty) const {
183 Ty = useFirstFieldIfTransparentUnion(Ty);
184
185 // TODO: Can we omit empty structs?
186
187 if (const Type *SeltTy = isSingleElementStruct(Ty, getContext()))
188 Ty = QualType(SeltTy, 0);
189
190 llvm::Type *OrigLTy = CGT.ConvertType(Ty);
191 llvm::Type *LTy = OrigLTy;
192 if (getContext().getLangOpts().HIP) {
193 LTy = coerceKernelArgumentType(
194 OrigLTy, /*FromAS=*/getContext().getTargetAddressSpace(LangAS::Default),
195 /*ToAS=*/getContext().getTargetAddressSpace(LangAS::cuda_device));
196 }
197
198 // FIXME: This doesn't apply the optimization of coercing pointers in structs
199 // to global address space when using byref. This would require implementing a
200 // new kind of coercion of the in-memory type for indirect arguments.
201 if (LTy == OrigLTy && isAggregateTypeForABI(Ty)) {
202 return ABIArgInfo::getIndirectAliased(
203 getContext().getTypeAlignInChars(Ty),
204 getContext().getTargetAddressSpace(LangAS::opencl_constant),
205 false /*Realign*/, nullptr /*Padding*/);
206 }
207
208 // If we set CanBeFlattened to true, CodeGen will expand the struct to its
209 // individual elements, which confuses the Clover OpenCL backend; therefore we
210 // have to set it to false here. Other args of getDirect() are just defaults.
211 return ABIArgInfo::getDirect(LTy, 0, nullptr, false);
212}
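// Minimal sketch, assuming a HIP kernel with a pointer-free aggregate argument
// (names are hypothetical): such an argument is passed indirectly through the
// constant address space, e.g.
//
//   struct Params { int n; float scale; };
//   __global__ void run(Params p);
//
// lowers to roughly
//
//   define amdgpu_kernel void @run(ptr addrspace(4) byref(%struct.Params) %p)
//
// while scalars, pointers and single-element structs stay direct, with
// CanBeFlattened forced off as described above.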
213
214ABIArgInfo AMDGPUABIInfo::classifyArgumentType(QualType Ty, bool Variadic,
215 unsigned &NumRegsLeft) const {
216 assert(NumRegsLeft <= MaxNumRegsForArgsRet && "register estimate underflow");
217
218 Ty = useFirstFieldIfTransparentUnion(Ty);
219
220 if (Variadic) {
221 return ABIArgInfo::getDirect(/*T=*/nullptr,
222 /*Offset=*/0,
223 /*Padding=*/nullptr,
224 /*CanBeFlattened=*/false,
225 /*Align=*/0);
226 }
227
228 if (isAggregateTypeForABI(Ty)) {
229 // Records with non-trivial destructors/copy-constructors should not be
230 // passed by value.
231 if (auto RAA = getRecordArgABI(Ty, getCXXABI()))
232 return getNaturalAlignIndirect(Ty, getDataLayout().getAllocaAddrSpace(),
233 RAA == CGCXXABI::RAA_DirectInMemory);
234
235 // Ignore empty structs/unions.
236 if (isEmptyRecord(getContext(), Ty, true))
237 return ABIArgInfo::getIgnore();
238
239 // Lower single-element structs to just pass a regular value. TODO: We
240 // could do reasonable-size multiple-element structs too, using getExpand(),
241 // though watch out for things like bitfields.
242 if (const Type *SeltTy = isSingleElementStruct(Ty, getContext()))
243 return ABIArgInfo::getDirect(CGT.ConvertType(QualType(SeltTy, 0)));
244
245 if (const auto *RD = Ty->getAsRecordDecl();
246 RD && RD->hasFlexibleArrayMember())
247 return DefaultABIInfo::classifyArgumentType(Ty);
248
249 // Pack aggregates <= 8 bytes into single VGPR or pair.
250 uint64_t Size = getContext().getTypeSize(Ty);
251 if (Size <= 64) {
252 unsigned NumRegs = (Size + 31) / 32;
253 NumRegsLeft -= std::min(NumRegsLeft, NumRegs);
254
255 if (Size <= 16)
256 return ABIArgInfo::getDirect(llvm::Type::getInt16Ty(getVMContext()));
257
258 if (Size <= 32)
259 return ABIArgInfo::getDirect(llvm::Type::getInt32Ty(getVMContext()));
260
261 // XXX: Should this be i64 instead, and should the limit increase?
262 llvm::Type *I32Ty = llvm::Type::getInt32Ty(getVMContext());
263 return ABIArgInfo::getDirect(llvm::ArrayType::get(I32Ty, 2));
264 }
265
266 if (NumRegsLeft > 0) {
267 unsigned NumRegs = numRegsForType(Ty);
268 if (NumRegsLeft >= NumRegs) {
269 NumRegsLeft -= NumRegs;
270 return ABIArgInfo::getDirect();
271 }
272 }
273
274 // Use pass-by-reference instead of pass-by-value for struct arguments in
275 // the function ABI.
276 return ABIArgInfo::getIndirectAliased(
277 getContext().getTypeAlignInChars(Ty),
278 getContext().getTargetAddressSpace(LangAS::opencl_private));
279 }
280
281 // Otherwise just do the default thing.
282 ABIArgInfo ArgInfo = DefaultABIInfo::classifyArgumentType(Ty);
283 if (!ArgInfo.isIndirect()) {
284 unsigned NumRegs = numRegsForType(Ty);
285 NumRegsLeft -= std::min(NumRegs, NumRegsLeft);
286 }
287
288 return ArgInfo;
289}
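// Worked example for non-kernel device functions, assuming a fresh 16-register
// budget (types are illustrative):
//
//   struct V2   { int a; int b; };    // 8 bytes      -> packed as [2 x i32]
//   struct Big  { float f[10]; };     // 10 registers -> direct while budget remains
//   struct Huge { float f[32]; };     // 32 registers -> byref in private (addrspace 5)
//
// Once NumRegsLeft is exhausted, later aggregates also take the
// pass-by-reference path above.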
290
291class AMDGPUTargetCodeGenInfo : public TargetCodeGenInfo {
292public:
293 AMDGPUTargetCodeGenInfo(CodeGenTypes &CGT)
294 : TargetCodeGenInfo(std::make_unique<AMDGPUABIInfo>(CGT)) {}
295
296 bool supportsLibCall() const override { return false; }
297 void setFunctionDeclAttributes(const FunctionDecl *FD, llvm::Function *F,
298 CodeGenModule &CGM) const;
299
300 void setTargetAttributes(const Decl *D, llvm::GlobalValue *GV,
301 CodeGen::CodeGenModule &M) const override;
302 unsigned getDeviceKernelCallingConv() const override;
303
304 llvm::Constant *getNullPointer(const CodeGen::CodeGenModule &CGM,
305 llvm::PointerType *T, QualType QT) const override;
306
307 LangAS getASTAllocaAddressSpace() const override {
308 return getLangASFromTargetAS(
309 getABIInfo().getDataLayout().getAllocaAddrSpace());
310 }
311 LangAS getGlobalVarAddressSpace(CodeGenModule &CGM,
312 const VarDecl *D) const override;
313 llvm::SyncScope::ID getLLVMSyncScopeID(const LangOptions &LangOpts,
314 SyncScope Scope,
315 llvm::AtomicOrdering Ordering,
316 llvm::LLVMContext &Ctx) const override;
317 void setTargetAtomicMetadata(CodeGenFunction &CGF,
318 llvm::Instruction &AtomicInst,
319 const AtomicExpr *Expr = nullptr) const override;
320 llvm::Value *createEnqueuedBlockKernel(CodeGenFunction &CGF,
321 llvm::Function *BlockInvokeFunc,
322 llvm::Type *BlockTy) const override;
323 bool shouldEmitStaticExternCAliases() const override;
324 bool shouldEmitDWARFBitFieldSeparators() const override;
325 void setCUDAKernelCallingConvention(const FunctionType *&FT) const override;
326};
327}
328
329static bool requiresAMDGPUProtectedVisibility(const Decl *D,
330 llvm::GlobalValue *GV) {
331 if (GV->getVisibility() != llvm::GlobalValue::HiddenVisibility)
332 return false;
333
334 return !D->hasAttr<OMPDeclareTargetDeclAttr>() &&
335 (D->hasAttr<DeviceKernelAttr>() ||
336 (isa<FunctionDecl>(D) && D->hasAttr<CUDAGlobalAttr>()) ||
337 (isa<VarDecl>(D) &&
338 (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>() ||
339 cast<VarDecl>(D)->getType()->isCUDADeviceBuiltinSurfaceType() ||
340 cast<VarDecl>(D)->getType()->isCUDADeviceBuiltinTextureType())));
341}
342
343void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
344 const FunctionDecl *FD, llvm::Function *F, CodeGenModule &M) const {
345 const auto *ReqdWGS =
346 M.getLangOpts().OpenCL ? FD->getAttr<ReqdWorkGroupSizeAttr>() : nullptr;
347 const bool IsOpenCLKernel =
348 M.getLangOpts().OpenCL && FD->hasAttr<DeviceKernelAttr>();
349 const bool IsHIPKernel = M.getLangOpts().HIP && FD->hasAttr<CUDAGlobalAttr>();
350
351 const auto *FlatWGS = FD->getAttr<AMDGPUFlatWorkGroupSizeAttr>();
352 if (ReqdWGS || FlatWGS) {
353 M.handleAMDGPUFlatWorkGroupSizeAttr(F, FlatWGS, ReqdWGS);
354 } else if (IsOpenCLKernel || IsHIPKernel) {
355 // By default, restrict the maximum size to a value specified by
356 // --gpu-max-threads-per-block=n or its default value for HIP.
357 const unsigned OpenCLDefaultMaxWorkGroupSize = 256;
358 const unsigned DefaultMaxWorkGroupSize =
359 IsOpenCLKernel ? OpenCLDefaultMaxWorkGroupSize
360 : M.getLangOpts().GPUMaxThreadsPerBlock;
361 std::string AttrVal =
362 std::string("1,") + llvm::utostr(DefaultMaxWorkGroupSize);
363 F->addFnAttr("amdgpu-flat-work-group-size", AttrVal);
364 }
365
366 if (const auto *Attr = FD->getAttr<AMDGPUWavesPerEUAttr>())
367 M.handleAMDGPUWavesPerEUAttr(F, Attr);
368
369 if (const auto *Attr = FD->getAttr<AMDGPUNumSGPRAttr>()) {
370 unsigned NumSGPR = Attr->getNumSGPR();
371
372 if (NumSGPR != 0)
373 F->addFnAttr("amdgpu-num-sgpr", llvm::utostr(NumSGPR));
374 }
375
376 if (const auto *Attr = FD->getAttr<AMDGPUNumVGPRAttr>()) {
377 uint32_t NumVGPR = Attr->getNumVGPR();
378
379 if (NumVGPR != 0)
380 F->addFnAttr("amdgpu-num-vgpr", llvm::utostr(NumVGPR));
381 }
382
383 if (const auto *Attr = FD->getAttr<AMDGPUMaxNumWorkGroupsAttr>()) {
384 uint32_t X = Attr->getMaxNumWorkGroupsX()
385 ->EvaluateKnownConstInt(M.getContext())
386 .getExtValue();
387 // Y and Z dimensions default to 1 if not specified
388 uint32_t Y = Attr->getMaxNumWorkGroupsY()
389 ? Attr->getMaxNumWorkGroupsY()
390 ->EvaluateKnownConstInt(M.getContext())
391 .getExtValue()
392 : 1;
393 uint32_t Z = Attr->getMaxNumWorkGroupsZ()
394 ? Attr->getMaxNumWorkGroupsZ()
395 ->EvaluateKnownConstInt(M.getContext())
396 .getExtValue()
397 : 1;
398
399 llvm::SmallString<32> AttrVal;
400 llvm::raw_svector_ostream OS(AttrVal);
401 OS << X << ',' << Y << ',' << Z;
402
403 F->addFnAttr("amdgpu-max-num-workgroups", AttrVal.str());
404 }
405}
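// Example of the resulting IR attributes, assuming the default HIP
// --gpu-max-threads-per-block=1024 (kernel names are hypothetical):
//
//   __global__ void k();                     // "amdgpu-flat-work-group-size"="1,1024"
//   __attribute__((amdgpu_waves_per_eu(2, 4)))
//   __global__ void k2();                    // adds "amdgpu-waves-per-eu"="2,4"
//   __attribute__((amdgpu_max_num_work_groups(16)))
//   __global__ void k3();                    // adds "amdgpu-max-num-workgroups"="16,1,1"
//
// OpenCL kernels without an explicit attribute default to "1,256" instead.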
406
407void AMDGPUTargetCodeGenInfo::setTargetAttributes(
408 const Decl *D, llvm::GlobalValue *GV, CodeGen::CodeGenModule &M) const {
409 if (requiresAMDGPUProtectedVisibility(D, GV)) {
410 GV->setVisibility(llvm::GlobalValue::ProtectedVisibility);
411 GV->setDSOLocal(true);
412 }
413
414 if (GV->isDeclaration())
415 return;
416
417 llvm::Function *F = dyn_cast<llvm::Function>(GV);
418 if (!F)
419 return;
420
421 const FunctionDecl *FD = dyn_cast_or_null<FunctionDecl>(D);
422 if (FD)
423 setFunctionDeclAttributes(FD, F, M);
424
425 if (!getABIInfo().getCodeGenOpts().EmitIEEENaNCompliantInsts)
426 F->addFnAttr("amdgpu-ieee", "false");
427}
428
429unsigned AMDGPUTargetCodeGenInfo::getDeviceKernelCallingConv() const {
430 return llvm::CallingConv::AMDGPU_KERNEL;
431}
432
433// Currently LLVM assumes null pointers always have value 0,
434// which results in incorrectly transformed IR. Therefore, instead of
435// emitting null pointers in private and local address spaces, a null
436 // pointer in generic address space is emitted which is cast to a
437// pointer in local or private address space.
438llvm::Constant *AMDGPUTargetCodeGenInfo::getNullPointer(
439 const CodeGen::CodeGenModule &CGM, llvm::PointerType *PT,
440 QualType QT) const {
441 if (CGM.getContext().getTargetNullPointerValue(QT) == 0)
442 return llvm::ConstantPointerNull::get(PT);
443
444 auto &Ctx = CGM.getContext();
445 auto NPT = llvm::PointerType::get(
446 PT->getContext(), Ctx.getTargetAddressSpace(LangAS::opencl_generic));
447 return llvm::ConstantExpr::getAddrSpaceCast(
448 llvm::ConstantPointerNull::get(NPT), PT);
449}
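// Illustrative constant (OpenCL source; the variable name is made up): because
// the target null value for private and local pointers is not 0, initializing
//
//   private int *p = NULL;
//
// emits
//
//   addrspacecast (ptr null to ptr addrspace(5))
//
// rather than a literal zero in addrspace(5).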
450
451LangAS
452AMDGPUTargetCodeGenInfo::getGlobalVarAddressSpace(CodeGenModule &CGM,
453 const VarDecl *D) const {
454 assert(!CGM.getLangOpts().OpenCL &&
455 !(CGM.getLangOpts().CUDA && CGM.getLangOpts().CUDAIsDevice) &&
456 "Address space agnostic languages only");
457 LangAS DefaultGlobalAS = getLangASFromTargetAS(
458 CGM.getContext().getTargetAddressSpace(LangAS::opencl_global));
459 if (!D)
460 return DefaultGlobalAS;
461
462 LangAS AddrSpace = D->getType().getAddressSpace();
463 if (AddrSpace != LangAS::Default)
464 return AddrSpace;
465
466 // Only promote to address space 4 if VarDecl has constant initialization.
467 if (D->getType().isConstantStorage(CGM.getContext(), false, false) &&
468 D->hasConstantInitialization()) {
469 if (auto ConstAS = CGM.getTarget().getConstantAddressSpace())
470 return *ConstAS;
471 }
472 return DefaultGlobalAS;
473}
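// Illustrative behaviour for an address-space-agnostic language (for example
// C++ or OpenMP code compiled directly for amdgcn; names are hypothetical):
//
//   int counter;                           // default -> the global address space
//   const float table[4] = {0, 1, 2, 3};   // constant initialization -> may be
//                                          // promoted to the constant AS (4)
//
// A declaration that already carries a language address space keeps it.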
474
475llvm::SyncScope::ID
476AMDGPUTargetCodeGenInfo::getLLVMSyncScopeID(const LangOptions &LangOpts,
477 SyncScope Scope,
478 llvm::AtomicOrdering Ordering,
479 llvm::LLVMContext &Ctx) const {
480 std::string Name;
481 switch (Scope) {
482 case SyncScope::HIPSingleThread:
483 case SyncScope::SingleScope:
484 Name = "singlethread";
485 break;
486 case SyncScope::HIPWavefront:
487 case SyncScope::OpenCLSubGroup:
488 case SyncScope::WavefrontScope:
489 Name = "wavefront";
490 break;
491 case SyncScope::HIPWorkgroup:
492 case SyncScope::OpenCLWorkGroup:
493 case SyncScope::WorkgroupScope:
494 Name = "workgroup";
495 break;
496 case SyncScope::HIPAgent:
497 case SyncScope::OpenCLDevice:
498 case SyncScope::DeviceScope:
499 Name = "agent";
500 break;
501 case SyncScope::SystemScope:
502 case SyncScope::HIPSystem:
503 case SyncScope::OpenCLAllSVMDevices:
504 Name = "";
505 break;
506 }
507
508 // OpenCL assumes by default that atomic scopes are per-address space for
509 // non-sequentially consistent operations.
510 if (Scope >= SyncScope::OpenCLWorkGroup &&
511 Scope <= SyncScope::OpenCLSubGroup &&
512 Ordering != llvm::AtomicOrdering::SequentiallyConsistent) {
513 if (!Name.empty())
514 Name = Twine(Twine(Name) + Twine("-")).str();
515
516 Name = Twine(Twine(Name) + Twine("one-as")).str();
517 }
518
519 return Ctx.getOrInsertSyncScopeID(Name);
520}
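// Example mappings (illustrative): an OpenCL
//
//   atomic_fetch_add_explicit(p, 1, memory_order_relaxed, memory_scope_work_group);
//
// lands in the "workgroup-one-as" sync scope, the same call with
// memory_order_seq_cst uses plain "workgroup", and HIP system-scope atomics map
// to the default ("") scope.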
521
522void AMDGPUTargetCodeGenInfo::setTargetAtomicMetadata(
523 CodeGenFunction &CGF, llvm::Instruction &AtomicInst,
524 const AtomicExpr *AE) const {
525 auto *RMW = dyn_cast<llvm::AtomicRMWInst>(&AtomicInst);
526 auto *CmpX = dyn_cast<llvm::AtomicCmpXchgInst>(&AtomicInst);
527
528 // OpenCL and old style HIP atomics consider atomics targeting thread private
529 // memory to be undefined.
530 //
531 // TODO: This is probably undefined for atomic load/store, but there's not
532 // much direct codegen benefit to knowing this.
533 if (((RMW && RMW->getPointerAddressSpace() == llvm::AMDGPUAS::FLAT_ADDRESS) ||
534 (CmpX &&
535 CmpX->getPointerAddressSpace() == llvm::AMDGPUAS::FLAT_ADDRESS)) &&
536 AE && AE->threadPrivateMemoryAtomicsAreUndefined()) {
537 llvm::MDBuilder MDHelper(CGF.getLLVMContext());
538 llvm::MDNode *ASRange = MDHelper.createRange(
539 llvm::APInt(32, llvm::AMDGPUAS::PRIVATE_ADDRESS),
540 llvm::APInt(32, llvm::AMDGPUAS::PRIVATE_ADDRESS + 1));
541 AtomicInst.setMetadata(llvm::LLVMContext::MD_noalias_addrspace, ASRange);
542 }
543
544 if (!RMW)
545 return;
546
547 AtomicOptions AO = CGF.CGM.getAtomicOpts();
548 llvm::MDNode *Empty = llvm::MDNode::get(CGF.getLLVMContext(), {});
549 if (!AO.getOption(clang::AtomicOptionKind::FineGrainedMemory))
550 RMW->setMetadata("amdgpu.no.fine.grained.memory", Empty);
551 if (!AO.getOption(clang::AtomicOptionKind::RemoteMemory))
552 RMW->setMetadata("amdgpu.no.remote.memory", Empty);
553 if (AO.getOption(clang::AtomicOptionKind::IgnoreDenormalMode) &&
554 RMW->getOperation() == llvm::AtomicRMWInst::FAdd &&
555 RMW->getType()->isFloatTy())
556 RMW->setMetadata("amdgpu.ignore.denormal.mode", Empty);
557}
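// Hedged sketch of the outcome (exact metadata depends on the atomic options
// in effect): a floating-point atomic add through a flat pointer may be emitted
// roughly as
//
//   %old = atomicrmw fadd ptr %p, float %v monotonic, align 4,
//          !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
//
// and, for source-level atomics whose behaviour on thread-private memory is
// undefined, an additional !noalias.addrspace range excluding the private
// address space is attached.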
558
559bool AMDGPUTargetCodeGenInfo::shouldEmitStaticExternCAliases() const {
560 return false;
561}
562
563bool AMDGPUTargetCodeGenInfo::shouldEmitDWARFBitFieldSeparators() const {
564 return true;
565}
566
567void AMDGPUTargetCodeGenInfo::setCUDAKernelCallingConvention(
568 const FunctionType *&FT) const {
569 FT = getABIInfo().getContext().adjustFunctionType(
570 FT, FT->getExtInfo().withCallingConv(CC_DeviceKernel));
571}
572
573/// Return IR struct type for rtinfo struct in rocm-device-libs used for device
574/// enqueue.
575///
576/// ptr addrspace(1) kernel_object, i32 private_segment_size,
577/// i32 group_segment_size
578
579static llvm::StructType *
580getAMDGPURuntimeHandleType(llvm::LLVMContext &C,
581 llvm::Type *KernelDescriptorPtrTy) {
582 llvm::Type *Int32 = llvm::Type::getInt32Ty(C);
583 return llvm::StructType::create(C, {KernelDescriptorPtrTy, Int32, Int32},
584 "block.runtime.handle.t");
585}
586
587/// Create an OpenCL kernel for an enqueued block.
588///
589/// The type of the first argument (the block literal) is the struct type
590/// of the block literal instead of a pointer type. The first argument
591/// (block literal) is passed directly by value to the kernel. The kernel
592/// allocates the same type of struct on stack and stores the block literal
593/// to it and passes its pointer to the block invoke function. The kernel
594/// has "enqueued-block" function attribute and kernel argument metadata.
595llvm::Value *AMDGPUTargetCodeGenInfo::createEnqueuedBlockKernel(
596 CodeGenFunction &CGF, llvm::Function *Invoke, llvm::Type *BlockTy) const {
597 auto &Builder = CGF.Builder;
598 auto &C = CGF.getLLVMContext();
599
600 auto *InvokeFT = Invoke->getFunctionType();
601 llvm::SmallVector<llvm::Type *, 2> ArgTys;
602 llvm::SmallVector<llvm::Metadata *, 8> AddressQuals;
603 llvm::SmallVector<llvm::Metadata *, 8> AccessQuals;
604 llvm::SmallVector<llvm::Metadata *, 8> ArgTypeNames;
605 llvm::SmallVector<llvm::Metadata *, 8> ArgBaseTypeNames;
606 llvm::SmallVector<llvm::Metadata *, 8> ArgTypeQuals;
607 llvm::SmallVector<llvm::Metadata *, 8> ArgNames;
608
609 ArgTys.push_back(BlockTy);
610 ArgTypeNames.push_back(llvm::MDString::get(C, "__block_literal"));
611 AddressQuals.push_back(llvm::ConstantAsMetadata::get(Builder.getInt32(0)));
612 ArgBaseTypeNames.push_back(llvm::MDString::get(C, "__block_literal"));
613 ArgTypeQuals.push_back(llvm::MDString::get(C, ""));
614 AccessQuals.push_back(llvm::MDString::get(C, "none"));
615 ArgNames.push_back(llvm::MDString::get(C, "block_literal"));
616 for (unsigned I = 1, E = InvokeFT->getNumParams(); I < E; ++I) {
617 ArgTys.push_back(InvokeFT->getParamType(I));
618 ArgTypeNames.push_back(llvm::MDString::get(C, "void*"));
619 AddressQuals.push_back(llvm::ConstantAsMetadata::get(Builder.getInt32(3)));
620 AccessQuals.push_back(llvm::MDString::get(C, "none"));
621 ArgBaseTypeNames.push_back(llvm::MDString::get(C, "void*"));
622 ArgTypeQuals.push_back(llvm::MDString::get(C, ""));
623 ArgNames.push_back(
624 llvm::MDString::get(C, (Twine("local_arg") + Twine(I)).str()));
625 }
626
627 llvm::Module &Mod = CGF.CGM.getModule();
628 const llvm::DataLayout &DL = Mod.getDataLayout();
629
630 llvm::Twine Name = Invoke->getName() + "_kernel";
631 auto *FT = llvm::FunctionType::get(llvm::Type::getVoidTy(C), ArgTys, false);
632
633 // The kernel itself can be internal; the runtime does not directly access the
634 // kernel address (only the kernel descriptor).
635 auto *F = llvm::Function::Create(FT, llvm::GlobalValue::InternalLinkage, Name,
636 &Mod);
637 F->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL);
638
639 llvm::AttrBuilder KernelAttrs(C);
640 // FIXME: The invoke isn't applying the right attributes either
641 // FIXME: This is missing setTargetAttributes
642 CGF.CGM.addDefaultFunctionDefinitionAttributes(KernelAttrs);
643 F->addFnAttrs(KernelAttrs);
644
645 auto IP = CGF.Builder.saveIP();
646 auto *BB = llvm::BasicBlock::Create(C, "entry", F);
647 Builder.SetInsertPoint(BB);
648 const auto BlockAlign = DL.getPrefTypeAlign(BlockTy);
649 auto *BlockPtr = Builder.CreateAlloca(BlockTy, nullptr);
650 BlockPtr->setAlignment(BlockAlign);
651 Builder.CreateAlignedStore(F->arg_begin(), BlockPtr, BlockAlign);
652 auto *Cast = Builder.CreatePointerCast(BlockPtr, InvokeFT->getParamType(0));
653 llvm::SmallVector<llvm::Value *, 2> Args;
654 Args.push_back(Cast);
655 for (llvm::Argument &A : llvm::drop_begin(F->args()))
656 Args.push_back(&A);
657 llvm::CallInst *call = Builder.CreateCall(Invoke, Args);
658 call->setCallingConv(Invoke->getCallingConv());
659 Builder.CreateRetVoid();
660 Builder.restoreIP(IP);
661
662 F->setMetadata("kernel_arg_addr_space", llvm::MDNode::get(C, AddressQuals));
663 F->setMetadata("kernel_arg_access_qual", llvm::MDNode::get(C, AccessQuals));
664 F->setMetadata("kernel_arg_type", llvm::MDNode::get(C, ArgTypeNames));
665 F->setMetadata("kernel_arg_base_type",
666 llvm::MDNode::get(C, ArgBaseTypeNames));
667 F->setMetadata("kernel_arg_type_qual", llvm::MDNode::get(C, ArgTypeQuals));
668 if (CGF.CGM.getCodeGenOpts().EmitOpenCLArgMetadata)
669 F->setMetadata("kernel_arg_name", llvm::MDNode::get(C, ArgNames));
670
671 llvm::StructType *HandleTy = getAMDGPURuntimeHandleType(
672 C, llvm::PointerType::get(C, DL.getDefaultGlobalsAddressSpace()));
673 llvm::Constant *RuntimeHandleInitializer =
674 llvm::ConstantAggregateZero::get(HandleTy);
675
676 llvm::Twine RuntimeHandleName = F->getName() + ".runtime.handle";
677
678 // The runtime needs access to the runtime handle as an external symbol. The
679 // runtime handle will need to be made external later, in
680 // AMDGPUExportOpenCLEnqueuedBlocks. The kernel itself has a hidden reference
681 // inside the runtime handle, and is not directly referenced.
682
683 // TODO: We would initialize the first field by declaring F->getName() + ".kd"
684 // to reference the kernel descriptor. The runtime wouldn't need to bother
685 // setting it. We would need to have a final symbol name though.
686 // TODO: Can we directly use an external symbol with getGlobalIdentifier?
687 auto *RuntimeHandle = new llvm::GlobalVariable(
688 Mod, HandleTy,
689 /*isConstant=*/true, llvm::GlobalValue::InternalLinkage,
690 /*Initializer=*/RuntimeHandleInitializer, RuntimeHandleName,
691 /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
692 DL.getDefaultGlobalsAddressSpace(),
693 /*isExternallyInitialized=*/true);
694
695 llvm::MDNode *HandleAsMD =
696 llvm::MDNode::get(C, llvm::ValueAsMetadata::get(RuntimeHandle));
697 F->setMetadata(llvm::LLVMContext::MD_associated, HandleAsMD);
698
699 RuntimeHandle->setSection(".amdgpu.kernel.runtime.handle");
700
701 CGF.CGM.addUsedGlobal(F);
702 CGF.CGM.addUsedGlobal(RuntimeHandle);
703 return RuntimeHandle;
704}
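// Sketch of the emitted artifacts, following the naming scheme in the code
// above (the invoke name is hypothetical): for a block invoke function
// @__foo_block_invoke this creates
//
//   * an internal amdgpu_kernel @__foo_block_invoke_kernel that takes the block
//     literal by value, spills it to an alloca and calls the invoke function;
//   * a global @__foo_block_invoke_kernel.runtime.handle in section
//     ".amdgpu.kernel.runtime.handle" holding the zero-initialised
//     { kernel descriptor pointer, private segment size, group segment size }
//     record that the runtime is expected to fill in.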
705
706void CodeGenModule::handleAMDGPUFlatWorkGroupSizeAttr(
707 llvm::Function *F, const AMDGPUFlatWorkGroupSizeAttr *FlatWGS,
708 const ReqdWorkGroupSizeAttr *ReqdWGS, int32_t *MinThreadsVal,
709 int32_t *MaxThreadsVal) {
710 unsigned Min = 0;
711 unsigned Max = 0;
712 auto Eval = [&](Expr *E) {
713 return E->EvaluateKnownConstInt(getContext()).getExtValue();
714 };
715 if (FlatWGS) {
716 Min = Eval(FlatWGS->getMin());
717 Max = Eval(FlatWGS->getMax());
718 }
719 if (ReqdWGS && Min == 0 && Max == 0)
720 Min = Max = Eval(ReqdWGS->getXDim()) * Eval(ReqdWGS->getYDim()) *
721 Eval(ReqdWGS->getZDim());
722
723 if (Min != 0) {
724 assert(Min <= Max && "Min must be less than or equal Max");
725
726 if (MinThreadsVal)
727 *MinThreadsVal = Min;
728 if (MaxThreadsVal)
729 *MaxThreadsVal = Max;
730 std::string AttrVal = llvm::utostr(Min) + "," + llvm::utostr(Max);
731 if (F)
732 F->addFnAttr("amdgpu-flat-work-group-size", AttrVal);
733 } else
734 assert(Max == 0 && "Max must be zero");
735}
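// Example encodings (illustrative):
//
//   __attribute__((amdgpu_flat_work_group_size(64, 256)))  // -> "amdgpu-flat-work-group-size"="64,256"
//   __attribute__((reqd_work_group_size(8, 8, 4)))          // -> "amdgpu-flat-work-group-size"="256,256"
//
// so a reqd_work_group_size without an explicit flat range pins both bounds to
// the product of its three dimensions.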
736
737void CodeGenModule::handleAMDGPUWavesPerEUAttr(
738 llvm::Function *F, const AMDGPUWavesPerEUAttr *Attr) {
739 unsigned Min =
740 Attr->getMin()->EvaluateKnownConstInt(getContext()).getExtValue();
741 unsigned Max =
742 Attr->getMax()
743 ? Attr->getMax()->EvaluateKnownConstInt(getContext()).getExtValue()
744 : 0;
745
746 if (Min != 0) {
747 assert((Max == 0 || Min <= Max) && "Min must be less than or equal Max");
748
749 std::string AttrVal = llvm::utostr(Min);
750 if (Max != 0)
751 AttrVal = AttrVal + "," + llvm::utostr(Max);
752 F->addFnAttr("amdgpu-waves-per-eu", AttrVal);
753 } else
754 assert(Max == 0 && "Max must be zero");
755}
756
757std::unique_ptr<TargetCodeGenInfo>
758CodeGen::createAMDGPUTargetCodeGenInfo(CodeGenModule &CGM) {
759 return std::make_unique<AMDGPUTargetCodeGenInfo>(CGM.getTypes());
760}