1//===- SIMemoryLegalizer.cpp ----------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Memory legalizer - implements memory model. More information can be
11/// found here:
12/// http://llvm.org/docs/AMDGPUUsage.html#memory-model
13//
14//===----------------------------------------------------------------------===//
15
16#include "AMDGPU.h"
18#include "GCNSubtarget.h"
27#include "llvm/IR/PassManager.h"
30
31using namespace llvm;
32using namespace llvm::AMDGPU;
33
34#define DEBUG_TYPE "si-memory-legalizer"
35#define PASS_NAME "SI Memory Legalizer"
36
38 "amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden,
39 cl::desc("Use this to skip inserting cache invalidating instructions."));
40
41namespace {
42
43LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
44
45/// Memory operation flags. Can be ORed together.
46enum class SIMemOp {
47 NONE = 0u,
48 LOAD = 1u << 0,
49 STORE = 1u << 1,
50 LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
51};
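
// For example, a release that must order both loads and stores passes
// SIMemOp::LOAD | SIMemOp::STORE to insertWait()/insertRelease(), as the
// SIGfx6CacheControl::insertRelease() implementation below does.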
52
53/// Position to insert a new instruction relative to an existing
54/// instruction.
55enum class Position {
56 BEFORE,
57 AFTER
58};
59
60/// The atomic synchronization scopes supported by the AMDGPU target.
61enum class SIAtomicScope {
62 NONE,
63 SINGLETHREAD,
64 WAVEFRONT,
65 WORKGROUP,
66 CLUSTER, // Promoted to AGENT on targets without workgroup clusters.
67 AGENT,
68 SYSTEM
69};
70
71/// The distinct address spaces supported by the AMDGPU target for
72/// atomic memory operation. Can be ORed together.
73enum class SIAtomicAddrSpace {
74 NONE = 0u,
75 GLOBAL = 1u << 0,
76 LDS = 1u << 1,
77 SCRATCH = 1u << 2,
78 GDS = 1u << 3,
79 OTHER = 1u << 4,
80
81 /// The address spaces that can be accessed by a FLAT instruction.
82 FLAT = GLOBAL | LDS | SCRATCH,
83
84 /// The address spaces that support atomic instructions.
85 ATOMIC = GLOBAL | LDS | SCRATCH | GDS,
86
87 /// All address spaces.
88 ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,
89
90 LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
91};
92
93class SIMemOpInfo final {
94private:
95
96 friend class SIMemOpAccess;
97
98 AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
99 AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
100 SIAtomicScope Scope = SIAtomicScope::SYSTEM;
101 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
102 SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
103 bool IsCrossAddressSpaceOrdering = false;
104 bool IsVolatile = false;
105 bool IsNonTemporal = false;
106 bool IsLastUse = false;
107 bool IsCooperative = false;
108
109 SIMemOpInfo(
110 const GCNSubtarget &ST,
111 AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
112 SIAtomicScope Scope = SIAtomicScope::SYSTEM,
113 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
114 SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
115 bool IsCrossAddressSpaceOrdering = true,
116 AtomicOrdering FailureOrdering = AtomicOrdering::SequentiallyConsistent,
117 bool IsVolatile = false, bool IsNonTemporal = false,
118 bool IsLastUse = false, bool IsCooperative = false)
119 : Ordering(Ordering), FailureOrdering(FailureOrdering), Scope(Scope),
120 OrderingAddrSpace(OrderingAddrSpace), InstrAddrSpace(InstrAddrSpace),
121 IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
122 IsVolatile(IsVolatile), IsNonTemporal(IsNonTemporal),
123 IsLastUse(IsLastUse), IsCooperative(IsCooperative) {
124
125 if (Ordering == AtomicOrdering::NotAtomic) {
126 assert(!IsCooperative && "Cannot be cooperative & non-atomic!");
127 assert(Scope == SIAtomicScope::NONE &&
128 OrderingAddrSpace == SIAtomicAddrSpace::NONE &&
129 !IsCrossAddressSpaceOrdering &&
130 FailureOrdering == AtomicOrdering::NotAtomic);
131 return;
132 }
133
134 assert(Scope != SIAtomicScope::NONE &&
135 (OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
136 SIAtomicAddrSpace::NONE &&
137 (InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
138 SIAtomicAddrSpace::NONE);
139
140 // There is also no cross address space ordering if the ordering
141 // address space is the same as the instruction address space and
142 // only contains a single address space.
143 if ((OrderingAddrSpace == InstrAddrSpace) &&
144 isPowerOf2_32(uint32_t(InstrAddrSpace)))
145 this->IsCrossAddressSpaceOrdering = false;
146
147 // Limit the scope to the maximum supported by the instruction's address
148 // spaces.
149 if ((InstrAddrSpace & ~SIAtomicAddrSpace::SCRATCH) ==
150 SIAtomicAddrSpace::NONE) {
151 this->Scope = std::min(Scope, SIAtomicScope::SINGLETHREAD);
152 } else if ((InstrAddrSpace &
153 ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS)) ==
154 SIAtomicAddrSpace::NONE) {
155 this->Scope = std::min(Scope, SIAtomicScope::WORKGROUP);
156 } else if ((InstrAddrSpace &
157 ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS |
158 SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) {
159 this->Scope = std::min(Scope, SIAtomicScope::AGENT);
160 }
161
162 // On targets that have no concept of a workgroup cluster, use
163 // AGENT scope as a conservatively correct alternative.
164 if (this->Scope == SIAtomicScope::CLUSTER && !ST.hasClusters())
165 this->Scope = SIAtomicScope::AGENT;
166 }
167
168public:
169 /// \returns Atomic synchronization scope of the machine instruction used to
170 /// create this SIMemOpInfo.
171 SIAtomicScope getScope() const {
172 return Scope;
173 }
174
175 /// \returns Ordering constraint of the machine instruction used to
176 /// create this SIMemOpInfo.
177 AtomicOrdering getOrdering() const {
178 return Ordering;
179 }
180
181 /// \returns Failure ordering constraint of the machine instruction used to
182 /// create this SIMemOpInfo.
183 AtomicOrdering getFailureOrdering() const {
184 return FailureOrdering;
185 }
186
187 /// \returns The address spaces accessed by the machine
188 /// instruction used to create this SIMemOpInfo.
189 SIAtomicAddrSpace getInstrAddrSpace() const {
190 return InstrAddrSpace;
191 }
192
193 /// \returns The address spaces that must be ordered by the machine
194 /// instruction used to create this SIMemOpInfo.
195 SIAtomicAddrSpace getOrderingAddrSpace() const {
196 return OrderingAddrSpace;
197 }
198
199 /// \returns True iff memory ordering of operations on
200 /// different address spaces is required.
201 bool getIsCrossAddressSpaceOrdering() const {
202 return IsCrossAddressSpaceOrdering;
203 }
204
205 /// \returns True if memory access of the machine instruction used to
206 /// create this SIMemOpInfo is volatile, false otherwise.
207 bool isVolatile() const {
208 return IsVolatile;
209 }
210
211 /// \returns True if memory access of the machine instruction used to
212 /// create this SIMemOpInfo is nontemporal, false otherwise.
213 bool isNonTemporal() const {
214 return IsNonTemporal;
215 }
216
217 /// \returns True if memory access of the machine instruction used to
218 /// create this SIMemOpInfo is last use, false otherwise.
219 bool isLastUse() const { return IsLastUse; }
220
221 /// \returns True if this is a cooperative load or store atomic.
222 bool isCooperative() const { return IsCooperative; }
223
224 /// \returns True if ordering constraint of the machine instruction used to
225 /// create this SIMemOpInfo is unordered or higher, false otherwise.
226 bool isAtomic() const {
227 return Ordering != AtomicOrdering::NotAtomic;
228 }
229
230};
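
// Note: the defaulted SIMemOpInfo(ST) constructor above doubles as the
// conservative fallback that SIMemOpAccess uses for instructions with no
// machine memory operands: sequentially consistent, system scope, ordering
// over all atomic address spaces.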
231
232class SIMemOpAccess final {
233private:
234 const AMDGPUMachineModuleInfo *MMI = nullptr;
235 const GCNSubtarget &ST;
236
237 /// Reports unsupported message \p Msg for \p MI to LLVM context.
238 void reportUnsupported(const MachineBasicBlock::iterator &MI,
239 const char *Msg) const;
240
241 /// Inspects the target synchronization scope \p SSID and determines
242 /// the SI atomic scope it corresponds to, the address spaces it
243 /// covers, and whether the memory ordering applies between address
244 /// spaces.
245 std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
246 toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrAddrSpace) const;
247
248 /// \returns The SI atomic address space(s) corresponding to the target address space \p AS.
249 SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;
250
251 /// \returns Info constructed from \p MI, which has at least one machine memory
252 /// operand.
253 std::optional<SIMemOpInfo>
254 constructFromMIWithMMO(const MachineBasicBlock::iterator &MI) const;
255
256public:
257 /// Construct class to support accessing the machine memory operands
258 /// of instructions in the machine function \p MF.
259 SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI, const GCNSubtarget &ST);
260
261 /// \returns Load info if \p MI is a load operation, "std::nullopt" otherwise.
262 std::optional<SIMemOpInfo>
263 getLoadInfo(const MachineBasicBlock::iterator &MI) const;
264
265 /// \returns Store info if \p MI is a store operation, "std::nullopt"
266 /// otherwise.
267 std::optional<SIMemOpInfo>
268 getStoreInfo(const MachineBasicBlock::iterator &MI) const;
269
270 /// \returns Atomic fence info if \p MI is an atomic fence operation,
271 /// "std::nullopt" otherwise.
272 std::optional<SIMemOpInfo>
273 getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const;
274
275 /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
276 /// rmw operation, "std::nullopt" otherwise.
277 std::optional<SIMemOpInfo>
278 getAtomicCmpxchgOrRmwInfo(const MachineBasicBlock::iterator &MI) const;
279};
280
281class SICacheControl {
282protected:
283
284 /// AMDGPU subtarget info.
285 const GCNSubtarget &ST;
286
287 /// Instruction info.
288 const SIInstrInfo *TII = nullptr;
289
290 IsaVersion IV;
291
292 /// Whether to insert cache invalidating instructions.
293 bool InsertCacheInv;
294
295 SICacheControl(const GCNSubtarget &ST);
296
297 /// Sets named bit \p Bit to "true" if present in instruction \p MI.
298 /// \returns True if \p MI is modified, false otherwise.
299 bool enableNamedBit(const MachineBasicBlock::iterator MI,
300 AMDGPU::CPol::CPol Bit) const;
301
302public:
303
304 /// Create a cache control for the subtarget \p ST.
305 static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);
306
307 /// Update \p MI memory load instruction to bypass any caches up to
308 /// the \p Scope memory scope for address spaces \p
309 /// AddrSpace. Return true iff the instruction was modified.
310 virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
311 SIAtomicScope Scope,
312 SIAtomicAddrSpace AddrSpace) const = 0;
313
314 /// Update \p MI memory store instruction to bypass any caches up to
315 /// the \p Scope memory scope for address spaces \p
316 /// AddrSpace. Return true iff the instruction was modified.
317 virtual bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
318 SIAtomicScope Scope,
319 SIAtomicAddrSpace AddrSpace) const = 0;
320
321 /// Update \p MI memory read-modify-write instruction to bypass any caches up
322 /// to the \p Scope memory scope for address spaces \p AddrSpace. Return true
323 /// iff the instruction was modified.
324 virtual bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
325 SIAtomicScope Scope,
326 SIAtomicAddrSpace AddrSpace) const = 0;
327
328 /// Update \p MI memory instruction of kind \p Op associated with address
329 /// spaces \p AddrSpace to indicate it is volatile and/or
330 /// nontemporal/last-use. Return true iff the instruction was modified.
331 virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
332 SIAtomicAddrSpace AddrSpace,
333 SIMemOp Op, bool IsVolatile,
334 bool IsNonTemporal,
335 bool IsLastUse = false) const = 0;
336
337 virtual bool finalizeStore(MachineInstr &MI, bool Atomic) const {
338 return false;
339 };
340
341 /// Handle cooperative load/store atomics.
342 virtual bool handleCooperativeAtomic(MachineInstr &MI) const {
344 "cooperative atomics are not available on this architecture");
345 }
346
347 /// Inserts any necessary instructions at position \p Pos relative
348 /// to instruction \p MI to ensure memory instructions before \p Pos of kind
349 /// \p Op associated with address spaces \p AddrSpace have completed. Used
350 /// between memory instructions to enforce the order they become visible as
351 /// observed by other memory instructions executing in memory scope \p Scope.
352 /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between
353 /// address spaces. Returns true iff any instructions were inserted.
354 virtual bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
355 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
356 bool IsCrossAddrSpaceOrdering, Position Pos,
357 AtomicOrdering Order) const = 0;
358
359 /// Inserts any necessary instructions at position \p Pos relative to
360 /// instruction \p MI to ensure any subsequent memory instructions of this
361 /// thread with address spaces \p AddrSpace will observe the previous memory
362 /// operations by any thread for memory scopes up to memory scope \p Scope .
363 /// Returns true iff any instructions were inserted.
364 virtual bool insertAcquire(MachineBasicBlock::iterator &MI,
365 SIAtomicScope Scope,
366 SIAtomicAddrSpace AddrSpace,
367 Position Pos) const = 0;
368
369 /// Inserts any necessary instructions at position \p Pos relative to
370 /// instruction \p MI to ensure previous memory instructions by this thread
371 /// with address spaces \p AddrSpace have completed and can be observed by
372 /// subsequent memory instructions by any thread executing in memory scope \p
373 /// Scope. \p IsCrossAddrSpaceOrdering indicates if the memory ordering is
374 /// between address spaces. Returns true iff any instructions were inserted.
375 virtual bool insertRelease(MachineBasicBlock::iterator &MI,
376 SIAtomicScope Scope,
377 SIAtomicAddrSpace AddrSpace,
378 bool IsCrossAddrSpaceOrdering,
379 Position Pos) const = 0;
380
381 /// Inserts any necessary instructions before the barrier start instruction
382 /// \p MI in order to support pairing of barriers and fences.
383 virtual bool insertBarrierStart(MachineBasicBlock::iterator &MI) const {
384 return false;
385 };
386
387 /// Virtual destructor to allow derivations to be deleted.
388 virtual ~SICacheControl() = default;
389};
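
// The concrete SICacheControl implementations below (GFX6/7, GFX90A, GFX940,
// GFX10, GFX11, GFX12) map these abstract scope/address-space requests onto
// each generation's cache-policy bits and invalidate/writeback instructions;
// SICacheControl::create() selects the right one from the subtarget.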
390
391class SIGfx6CacheControl : public SICacheControl {
392protected:
393
394 /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
395 /// is modified, false otherwise.
396 bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
397 return enableNamedBit(MI, AMDGPU::CPol::GLC);
398 }
399
400 /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
401 /// is modified, false otherwise.
402 bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
403 return enableNamedBit(MI, AMDGPU::CPol::SLC);
404 }
405
406public:
407
408 SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}
409
410 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
411 SIAtomicScope Scope,
412 SIAtomicAddrSpace AddrSpace) const override;
413
414 bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
415 SIAtomicScope Scope,
416 SIAtomicAddrSpace AddrSpace) const override;
417
418 bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
419 SIAtomicScope Scope,
420 SIAtomicAddrSpace AddrSpace) const override;
421
422 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
423 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
424 bool IsVolatile, bool IsNonTemporal,
425 bool IsLastUse) const override;
426
427 bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
428 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
429 bool IsCrossAddrSpaceOrdering, Position Pos,
430 AtomicOrdering Order) const override;
431
432 bool insertAcquire(MachineBasicBlock::iterator &MI,
433 SIAtomicScope Scope,
434 SIAtomicAddrSpace AddrSpace,
435 Position Pos) const override;
436
437 bool insertRelease(MachineBasicBlock::iterator &MI,
438 SIAtomicScope Scope,
439 SIAtomicAddrSpace AddrSpace,
440 bool IsCrossAddrSpaceOrdering,
441 Position Pos) const override;
442};
443
444class SIGfx7CacheControl : public SIGfx6CacheControl {
445public:
446
447 SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {}
448
449 bool insertAcquire(MachineBasicBlock::iterator &MI,
450 SIAtomicScope Scope,
451 SIAtomicAddrSpace AddrSpace,
452 Position Pos) const override;
453
454};
455
456class SIGfx90ACacheControl : public SIGfx7CacheControl {
457public:
458
459 SIGfx90ACacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}
460
461 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
462 SIAtomicScope Scope,
463 SIAtomicAddrSpace AddrSpace) const override;
464
465 bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
466 SIAtomicScope Scope,
467 SIAtomicAddrSpace AddrSpace) const override;
468
469 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
470 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
471 bool IsVolatile, bool IsNonTemporal,
472 bool IsLastUse) const override;
473
474 bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
475 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
476 bool IsCrossAddrSpaceOrdering, Position Pos,
477 AtomicOrdering Order) const override;
478
479 bool insertAcquire(MachineBasicBlock::iterator &MI,
480 SIAtomicScope Scope,
481 SIAtomicAddrSpace AddrSpace,
482 Position Pos) const override;
483
484 bool insertRelease(MachineBasicBlock::iterator &MI,
485 SIAtomicScope Scope,
486 SIAtomicAddrSpace AddrSpace,
487 bool IsCrossAddrSpaceOrdering,
488 Position Pos) const override;
489};
490
491class SIGfx940CacheControl : public SIGfx90ACacheControl {
492protected:
493
494 /// Sets SC0 bit to "true" if present in \p MI. Returns true if \p MI
495 /// is modified, false otherwise.
496 bool enableSC0Bit(const MachineBasicBlock::iterator &MI) const {
497 return enableNamedBit(MI, AMDGPU::CPol::SC0);
498 }
499
500 /// Sets SC1 bit to "true" if present in \p MI. Returns true if \p MI
501 /// is modified, false otherwise.
502 bool enableSC1Bit(const MachineBasicBlock::iterator &MI) const {
503 return enableNamedBit(MI, AMDGPU::CPol::SC1);
504 }
505
506 /// Sets NT bit to "true" if present in \p MI. Returns true if \p MI
507 /// is modified, false otherwise.
508 bool enableNTBit(const MachineBasicBlock::iterator &MI) const {
509 return enableNamedBit(MI, AMDGPU::CPol::NT);
510 }
511
512public:
513 SIGfx940CacheControl(const GCNSubtarget &ST) : SIGfx90ACacheControl(ST) {};
514
515 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
516 SIAtomicScope Scope,
517 SIAtomicAddrSpace AddrSpace) const override;
518
519 bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
520 SIAtomicScope Scope,
521 SIAtomicAddrSpace AddrSpace) const override;
522
523 bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
524 SIAtomicScope Scope,
525 SIAtomicAddrSpace AddrSpace) const override;
526
527 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
528 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
529 bool IsVolatile, bool IsNonTemporal,
530 bool IsLastUse) const override;
531
532 bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
533 SIAtomicAddrSpace AddrSpace, Position Pos) const override;
534
535 bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
536 SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
537 Position Pos) const override;
538};
539
540class SIGfx10CacheControl : public SIGfx7CacheControl {
541protected:
542
543 /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI
544 /// is modified, false otherwise.
545 bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
546 return enableNamedBit(MI, AMDGPU::CPol::DLC);
547 }
548
549public:
550
551 SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}
552
553 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
554 SIAtomicScope Scope,
555 SIAtomicAddrSpace AddrSpace) const override;
556
557 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
558 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
559 bool IsVolatile, bool IsNonTemporal,
560 bool IsLastUse) const override;
561
562 bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
563 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
564 bool IsCrossAddrSpaceOrdering, Position Pos,
565 AtomicOrdering Order) const override;
566
567 bool insertAcquire(MachineBasicBlock::iterator &MI,
568 SIAtomicScope Scope,
569 SIAtomicAddrSpace AddrSpace,
570 Position Pos) const override;
571
572 bool insertBarrierStart(MachineBasicBlock::iterator &MI) const override;
573};
574
575class SIGfx11CacheControl : public SIGfx10CacheControl {
576public:
577 SIGfx11CacheControl(const GCNSubtarget &ST) : SIGfx10CacheControl(ST) {}
578
579 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
580 SIAtomicScope Scope,
581 SIAtomicAddrSpace AddrSpace) const override;
582
583 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
584 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
585 bool IsVolatile, bool IsNonTemporal,
586 bool IsLastUse) const override;
587};
588
589class SIGfx12CacheControl : public SIGfx11CacheControl {
590protected:
591 // Sets TH policy to \p Value if CPol operand is present in instruction \p MI.
592 // \returns Returns true if \p MI is modified, false otherwise.
593 bool setTH(const MachineBasicBlock::iterator MI,
594 AMDGPU::CPol::CPol Value) const;
595 // Sets Scope policy to \p Value if CPol operand is present in instruction \p
596 // MI. \returns Returns true if \p MI is modified, false otherwise.
597 bool setScope(const MachineBasicBlock::iterator MI,
598 AMDGPU::CPol::CPol Value) const;
599
600 // Stores with system scope (SCOPE_SYS) need to wait for:
601 // - loads or atomics(returning) - wait for {LOAD|SAMPLE|BVH|KM}CNT==0
602 // - non-returning-atomics - wait for STORECNT==0
603 // TODO: SIInsertWaitcnts will not always be able to remove STORECNT waits
604 // since it does not distinguish atomics-with-return from regular stores.
605 // There is no need to wait if memory is cached (mtype != UC).
606 bool
607 insertWaitsBeforeSystemScopeStore(const MachineBasicBlock::iterator MI) const;
608
609 bool setAtomicScope(const MachineBasicBlock::iterator &MI,
610 SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const;
611
612public:
613 SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) {
614 // GFX12.0 and GFX12.5 memory models greatly overlap, and in some cases
615 // the behavior is the same if assuming GFX12.0 in CU mode.
616 assert(!ST.hasGFX1250Insts() || ST.isCuModeEnabled());
617 }
618
619 bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
620 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
621 bool IsCrossAddrSpaceOrdering, Position Pos,
622 AtomicOrdering Order) const override;
623
624 bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
625 SIAtomicAddrSpace AddrSpace, Position Pos) const override;
626
627 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
628 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
629 bool IsVolatile, bool IsNonTemporal,
630 bool IsLastUse) const override;
631
632 bool finalizeStore(MachineInstr &MI, bool Atomic) const override;
633
634 virtual bool handleCooperativeAtomic(MachineInstr &MI) const override;
635
636 bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
637 SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
638 Position Pos) const override;
639
640 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
641 SIAtomicScope Scope,
642 SIAtomicAddrSpace AddrSpace) const override {
643 return setAtomicScope(MI, Scope, AddrSpace);
644 }
645
646 bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
647 SIAtomicScope Scope,
648 SIAtomicAddrSpace AddrSpace) const override {
649 return setAtomicScope(MI, Scope, AddrSpace);
650 }
651
652 bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
653 SIAtomicScope Scope,
654 SIAtomicAddrSpace AddrSpace) const override {
655 return setAtomicScope(MI, Scope, AddrSpace);
656 }
657};
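
// On GFX12 the per-bit GLC/SLC/DLC cache controls are replaced by the TH
// (temporal hint) and Scope fields of the CPol operand, which is why the
// load/store/RMW cache-bypass hooks above all funnel into setAtomicScope().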
658
659class SIMemoryLegalizer final {
660private:
661 const MachineModuleInfo &MMI;
662 /// Cache Control.
663 std::unique_ptr<SICacheControl> CC = nullptr;
664
665 /// List of atomic pseudo instructions.
666 std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;
667
668 /// Return true iff instruction \p MI is an atomic instruction that
669 /// returns a result.
670 bool isAtomicRet(const MachineInstr &MI) const {
671 return SIInstrInfo::isAtomicRet(MI);
672 }
673
674 /// Removes all processed atomic pseudo instructions from the current
675 /// function. Returns true if current function is modified, false otherwise.
676 bool removeAtomicPseudoMIs();
677
678 /// Expands load operation \p MI. Returns true if instructions are
679 /// added/deleted or \p MI is modified, false otherwise.
680 bool expandLoad(const SIMemOpInfo &MOI,
681 MachineBasicBlock::iterator &MI);
682 /// Expands store operation \p MI. Returns true if instructions are
683 /// added/deleted or \p MI is modified, false otherwise.
684 bool expandStore(const SIMemOpInfo &MOI,
685 MachineBasicBlock::iterator &MI);
686 /// Expands atomic fence operation \p MI. Returns true if
687 /// instructions are added/deleted or \p MI is modified, false otherwise.
688 bool expandAtomicFence(const SIMemOpInfo &MOI,
689 MachineBasicBlock::iterator &MI);
690 /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
691 /// instructions are added/deleted or \p MI is modified, false otherwise.
692 bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
693 MachineBasicBlock::iterator &MI);
694
695public:
696 SIMemoryLegalizer(const MachineModuleInfo &MMI) : MMI(MMI) {};
697 bool run(MachineFunction &MF);
698};
699
700class SIMemoryLegalizerLegacy final : public MachineFunctionPass {
701public:
702 static char ID;
703
704 SIMemoryLegalizerLegacy() : MachineFunctionPass(ID) {}
705
706 void getAnalysisUsage(AnalysisUsage &AU) const override {
707 AU.setPreservesCFG();
708 MachineFunctionPass::getAnalysisUsage(AU);
709 }
710
711 StringRef getPassName() const override {
712 return PASS_NAME;
713 }
714
715 bool runOnMachineFunction(MachineFunction &MF) override;
716};
717
718static const StringMap<SIAtomicAddrSpace> ASNames = {{
719 {"global", SIAtomicAddrSpace::GLOBAL},
720 {"local", SIAtomicAddrSpace::LDS},
721}};
722
723void diagnoseUnknownMMRAASName(const MachineInstr &MI, StringRef AS) {
724 const MachineFunction *MF = MI.getMF();
725 const Function &Fn = MF->getFunction();
726 SmallString<128> Str;
727 raw_svector_ostream OS(Str);
728 OS << "unknown address space '" << AS << "'; expected one of ";
729 ListSeparator LS;
730 for (const auto &[Name, Val] : ASNames)
731 OS << LS << '\'' << Name << '\'';
732 Fn.getContext().diagnose(
733 DiagnosticInfoUnsupported(Fn, Str.str(), MI.getDebugLoc(), DS_Warning));
734}
735
736/// Reads \p MI's MMRAs to parse the "amdgpu-synchronize-as" MMRA.
737/// If this tag isn't present, or if it has no meaningful values, returns
738 /// std::nullopt; otherwise returns the address spaces specified by the metadata.
739static std::optional<SIAtomicAddrSpace>
740getSynchronizeAddrSpaceMD(const MachineInstr &MI) {
741 static constexpr StringLiteral FenceASPrefix = "amdgpu-synchronize-as";
742
743 auto MMRA = MMRAMetadata(MI.getMMRAMetadata());
744 if (!MMRA)
745 return std::nullopt;
746
747 SIAtomicAddrSpace Result = SIAtomicAddrSpace::NONE;
748 for (const auto &[Prefix, Suffix] : MMRA) {
749 if (Prefix != FenceASPrefix)
750 continue;
751
752 if (auto It = ASNames.find(Suffix); It != ASNames.end())
753 Result |= It->second;
754 else
755 diagnoseUnknownMMRAASName(MI, Suffix);
756 }
757
758 if (Result == SIAtomicAddrSpace::NONE)
759 return std::nullopt;
760
761 return Result;
762}
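
// For example, an IR fence carrying the MMRA tag ("amdgpu-synchronize-as",
// "local") is restricted to ordering the LDS address space, while "global"
// selects the global address space; any other suffix is diagnosed by
// diagnoseUnknownMMRAASName() above.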
763
764} // end anonymous namespace
765
766void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
767 const char *Msg) const {
768 const Function &Func = MI->getParent()->getParent()->getFunction();
769 Func.getContext().diagnose(
770 DiagnosticInfoUnsupported(Func, Msg, MI->getDebugLoc()));
771}
772
773std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
774SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
775 SIAtomicAddrSpace InstrAddrSpace) const {
776 if (SSID == SyncScope::System)
777 return std::tuple(SIAtomicScope::SYSTEM, SIAtomicAddrSpace::ATOMIC, true);
778 if (SSID == MMI->getAgentSSID())
779 return std::tuple(SIAtomicScope::AGENT, SIAtomicAddrSpace::ATOMIC, true);
780 if (SSID == MMI->getClusterSSID())
781 return std::tuple(SIAtomicScope::CLUSTER, SIAtomicAddrSpace::ATOMIC, true);
782 if (SSID == MMI->getWorkgroupSSID())
783 return std::tuple(SIAtomicScope::WORKGROUP, SIAtomicAddrSpace::ATOMIC,
784 true);
785 if (SSID == MMI->getWavefrontSSID())
786 return std::tuple(SIAtomicScope::WAVEFRONT, SIAtomicAddrSpace::ATOMIC,
787 true);
788 if (SSID == SyncScope::SingleThread)
789 return std::tuple(SIAtomicScope::SINGLETHREAD, SIAtomicAddrSpace::ATOMIC,
790 true);
791 if (SSID == MMI->getSystemOneAddressSpaceSSID())
792 return std::tuple(SIAtomicScope::SYSTEM,
793 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
794 if (SSID == MMI->getAgentOneAddressSpaceSSID())
795 return std::tuple(SIAtomicScope::AGENT,
796 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
797 if (SSID == MMI->getClusterOneAddressSpaceSSID())
798 return std::tuple(SIAtomicScope::CLUSTER,
799 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
800 if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
801 return std::tuple(SIAtomicScope::WORKGROUP,
802 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
803 if (SSID == MMI->getWavefrontOneAddressSpaceSSID())
804 return std::tuple(SIAtomicScope::WAVEFRONT,
805 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
806 if (SSID == MMI->getSingleThreadOneAddressSpaceSSID())
807 return std::tuple(SIAtomicScope::SINGLETHREAD,
808 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
809 return std::nullopt;
810}
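
// For example, syncscope("agent-one-as") maps to AGENT scope restricted to the
// address spaces the instruction itself accesses, with cross-address-space
// ordering disabled, whereas plain syncscope("agent") orders all atomic
// address spaces against one another.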
811
812SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
813 if (AS == AMDGPUAS::FLAT_ADDRESS)
814 return SIAtomicAddrSpace::FLAT;
815 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
816 return SIAtomicAddrSpace::GLOBAL;
817 if (AS == AMDGPUAS::LOCAL_ADDRESS)
818 return SIAtomicAddrSpace::LDS;
819 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
820 return SIAtomicAddrSpace::SCRATCH;
821 if (AS == AMDGPUAS::REGION_ADDRESS)
822 return SIAtomicAddrSpace::GDS;
823
824 return SIAtomicAddrSpace::OTHER;
825}
826
827SIMemOpAccess::SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI_,
828 const GCNSubtarget &ST)
829 : MMI(&MMI_), ST(ST) {}
830
831std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
832 const MachineBasicBlock::iterator &MI) const {
833 assert(MI->getNumMemOperands() > 0);
834
835 SyncScope::ID SSID = SyncScope::SingleThread;
836 AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
837 AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
838 SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
839 bool IsNonTemporal = true;
840 bool IsVolatile = false;
841 bool IsLastUse = false;
842 bool IsCooperative = false;
843
844 // Validator should check whether or not MMOs cover the entire set of
845 // locations accessed by the memory instruction.
846 for (const auto &MMO : MI->memoperands()) {
847 IsNonTemporal &= MMO->isNonTemporal();
848 IsVolatile |= MMO->isVolatile();
849 IsLastUse |= MMO->getFlags() & MOLastUse;
850 IsCooperative |= MMO->getFlags() & MOCooperative;
851 InstrAddrSpace |=
852 toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
853 AtomicOrdering OpOrdering = MMO->getSuccessOrdering();
854 if (OpOrdering != AtomicOrdering::NotAtomic) {
855 const auto &IsSyncScopeInclusion =
856 MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
857 if (!IsSyncScopeInclusion) {
858 reportUnsupported(MI,
859 "Unsupported non-inclusive atomic synchronization scope");
860 return std::nullopt;
861 }
862
863 SSID = *IsSyncScopeInclusion ? SSID : MMO->getSyncScopeID();
864 Ordering = getMergedAtomicOrdering(Ordering, OpOrdering);
865 assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
866 MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
867 FailureOrdering =
868 getMergedAtomicOrdering(FailureOrdering, MMO->getFailureOrdering());
869 }
870 }
871
872 SIAtomicScope Scope = SIAtomicScope::NONE;
873 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
874 bool IsCrossAddressSpaceOrdering = false;
875 if (Ordering != AtomicOrdering::NotAtomic) {
876 auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
877 if (!ScopeOrNone) {
878 reportUnsupported(MI, "Unsupported atomic synchronization scope");
879 return std::nullopt;
880 }
881 std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
882 *ScopeOrNone;
883 if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
884 ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace) ||
885 ((InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) == SIAtomicAddrSpace::NONE)) {
886 reportUnsupported(MI, "Unsupported atomic address space");
887 return std::nullopt;
888 }
889 }
890 return SIMemOpInfo(ST, Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
891 IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
892 IsNonTemporal, IsLastUse, IsCooperative);
893}
894
895std::optional<SIMemOpInfo>
896SIMemOpAccess::getLoadInfo(const MachineBasicBlock::iterator &MI) const {
897 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
898
899 if (!(MI->mayLoad() && !MI->mayStore()))
900 return std::nullopt;
901
902 // Be conservative if there are no memory operands.
903 if (MI->getNumMemOperands() == 0)
904 return SIMemOpInfo(ST);
905
906 return constructFromMIWithMMO(MI);
907}
908
909std::optional<SIMemOpInfo>
910SIMemOpAccess::getStoreInfo(const MachineBasicBlock::iterator &MI) const {
911 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
912
913 if (!(!MI->mayLoad() && MI->mayStore()))
914 return std::nullopt;
915
916 // Be conservative if there are no memory operands.
917 if (MI->getNumMemOperands() == 0)
918 return SIMemOpInfo(ST);
919
920 return constructFromMIWithMMO(MI);
921}
922
923std::optional<SIMemOpInfo>
924SIMemOpAccess::getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const {
925 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
926
927 if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
928 return std::nullopt;
929
930 AtomicOrdering Ordering =
931 static_cast<AtomicOrdering>(MI->getOperand(0).getImm());
932
933 SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
934 auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
935 if (!ScopeOrNone) {
936 reportUnsupported(MI, "Unsupported atomic synchronization scope");
937 return std::nullopt;
938 }
939
940 SIAtomicScope Scope = SIAtomicScope::NONE;
941 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
942 bool IsCrossAddressSpaceOrdering = false;
943 std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
944 *ScopeOrNone;
945
946 if (OrderingAddrSpace != SIAtomicAddrSpace::ATOMIC) {
947 // We currently expect refineOrderingAS to be the only place that
948 // can refine the AS ordered by the fence.
949 // If that changes, we need to review the semantics of that function
950 // in case it needs to preserve certain address spaces.
951 reportUnsupported(MI, "Unsupported atomic address space");
952 return std::nullopt;
953 }
954
955 auto SynchronizeAS = getSynchronizeAddrSpaceMD(*MI);
956 if (SynchronizeAS)
957 OrderingAddrSpace = *SynchronizeAS;
958
959 return SIMemOpInfo(ST, Ordering, Scope, OrderingAddrSpace,
960 SIAtomicAddrSpace::ATOMIC, IsCrossAddressSpaceOrdering,
961 AtomicOrdering::NotAtomic);
962}
963
964std::optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
965 const MachineBasicBlock::iterator &MI) const {
966 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
967
968 if (!(MI->mayLoad() && MI->mayStore()))
969 return std::nullopt;
970
971 // Be conservative if there are no memory operands.
972 if (MI->getNumMemOperands() == 0)
973 return SIMemOpInfo(ST);
974
975 return constructFromMIWithMMO(MI);
976}
977
978SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
979 TII = ST.getInstrInfo();
980 IV = getIsaVersion(ST.getCPU());
981 InsertCacheInv = !AmdgcnSkipCacheInvalidations;
982}
983
984bool SICacheControl::enableNamedBit(const MachineBasicBlock::iterator MI,
985 AMDGPU::CPol::CPol Bit) const {
986 MachineOperand *CPol = TII->getNamedOperand(*MI, AMDGPU::OpName::cpol);
987 if (!CPol)
988 return false;
989
990 CPol->setImm(CPol->getImm() | Bit);
991 return true;
992}
993
994/* static */
995std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
996 GCNSubtarget::Generation Generation = ST.getGeneration();
997 if (ST.hasGFX940Insts())
998 return std::make_unique<SIGfx940CacheControl>(ST);
999 if (ST.hasGFX90AInsts())
1000 return std::make_unique<SIGfx90ACacheControl>(ST);
1001 if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
1002 return std::make_unique<SIGfx6CacheControl>(ST);
1003 if (Generation < AMDGPUSubtarget::GFX10)
1004 return std::make_unique<SIGfx7CacheControl>(ST);
1005 if (Generation < AMDGPUSubtarget::GFX11)
1006 return std::make_unique<SIGfx10CacheControl>(ST);
1007 if (Generation < AMDGPUSubtarget::GFX12)
1008 return std::make_unique<SIGfx11CacheControl>(ST);
1009 return std::make_unique<SIGfx12CacheControl>(ST);
1010}
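
// Note: the hasGFX940Insts()/hasGFX90AInsts() checks come before the
// generation comparisons because both targets report the GFX9 generation and
// would otherwise fall into the GFX7 bucket.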
1011
1012bool SIGfx6CacheControl::enableLoadCacheBypass(
1013 const MachineBasicBlock::iterator &MI,
1014 SIAtomicScope Scope,
1015 SIAtomicAddrSpace AddrSpace) const {
1016 assert(MI->mayLoad() && !MI->mayStore());
1017 bool Changed = false;
1018
1019 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1020 switch (Scope) {
1021 case SIAtomicScope::SYSTEM:
1022 case SIAtomicScope::AGENT:
1023 // Set L1 cache policy to MISS_EVICT.
1024 // Note: there is no L2 cache bypass policy at the ISA level.
1025 Changed |= enableGLCBit(MI);
1026 break;
1027 case SIAtomicScope::WORKGROUP:
1028 case SIAtomicScope::WAVEFRONT:
1029 case SIAtomicScope::SINGLETHREAD:
1030 // No cache to bypass.
1031 break;
1032 default:
1033 llvm_unreachable("Unsupported synchronization scope");
1034 }
1035 }
1036
1037 /// The scratch address space does not need the global memory caches
1038 /// to be bypassed as all memory operations by the same thread are
1039 /// sequentially consistent, and no other thread can access scratch
1040 /// memory.
1041
1042 /// Other address spaces do not have a cache.
1043
1044 return Changed;
1045}
1046
1047bool SIGfx6CacheControl::enableStoreCacheBypass(
1048 const MachineBasicBlock::iterator &MI,
1049 SIAtomicScope Scope,
1050 SIAtomicAddrSpace AddrSpace) const {
1051 assert(!MI->mayLoad() && MI->mayStore());
1052 bool Changed = false;
1053
1054 /// The L1 cache is write through so does not need to be bypassed. There is no
1055 /// bypass control for the L2 cache at the isa level.
1056
1057 return Changed;
1058}
1059
1060bool SIGfx6CacheControl::enableRMWCacheBypass(
1061 const MachineBasicBlock::iterator &MI,
1062 SIAtomicScope Scope,
1063 SIAtomicAddrSpace AddrSpace) const {
1064 assert(MI->mayLoad() && MI->mayStore());
1065 bool Changed = false;
1066
1067 /// Do not set GLC for RMW atomic operations as L0/L1 cache is automatically
1068 /// bypassed, and the GLC bit is instead used to indicate if they are
1069 /// return or no-return.
1070 /// Note: there is no L2 cache coherent bypass control at the ISA level.
1071
1072 return Changed;
1073}
1074
1075bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
1076 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1077 bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
1078 // Only handle load and store, not atomic read-modify-write instructions. The
1079 // latter use glc to indicate if the atomic returns a result and so must not
1080 // be used for cache control.
1081 assert(MI->mayLoad() ^ MI->mayStore());
1082
1083 // Only update load and store, not LLVM IR atomic read-modify-write
1084 // instructions. The latter are always marked as volatile, which cannot be
1085 // handled sensibly without pessimizing all atomics, and they do not support
1086 // the nontemporal attribute.
1087 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1088
1089 bool Changed = false;
1090
1091 if (IsVolatile) {
1092 // Set L1 cache policy to be MISS_EVICT for load instructions
1093 // and MISS_LRU for store instructions.
1094 // Note: there is no L2 cache bypass policy at the ISA level.
1095 if (Op == SIMemOp::LOAD)
1096 Changed |= enableGLCBit(MI);
1097
1098 // Ensure operation has completed at system scope to cause all volatile
1099 // operations to be visible outside the program in a global order. Do not
1100 // request cross address space as only the global address space can be
1101 // observable outside the program, so no need to cause a waitcnt for LDS
1102 // address space operations.
1103 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1104 Position::AFTER, AtomicOrdering::Unordered);
1105
1106 return Changed;
1107 }
1108
1109 if (IsNonTemporal) {
1110 // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
1111 // for both loads and stores, and the L2 cache policy to STREAM.
1112 Changed |= enableGLCBit(MI);
1113 Changed |= enableSLCBit(MI);
1114 return Changed;
1115 }
1116
1117 return Changed;
1118}
1119
1120bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
1121 SIAtomicScope Scope,
1122 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1123 bool IsCrossAddrSpaceOrdering, Position Pos,
1124 AtomicOrdering Order) const {
1125 bool Changed = false;
1126
1127 MachineBasicBlock &MBB = *MI->getParent();
1128 DebugLoc DL = MI->getDebugLoc();
1129
1130 if (Pos == Position::AFTER)
1131 ++MI;
1132
1133 bool VMCnt = false;
1134 bool LGKMCnt = false;
1135
1136 if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
1137 SIAtomicAddrSpace::NONE) {
1138 switch (Scope) {
1139 case SIAtomicScope::SYSTEM:
1140 case SIAtomicScope::AGENT:
1141 VMCnt |= true;
1142 break;
1143 case SIAtomicScope::WORKGROUP:
1144 case SIAtomicScope::WAVEFRONT:
1145 case SIAtomicScope::SINGLETHREAD:
1146 // The L1 cache keeps all memory operations in order for
1147 // wavefronts in the same work-group.
1148 break;
1149 default:
1150 llvm_unreachable("Unsupported synchronization scope");
1151 }
1152 }
1153
1154 if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
1155 switch (Scope) {
1156 case SIAtomicScope::SYSTEM:
1157 case SIAtomicScope::AGENT:
1158 case SIAtomicScope::WORKGROUP:
1159 // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
1160 // not needed as LDS operations for all waves are executed in a total
1161 // global ordering as observed by all waves. Required if also
1162 // synchronizing with global/GDS memory as LDS operations could be
1163 // reordered with respect to later global/GDS memory operations of the
1164 // same wave.
1165 LGKMCnt |= IsCrossAddrSpaceOrdering;
1166 break;
1167 case SIAtomicScope::WAVEFRONT:
1168 case SIAtomicScope::SINGLETHREAD:
1169 // The LDS keeps all memory operations in order for
1170 // the same wavefront.
1171 break;
1172 default:
1173 llvm_unreachable("Unsupported synchronization scope");
1174 }
1175 }
1176
1177 if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
1178 switch (Scope) {
1179 case SIAtomicScope::SYSTEM:
1180 case SIAtomicScope::AGENT:
1181 // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
1182 // is not needed as GDS operations for all waves are executed in a total
1183 // global ordering as observed by all waves. Required if also
1184 // synchronizing with global/LDS memory as GDS operations could be
1185 // reordered with respect to later global/LDS memory operations of the
1186 // same wave.
1187 LGKMCnt |= IsCrossAddrSpaceOrdering;
1188 break;
1189 case SIAtomicScope::WORKGROUP:
1190 case SIAtomicScope::WAVEFRONT:
1191 case SIAtomicScope::SINGLETHREAD:
1192 // The GDS keeps all memory operations in order for
1193 // the same work-group.
1194 break;
1195 default:
1196 llvm_unreachable("Unsupported synchronization scope");
1197 }
1198 }
1199
1200 if (VMCnt || LGKMCnt) {
1201 unsigned WaitCntImmediate =
1202 AMDGPU::encodeWaitcnt(IV,
1203 VMCnt ? 0 : getVmcntBitMask(IV),
1204 getExpcntBitMask(IV),
1205 LGKMCnt ? 0 : getLgkmcntBitMask(IV));
1206 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
1207 .addImm(WaitCntImmediate);
1208 Changed = true;
1209 }
1210
1211 // On architectures that support direct loads to LDS, emit an unknown waitcnt
1212 // at workgroup-scoped release operations that specify the LDS address space.
1213 // SIInsertWaitcnts will later replace this with a vmcnt().
1214 if (ST.hasVMemToLDSLoad() && isReleaseOrStronger(Order) &&
1215 Scope == SIAtomicScope::WORKGROUP &&
1216 (AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
1217 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_lds_direct));
1218 Changed = true;
1219 }
1220
1221 if (Pos == Position::AFTER)
1222 --MI;
1223
1224 return Changed;
1225}
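
// For example, an agent-scope release that orders global and LDS accesses with
// cross-address-space ordering becomes a single "s_waitcnt vmcnt(0)
// lgkmcnt(0)", emitted as S_WAITCNT_soft so SIInsertWaitcnts can later merge or
// relax it.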
1226
1227bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1228 SIAtomicScope Scope,
1229 SIAtomicAddrSpace AddrSpace,
1230 Position Pos) const {
1231 if (!InsertCacheInv)
1232 return false;
1233
1234 bool Changed = false;
1235
1236 MachineBasicBlock &MBB = *MI->getParent();
1237 DebugLoc DL = MI->getDebugLoc();
1238
1239 if (Pos == Position::AFTER)
1240 ++MI;
1241
1242 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1243 switch (Scope) {
1244 case SIAtomicScope::SYSTEM:
1245 case SIAtomicScope::AGENT:
1246 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
1247 Changed = true;
1248 break;
1249 case SIAtomicScope::WORKGROUP:
1250 case SIAtomicScope::WAVEFRONT:
1251 case SIAtomicScope::SINGLETHREAD:
1252 // No cache to invalidate.
1253 break;
1254 default:
1255 llvm_unreachable("Unsupported synchronization scope");
1256 }
1257 }
1258
1259 /// The scratch address space does not need the global memory cache
1260 /// to be flushed as all memory operations by the same thread are
1261 /// sequentially consistent, and no other thread can access scratch
1262 /// memory.
1263
1264 /// Other address spaces do not have a cache.
1265
1266 if (Pos == Position::AFTER)
1267 --MI;
1268
1269 return Changed;
1270}
1271
1272bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1273 SIAtomicScope Scope,
1274 SIAtomicAddrSpace AddrSpace,
1275 bool IsCrossAddrSpaceOrdering,
1276 Position Pos) const {
1277 return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
1278 IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release);
1279}
1280
1281bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1282 SIAtomicScope Scope,
1283 SIAtomicAddrSpace AddrSpace,
1284 Position Pos) const {
1285 if (!InsertCacheInv)
1286 return false;
1287
1288 bool Changed = false;
1289
1290 MachineBasicBlock &MBB = *MI->getParent();
1291 DebugLoc DL = MI->getDebugLoc();
1292
1293 const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>();
1294
1295 const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS()
1296 ? AMDGPU::BUFFER_WBINVL1
1297 : AMDGPU::BUFFER_WBINVL1_VOL;
1298
1299 if (Pos == Position::AFTER)
1300 ++MI;
1301
1302 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1303 switch (Scope) {
1304 case SIAtomicScope::SYSTEM:
1305 case SIAtomicScope::AGENT:
1306 BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
1307 Changed = true;
1308 break;
1309 case SIAtomicScope::WORKGROUP:
1310 case SIAtomicScope::WAVEFRONT:
1311 case SIAtomicScope::SINGLETHREAD:
1312 // No cache to invalidate.
1313 break;
1314 default:
1315 llvm_unreachable("Unsupported synchronization scope");
1316 }
1317 }
1318
1319 /// The scratch address space does not need the global memory cache
1320 /// to be flushed as all memory operations by the same thread are
1321 /// sequentially consistent, and no other thread can access scratch
1322 /// memory.
1323
1324 /// Other address spaces do not have a cache.
1325
1326 if (Pos == Position::AFTER)
1327 --MI;
1328
1329 return Changed;
1330}
1331
1332bool SIGfx90ACacheControl::enableLoadCacheBypass(
1333 const MachineBasicBlock::iterator &MI,
1334 SIAtomicScope Scope,
1335 SIAtomicAddrSpace AddrSpace) const {
1336 assert(MI->mayLoad() && !MI->mayStore());
1337 bool Changed = false;
1338
1339 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1340 switch (Scope) {
1341 case SIAtomicScope::SYSTEM:
1342 case SIAtomicScope::AGENT:
1343 // Set the L1 cache policy to MISS_LRU.
1344 // Note: there is no L2 cache bypass policy at the ISA level.
1345 Changed |= enableGLCBit(MI);
1346 break;
1347 case SIAtomicScope::WORKGROUP:
1348 // In threadgroup split mode the waves of a work-group can be executing on
1349 // different CUs. Therefore need to bypass the L1 which is per CU.
1350 // Otherwise in non-threadgroup split mode all waves of a work-group are
1351 // on the same CU, and so the L1 does not need to be bypassed.
1352 if (ST.isTgSplitEnabled())
1353 Changed |= enableGLCBit(MI);
1354 break;
1355 case SIAtomicScope::WAVEFRONT:
1356 case SIAtomicScope::SINGLETHREAD:
1357 // No cache to bypass.
1358 break;
1359 default:
1360 llvm_unreachable("Unsupported synchronization scope");
1361 }
1362 }
1363
1364 /// The scratch address space does not need the global memory caches
1365 /// to be bypassed as all memory operations by the same thread are
1366 /// sequentially consistent, and no other thread can access scratch
1367 /// memory.
1368
1369 /// Other address spaces do not have a cache.
1370
1371 return Changed;
1372}
1373
1374bool SIGfx90ACacheControl::enableRMWCacheBypass(
1375 const MachineBasicBlock::iterator &MI,
1376 SIAtomicScope Scope,
1377 SIAtomicAddrSpace AddrSpace) const {
1378 assert(MI->mayLoad() && MI->mayStore());
1379 bool Changed = false;
1380
1381 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1382 switch (Scope) {
1383 case SIAtomicScope::SYSTEM:
1384 case SIAtomicScope::AGENT:
1385 /// Do not set glc for RMW atomic operations as they implicitly bypass
1386 /// the L1 cache, and the glc bit is instead used to indicate if they are
1387 /// return or no-return.
1388 break;
1389 case SIAtomicScope::WORKGROUP:
1390 case SIAtomicScope::WAVEFRONT:
1391 case SIAtomicScope::SINGLETHREAD:
1392 // No cache to bypass. RMW atomics implicitly bypass the L1 cache.
1393 break;
1394 default:
1395 llvm_unreachable("Unsupported synchronization scope");
1396 }
1397 }
1398
1399 return Changed;
1400}
1401
1402bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal(
1403 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1404 bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
1405 // Only handle load and store, not atomic read-modify-write instructions. The
1406 // latter use glc to indicate if the atomic returns a result and so must not
1407 // be used for cache control.
1408 assert(MI->mayLoad() ^ MI->mayStore());
1409
1410 // Only update load and store, not LLVM IR atomic read-modify-write
1411 // instructions. The latter are always marked as volatile, which cannot be
1412 // handled sensibly without pessimizing all atomics, and they do not support
1413 // the nontemporal attribute.
1414 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1415
1416 bool Changed = false;
1417
1418 if (IsVolatile) {
1419 // Set L1 cache policy to be MISS_EVICT for load instructions
1420 // and MISS_LRU for store instructions.
1421 // Note: there is no L2 cache bypass policy at the ISA level.
1422 if (Op == SIMemOp::LOAD)
1423 Changed |= enableGLCBit(MI);
1424
1425 // Ensure operation has completed at system scope to cause all volatile
1426 // operations to be visible outside the program in a global order. Do not
1427 // request cross address space as only the global address space can be
1428 // observable outside the program, so no need to cause a waitcnt for LDS
1429 // address space operations.
1430 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1431 Position::AFTER, AtomicOrdering::Unordered);
1432
1433 return Changed;
1434 }
1435
1436 if (IsNonTemporal) {
1437 // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
1438 // for both loads and stores, and the L2 cache policy to STREAM.
1439 Changed |= enableGLCBit(MI);
1440 Changed |= enableSLCBit(MI);
1441 return Changed;
1442 }
1443
1444 return Changed;
1445}
1446
1447bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI,
1448 SIAtomicScope Scope,
1449 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1450 bool IsCrossAddrSpaceOrdering,
1451 Position Pos,
1452 AtomicOrdering Order) const {
1453 if (ST.isTgSplitEnabled()) {
1454 // In threadgroup split mode the waves of a work-group can be executing on
1455 // different CUs. Therefore need to wait for global or GDS memory operations
1456 // to complete to ensure they are visible to waves in the other CUs.
1457 // Otherwise in non-threadgroup split mode all waves of a work-group are on
1458 // the same CU, so no need to wait for global memory as all waves in the
1459 // work-group access the same L1, nor wait for GDS as accesses are ordered
1460 // on a CU.
1461 if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH |
1462 SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) &&
1463 (Scope == SIAtomicScope::WORKGROUP)) {
1464 // Same as GFX7 using agent scope.
1465 Scope = SIAtomicScope::AGENT;
1466 }
1467 // In threadgroup split mode LDS cannot be allocated so no need to wait for
1468 // LDS memory operations.
1469 AddrSpace &= ~SIAtomicAddrSpace::LDS;
1470 }
1471 return SIGfx7CacheControl::insertWait(MI, Scope, AddrSpace, Op,
1472 IsCrossAddrSpaceOrdering, Pos, Order);
1473}
1474
1475bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1476 SIAtomicScope Scope,
1477 SIAtomicAddrSpace AddrSpace,
1478 Position Pos) const {
1479 if (!InsertCacheInv)
1480 return false;
1481
1482 bool Changed = false;
1483
1484 MachineBasicBlock &MBB = *MI->getParent();
1485 DebugLoc DL = MI->getDebugLoc();
1486
1487 if (Pos == Position::AFTER)
1488 ++MI;
1489
1490 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1491 switch (Scope) {
1492 case SIAtomicScope::SYSTEM:
1493 // Ensures that following loads will not see stale remote VMEM data or
1494 // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
1495 // CC will never be stale due to the local memory probes.
1496 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2));
1497 // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
1498 // hardware does not reorder memory operations by the same wave with
1499 // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed to
1500 // remove any cache lines of earlier writes by the same wave and ensures
1501 // later reads by the same wave will refetch the cache lines.
1502 Changed = true;
1503 break;
1504 case SIAtomicScope::AGENT:
1505 // Same as GFX7.
1506 break;
1507 case SIAtomicScope::WORKGROUP:
1508 // In threadgroup split mode the waves of a work-group can be executing on
1509 // different CUs. Therefore need to invalidate the L1 which is per CU.
1510 // Otherwise in non-threadgroup split mode all waves of a work-group are
1511 // on the same CU, and so the L1 does not need to be invalidated.
1512 if (ST.isTgSplitEnabled()) {
1513 // Same as GFX7 using agent scope.
1514 Scope = SIAtomicScope::AGENT;
1515 }
1516 break;
1517 case SIAtomicScope::WAVEFRONT:
1518 case SIAtomicScope::SINGLETHREAD:
1519 // Same as GFX7.
1520 break;
1521 default:
1522 llvm_unreachable("Unsupported synchronization scope");
1523 }
1524 }
1525
1526 /// The scratch address space does not need the global memory cache
1527 /// to be flushed as all memory operations by the same thread are
1528 /// sequentially consistent, and no other thread can access scratch
1529 /// memory.
1530
1531 /// Other address spaces do not have a cache.
1532
1533 if (Pos == Position::AFTER)
1534 --MI;
1535
1536 Changed |= SIGfx7CacheControl::insertAcquire(MI, Scope, AddrSpace, Pos);
1537
1538 return Changed;
1539}
1540
1541bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1542 SIAtomicScope Scope,
1543 SIAtomicAddrSpace AddrSpace,
1544 bool IsCrossAddrSpaceOrdering,
1545 Position Pos) const {
1546 bool Changed = false;
1547
1548 MachineBasicBlock &MBB = *MI->getParent();
1549 const DebugLoc &DL = MI->getDebugLoc();
1550
1551 if (Pos == Position::AFTER)
1552 ++MI;
1553
1554 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1555 switch (Scope) {
1556 case SIAtomicScope::SYSTEM:
1557 // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
1558 // hardware does not reorder memory operations by the same wave with
1559 // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
1560 // to initiate writeback of any dirty cache lines of earlier writes by the
1561 // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
1562 // writeback has completed.
1563 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1564 // Set SC bits to indicate system scope.
1565 .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
1566 // Followed by same as GFX7, which will ensure the necessary "S_WAITCNT
1567 // vmcnt(0)" needed by the "BUFFER_WBL2".
1568 Changed = true;
1569 break;
1570 case SIAtomicScope::AGENT:
1571 case SIAtomicScope::WORKGROUP:
1572 case SIAtomicScope::WAVEFRONT:
1573 case SIAtomicScope::SINGLETHREAD:
1574 // Same as GFX7.
1575 break;
1576 default:
1577 llvm_unreachable("Unsupported synchronization scope");
1578 }
1579 }
1580
1581 if (Pos == Position::AFTER)
1582 --MI;
1583
1584 Changed |=
1585 SIGfx7CacheControl::insertRelease(MI, Scope, AddrSpace,
1586 IsCrossAddrSpaceOrdering, Pos);
1587
1588 return Changed;
1589}
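
// Illustrative sketch (not part of the original source): for a system-scope
// release on GFX90A, the code above together with the inherited GFX7 path is
// expected to produce roughly the following sequence before the releasing
// operation (exact operands depend on the subtarget):
//
//   BUFFER_WBL2          ; initiate writeback of dirty L2 lines
//   S_WAITCNT vmcnt(0)   ; added by the GFX7 insertRelease/insertWait path
//
// This is only meant to show how the writeback and the wait pair up.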
1590
1591bool SIGfx940CacheControl::enableLoadCacheBypass(
1592 const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
1593 SIAtomicAddrSpace AddrSpace) const {
1594 assert(MI->mayLoad() && !MI->mayStore());
1595 bool Changed = false;
1596
1597 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1598 switch (Scope) {
1599 case SIAtomicScope::SYSTEM:
1600 // Set SC bits to indicate system scope.
1601 Changed |= enableSC0Bit(MI);
1602 Changed |= enableSC1Bit(MI);
1603 break;
1604 case SIAtomicScope::AGENT:
1605 // Set SC bits to indicate agent scope.
1606 Changed |= enableSC1Bit(MI);
1607 break;
1608 case SIAtomicScope::WORKGROUP:
1609 // In threadgroup split mode the waves of a work-group can be executing on
1610 // different CUs. Therefore need to bypass the L1 which is per CU.
1611 // Otherwise in non-threadgroup split mode all waves of a work-group are
1612 // on the same CU, and so the L1 does not need to be bypassed. Setting SC
1613 // bits to indicate work-group scope will do this automatically.
1614 Changed |= enableSC0Bit(MI);
1615 break;
1616 case SIAtomicScope::WAVEFRONT:
1617 case SIAtomicScope::SINGLETHREAD:
1618 // Leave SC bits unset to indicate wavefront scope.
1619 break;
1620 default:
1621 llvm_unreachable("Unsupported synchronization scope");
1622 }
1623 }
1624
1625 /// The scratch address space does not need the global memory caches
1626 /// to be bypassed as all memory operations by the same thread are
1627 /// sequentially consistent, and no other thread can access scratch
1628 /// memory.
1629
1630 /// Other address spaces do not have a cache.
1631
1632 return Changed;
1633}
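
// Hypothetical helper (illustrative only, not used by this pass): the SC-bit
// selection performed by enableLoadCacheBypass above is a pure mapping from
// synchronization scope to CPol bits and could be summarized as:
//
//   static unsigned scBitsForLoadScope(SIAtomicScope Scope) {
//     switch (Scope) {
//     case SIAtomicScope::SYSTEM:    return AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1;
//     case SIAtomicScope::AGENT:     return AMDGPU::CPol::SC1;
//     case SIAtomicScope::WORKGROUP: return AMDGPU::CPol::SC0;
//     default:                       return 0; // wavefront / single thread
//     }
//   }
//
// The real code toggles the bits in place via enableSC0Bit/enableSC1Bit so that
// the returned Changed flag reflects whether the instruction was actually
// modified.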
1634
1635 bool SIGfx940CacheControl::enableStoreCacheBypass(
1636 const MachineBasicBlock::iterator &MI,
1637 SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const {
1638 assert(!MI->mayLoad() && MI->mayStore());
1639 bool Changed = false;
1640
1641 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1642 switch (Scope) {
1643 case SIAtomicScope::SYSTEM:
1644 // Set SC bits to indicate system scope.
1645 Changed |= enableSC0Bit(MI);
1646 Changed |= enableSC1Bit(MI);
1647 break;
1648 case SIAtomicScope::AGENT:
1649 // Set SC bits to indicate agent scope.
1650 Changed |= enableSC1Bit(MI);
1651 break;
1652 case SIAtomicScope::WORKGROUP:
1653 // Set SC bits to indicate workgroup scope.
1654 Changed |= enableSC0Bit(MI);
1655 break;
1656 case SIAtomicScope::WAVEFRONT:
1657 case SIAtomicScope::SINGLETHREAD:
1658 // Leave SC bits unset to indicate wavefront scope.
1659 break;
1660 default:
1661 llvm_unreachable("Unsupported synchronization scope");
1662 }
1663 }
1664
1665 /// The scratch address space does not need the global memory caches
1666 /// to be bypassed as all memory operations by the same thread are
1667 /// sequentially consistent, and no other thread can access scratch
1668 /// memory.
1669
1670 /// Other address spaces do not have a cache.
1671
1672 return Changed;
1673}
1674
1675bool SIGfx940CacheControl::enableRMWCacheBypass(
1676 const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
1677 SIAtomicAddrSpace AddrSpace) const {
1678 assert(MI->mayLoad() && MI->mayStore());
1679 bool Changed = false;
1680
1681 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1682 switch (Scope) {
1683 case SIAtomicScope::SYSTEM:
1684 // Set SC1 bit to indicate system scope.
1685 Changed |= enableSC1Bit(MI);
1686 break;
1687 case SIAtomicScope::AGENT:
1688 case SIAtomicScope::WORKGROUP:
1689 case SIAtomicScope::WAVEFRONT:
1690 case SIAtomicScope::SINGLETHREAD:
1691 // RMW atomic operations implicitly bypass the L1 cache and only use SC1
1692 // to indicate system or agent scope. The SC0 bit is used to indicate if
1693 // they are return or no-return. Leave SC1 bit unset to indicate agent
1694 // scope.
1695 break;
1696 default:
1697 llvm_unreachable("Unsupported synchronization scope");
1698 }
1699 }
1700
1701 return Changed;
1702}
1703
1704bool SIGfx940CacheControl::enableVolatileAndOrNonTemporal(
1705 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1706 bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
1707 // Only handle load and store, not atomic read-modify-write instructions. The
1708 // latter use glc to indicate if the atomic returns a result and so must not
1709 // be used for cache control.
1710 assert(MI->mayLoad() ^ MI->mayStore());
1711
1712 // Only update load and store, not LLVM IR atomic read-modify-write
1713 // instructions. The latter are always marked as volatile, so we cannot sensibly
1714 // handle them, as we do not want to pessimize all atomics. Also they do not support
1715 // the nontemporal attribute.
1716 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1717
1718 bool Changed = false;
1719
1720 if (IsVolatile) {
1721 // Set SC bits to indicate system scope.
1722 Changed |= enableSC0Bit(MI);
1723 Changed |= enableSC1Bit(MI);
1724
1725 // Ensure operation has completed at system scope to cause all volatile
1726 // operations to be visible outside the program in a global order. Do not
1727 // request cross address space as only the global address space can be
1728 // observable outside the program, so no need to cause a waitcnt for LDS
1729 // address space operations.
1730 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1731 Position::AFTER, AtomicOrdering::Unordered);
1732
1733 return Changed;
1734 }
1735
1736 if (IsNonTemporal) {
1737 Changed |= enableNTBit(MI);
1738 return Changed;
1739 }
1740
1741 return Changed;
1742}
1743
1744bool SIGfx940CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1745 SIAtomicScope Scope,
1746 SIAtomicAddrSpace AddrSpace,
1747 Position Pos) const {
1748 if (!InsertCacheInv)
1749 return false;
1750
1751 bool Changed = false;
1752
1753 MachineBasicBlock &MBB = *MI->getParent();
1754 DebugLoc DL = MI->getDebugLoc();
1755
1756 if (Pos == Position::AFTER)
1757 ++MI;
1758
1759 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1760 switch (Scope) {
1761 case SIAtomicScope::SYSTEM:
1762 // Ensures that following loads will not see stale remote VMEM data or
1763 // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
1764 // CC will never be stale due to the local memory probes.
1765 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1766 // Set SC bits to indicate system scope.
1767 .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
1768 // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
1769 // hardware does not reorder memory operations by the same wave with
1770 // respect to a preceding "BUFFER_INV". The invalidate is guaranteed to
1771 // remove any cache lines of earlier writes by the same wave and ensures
1772 // later reads by the same wave will refetch the cache lines.
1773 Changed = true;
1774 break;
1775 case SIAtomicScope::AGENT:
1776 // Ensures that following loads will not see stale remote data or local
1777 // MTYPE NC global data. Local MTYPE RW and CC memory will never be stale
1778 // due to the memory probes.
1779 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1780 // Set SC bits to indicate agent scope.
1781 .addImm(AMDGPU::CPol::SC1);
1782 // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
1783 // does not reorder memory operations with respect to the preceding buffer
1784 // invalidate. The invalidate is guaranteed to remove any cache lines of
1785 // earlier writes and ensures later reads will refetch the cache lines.
1786 Changed = true;
1787 break;
1788 case SIAtomicScope::WORKGROUP:
1789 // In threadgroup split mode the waves of a work-group can be executing on
1790 // different CUs. Therefore need to invalidate the L1 which is per CU.
1791 // Otherwise in non-threadgroup split mode all waves of a work-group are
1792 // on the same CU, and so the L1 does not need to be invalidated.
1793 if (ST.isTgSplitEnabled()) {
1794 // Ensures L1 is invalidated if in threadgroup split mode. In
1795 // non-threadgroup split mode it is a NOP, but there is no point generating
1796 // it in that case if we know we are not in that mode.
1797 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1798 // Set SC bits to indicate work-group scope.
1799 .addImm(AMDGPU::CPol::SC0);
1800 // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
1801 // does not reorder memory operations with respect to the preceding buffer
1802 // invalidate. The invalidate is guaranteed to remove any cache lines of
1803 // earlier writes and ensures later reads will refetch the cache lines.
1804 Changed = true;
1805 }
1806 break;
1807 case SIAtomicScope::WAVEFRONT:
1808 case SIAtomicScope::SINGLETHREAD:
1809 // Could generate "BUFFER_INV" but it would do nothing as there are no
1810 // caches to invalidate.
1811 break;
1812 default:
1813 llvm_unreachable("Unsupported synchronization scope");
1814 }
1815 }
1816
1817 /// The scratch address space does not need the global memory cache
1818 /// to be flushed as all memory operations by the same thread are
1819 /// sequentially consistent, and no other thread can access scratch
1820 /// memory.
1821
1822 /// Other address spaces do not have a cache.
1823
1824 if (Pos == Position::AFTER)
1825 --MI;
1826
1827 return Changed;
1828}
1829
1830bool SIGfx940CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1831 SIAtomicScope Scope,
1832 SIAtomicAddrSpace AddrSpace,
1833 bool IsCrossAddrSpaceOrdering,
1834 Position Pos) const {
1835 bool Changed = false;
1836
1837 MachineBasicBlock &MBB = *MI->getParent();
1838 DebugLoc DL = MI->getDebugLoc();
1839
1840 if (Pos == Position::AFTER)
1841 ++MI;
1842
1843 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1844 switch (Scope) {
1845 case SIAtomicScope::SYSTEM:
1846 // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
1847 // hardware does not reorder memory operations by the same wave with
1848 // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
1849 // to initiate writeback of any dirty cache lines of earlier writes by the
1850 // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
1851 // writeback has completed.
1852 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1853 // Set SC bits to indicate system scope.
1854 .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
1855 // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
1856 // SIAtomicScope::SYSTEM, the following insertWait will generate the
1857 // required "S_WAITCNT vmcnt(0)" needed by the "BUFFER_WBL2".
1858 Changed = true;
1859 break;
1860 case SIAtomicScope::AGENT:
1861 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1862 // Set SC bits to indicate agent scope.
1863 .addImm(AMDGPU::CPol::SC1);
1864
1865 // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
1866 // SIAtomicScope::AGENT, the following insertWait will generate the
1867 // required "S_WAITCNT vmcnt(0)".
1868 Changed = true;
1869 break;
1870 case SIAtomicScope::WORKGROUP:
1871 case SIAtomicScope::WAVEFRONT:
1872 case SIAtomicScope::SINGLETHREAD:
1873 // Do not generate "BUFFER_WBL2" as there are no caches it would
1874 // writeback, and would require an otherwise unnecessary
1875 // "S_WAITCNT vmcnt(0)".
1876 break;
1877 default:
1878 llvm_unreachable("Unsupported synchronization scope");
1879 }
1880 }
1881
1882 if (Pos == Position::AFTER)
1883 --MI;
1884
1885 // Ensure the necessary S_WAITCNT needed by any "BUFFER_WBL2" as well as other
1886 // S_WAITCNT needed.
1887 Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
1888 IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release);
1889
1890 return Changed;
1891}
1892
1893 bool SIGfx10CacheControl::enableLoadCacheBypass(
1894 const MachineBasicBlock::iterator &MI,
1895 SIAtomicScope Scope,
1896 SIAtomicAddrSpace AddrSpace) const {
1897 assert(MI->mayLoad() && !MI->mayStore());
1898 bool Changed = false;
1899
1900 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1901 switch (Scope) {
1902 case SIAtomicScope::SYSTEM:
1903 case SIAtomicScope::AGENT:
1904 // Set the L0 and L1 cache policies to MISS_EVICT.
1905 // Note: there is no L2 cache coherent bypass control at the ISA level.
1906 Changed |= enableGLCBit(MI);
1907 Changed |= enableDLCBit(MI);
1908 break;
1909 case SIAtomicScope::WORKGROUP:
1910 // In WGP mode the waves of a work-group can be executing on either CU of
1911 // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
1912 // CU mode all waves of a work-group are on the same CU, and so the L0
1913 // does not need to be bypassed.
1914 if (!ST.isCuModeEnabled())
1915 Changed |= enableGLCBit(MI);
1916 break;
1917 case SIAtomicScope::WAVEFRONT:
1918 case SIAtomicScope::SINGLETHREAD:
1919 // No cache to bypass.
1920 break;
1921 default:
1922 llvm_unreachable("Unsupported synchronization scope");
1923 }
1924 }
1925
1926 /// The scratch address space does not need the global memory caches
1927 /// to be bypassed as all memory operations by the same thread are
1928 /// sequentially consistent, and no other thread can access scratch
1929 /// memory.
1930
1931 /// Other address spaces do not have a cache.
1932
1933 return Changed;
1934}
1935
1936bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
1937 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1938 bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
1939
1940 // Only handle load and store, not atomic read-modify-write instructions. The
1941 // latter use glc to indicate if the atomic returns a result and so must not
1942 // be used for cache control.
1943 assert(MI->mayLoad() ^ MI->mayStore());
1944
1945 // Only update load and store, not LLVM IR atomic read-modify-write
1946 // instructions. The latter are always marked as volatile, so we cannot sensibly
1947 // handle them, as we do not want to pessimize all atomics. Also they do not support
1948 // the nontemporal attribute.
1949 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1950
1951 bool Changed = false;
1952
1953 if (IsVolatile) {
1954 // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
1955 // and MISS_LRU for store instructions.
1956 // Note: there is no L2 cache coherent bypass control at the ISA level.
1957 if (Op == SIMemOp::LOAD) {
1958 Changed |= enableGLCBit(MI);
1959 Changed |= enableDLCBit(MI);
1960 }
1961
1962 // Ensure operation has completed at system scope to cause all volatile
1963 // operations to be visible outside the program in a global order. Do not
1964 // request cross address space as only the global address space can be
1965 // observable outside the program, so no need to cause a waitcnt for LDS
1966 // address space operations.
1967 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1968 Position::AFTER, AtomicOrdering::Unordered);
1969 return Changed;
1970 }
1971
1972 if (IsNonTemporal) {
1973 // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
1974 // and L2 cache policy to STREAM.
1975 // For stores setting both GLC and SLC configures L0 and L1 cache policy
1976 // to MISS_EVICT and the L2 cache policy to STREAM.
1977 if (Op == SIMemOp::STORE)
1978 Changed |= enableGLCBit(MI);
1979 Changed |= enableSLCBit(MI);
1980
1981 return Changed;
1982 }
1983
1984 return Changed;
1985}
1986
1987bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
1988 SIAtomicScope Scope,
1989 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1990 bool IsCrossAddrSpaceOrdering,
1991 Position Pos, AtomicOrdering Order) const {
1992 bool Changed = false;
1993
1994 MachineBasicBlock &MBB = *MI->getParent();
1995 DebugLoc DL = MI->getDebugLoc();
1996
1997 if (Pos == Position::AFTER)
1998 ++MI;
1999
2000 bool VMCnt = false;
2001 bool VSCnt = false;
2002 bool LGKMCnt = false;
2003
2004 if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
2005 SIAtomicAddrSpace::NONE) {
2006 switch (Scope) {
2007 case SIAtomicScope::SYSTEM:
2008 case SIAtomicScope::AGENT:
2009 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
2010 VMCnt |= true;
2011 if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
2012 VSCnt |= true;
2013 break;
2014 case SIAtomicScope::WORKGROUP:
2015 // In WGP mode the waves of a work-group can be executing on either CU of
2016 // the WGP. Therefore need to wait for operations to complete to ensure
2017 // they are visible to waves in the other CU as the L0 is per CU.
2018 // Otherwise, in CU mode, all waves of a work-group are on the same CU
2019 // which shares the same L0.
2020 if (!ST.isCuModeEnabled()) {
2021 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
2022 VMCnt |= true;
2023 if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
2024 VSCnt |= true;
2025 }
2026 break;
2027 case SIAtomicScope::WAVEFRONT:
2028 case SIAtomicScope::SINGLETHREAD:
2029 // The L0 cache keeps all memory operations in order for
2030 // work-items in the same wavefront.
2031 break;
2032 default:
2033 llvm_unreachable("Unsupported synchronization scope");
2034 }
2035 }
2036
2037 if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
2038 switch (Scope) {
2039 case SIAtomicScope::SYSTEM:
2040 case SIAtomicScope::AGENT:
2041 case SIAtomicScope::WORKGROUP:
2042 // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
2043 // not needed as LDS operations for all waves are executed in a total
2044 // global ordering as observed by all waves. Required if also
2045 // synchronizing with global/GDS memory as LDS operations could be
2046 // reordered with respect to later global/GDS memory operations of the
2047 // same wave.
2048 LGKMCnt |= IsCrossAddrSpaceOrdering;
2049 break;
2050 case SIAtomicScope::WAVEFRONT:
2051 case SIAtomicScope::SINGLETHREAD:
2052 // The LDS keeps all memory operations in order for
2053 // the same wavefront.
2054 break;
2055 default:
2056 llvm_unreachable("Unsupported synchronization scope");
2057 }
2058 }
2059
2060 if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
2061 switch (Scope) {
2062 case SIAtomicScope::SYSTEM:
2063 case SIAtomicScope::AGENT:
2064 // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
2065 // is not needed as GDS operations for all waves are executed in a total
2066 // global ordering as observed by all waves. Required if also
2067 // synchronizing with global/LDS memory as GDS operations could be
2068 // reordered with respect to later global/LDS memory operations of the
2069 // same wave.
2070 LGKMCnt |= IsCrossAddrSpaceOrdering;
2071 break;
2072 case SIAtomicScope::WORKGROUP:
2073 case SIAtomicScope::WAVEFRONT:
2074 case SIAtomicScope::SINGLETHREAD:
2075 // The GDS keeps all memory operations in order for
2076 // the same work-group.
2077 break;
2078 default:
2079 llvm_unreachable("Unsupported synchronization scope");
2080 }
2081 }
2082
2083 if (VMCnt || LGKMCnt) {
2084 unsigned WaitCntImmediate =
2085 AMDGPU::encodeWaitcnt(IV,
2086 VMCnt ? 0 : getVmcntBitMask(IV),
2087 getExpcntBitMask(IV),
2088 LGKMCnt ? 0 : getLgkmcntBitMask(IV));
2089 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
2090 .addImm(WaitCntImmediate);
2091 Changed = true;
2092 }
2093
2094 // On architectures that support direct loads to LDS, emit an unknown waitcnt
2095 // at workgroup-scoped release operations that specify the LDS address space.
2096 // SIInsertWaitcnts will later replace this with a vmcnt().
2097 if (ST.hasVMemToLDSLoad() && isReleaseOrStronger(Order) &&
2098 Scope == SIAtomicScope::WORKGROUP &&
2099 (AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
2100 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_lds_direct));
2101 Changed = true;
2102 }
2103
2104 if (VSCnt) {
2105 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT_soft))
2106 .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
2107 .addImm(0);
2108 Changed = true;
2109 }
2110
2111 if (Pos == Position::AFTER)
2112 --MI;
2113
2114 return Changed;
2115}
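
// Worked example (illustrative, not part of the original source): assuming the
// IsaVersion-based helpers used above, an agent-scope acquire that only needs
// to order loads (VMCnt == true, LGKMCnt == false) builds its immediate as
//
//   unsigned Imm = AMDGPU::encodeWaitcnt(IV,
//                                        /*Vmcnt=*/0,
//                                        getExpcntBitMask(IV),
//                                        getLgkmcntBitMask(IV));
//
// i.e. vmcnt is forced to zero while expcnt and lgkmcnt keep their all-ones
// "do not wait" masks, and the result becomes the operand of the
// S_WAITCNT_soft built above.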
2116
2117bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
2118 SIAtomicScope Scope,
2119 SIAtomicAddrSpace AddrSpace,
2120 Position Pos) const {
2121 if (!InsertCacheInv)
2122 return false;
2123
2124 bool Changed = false;
2125
2126 MachineBasicBlock &MBB = *MI->getParent();
2127 DebugLoc DL = MI->getDebugLoc();
2128
2129 if (Pos == Position::AFTER)
2130 ++MI;
2131
2132 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
2133 switch (Scope) {
2134 case SIAtomicScope::SYSTEM:
2135 case SIAtomicScope::AGENT:
2136 // The order of invalidates matters here. We must invalidate "outer in"
2137 // so L1 -> L0 to avoid L0 pulling in stale data from L1 when it is
2138 // invalidated.
2139 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV));
2140 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
2141 Changed = true;
2142 break;
2143 case SIAtomicScope::WORKGROUP:
2144 // In WGP mode the waves of a work-group can be executing on either CU of
2145 // the WGP. Therefore need to invalidate the L0 which is per CU. Otherwise
2146 // in CU mode all waves of a work-group are on the same CU, and so the
2147 // L0 does not need to be invalidated.
2148 if (!ST.isCuModeEnabled()) {
2149 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
2150 Changed = true;
2151 }
2152 break;
2153 case SIAtomicScope::WAVEFRONT:
2154 case SIAtomicScope::SINGLETHREAD:
2155 // No cache to invalidate.
2156 break;
2157 default:
2158 llvm_unreachable("Unsupported synchronization scope");
2159 }
2160 }
2161
2162 /// The scratch address space does not need the global memory cache
2163 /// to be flushed as all memory operations by the same thread are
2164 /// sequentially consistent, and no other thread can access scratch
2165 /// memory.
2166
2167 /// Other address spaces do not have a cache.
2168
2169 if (Pos == Position::AFTER)
2170 --MI;
2171
2172 return Changed;
2173}
2174
2175 bool SIGfx10CacheControl::insertBarrierStart(
2176 MachineBasicBlock::iterator &MI) const {
2177 // We need to wait on vm_vsrc so barriers can pair with fences in GFX10+ CU
2178 // mode. This is because a CU mode release fence does not emit any wait, which
2179 // is fine when only dealing with vmem, but isn't sufficient in the presence
2180 // of barriers which do not go through vmem.
2181 // GFX12.5 does not require this additional wait.
2182 if (!ST.isCuModeEnabled() || ST.hasGFX1250Insts())
2183 return false;
2184
2185 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
2186 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
2187 .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
2188 return true;
2189}
2190
2191bool SIGfx11CacheControl::enableLoadCacheBypass(
2192 const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
2193 SIAtomicAddrSpace AddrSpace) const {
2194 assert(MI->mayLoad() && !MI->mayStore());
2195 bool Changed = false;
2196
2197 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
2198 switch (Scope) {
2199 case SIAtomicScope::SYSTEM:
2200 case SIAtomicScope::AGENT:
2201 // Set the L0 and L1 cache policies to MISS_EVICT.
2202 // Note: there is no L2 cache coherent bypass control at the ISA level.
2203 Changed |= enableGLCBit(MI);
2204 break;
2205 case SIAtomicScope::WORKGROUP:
2206 // In WGP mode the waves of a work-group can be executing on either CU of
2207 // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
2208 // CU mode all waves of a work-group are on the same CU, and so the L0
2209 // does not need to be bypassed.
2210 if (!ST.isCuModeEnabled())
2211 Changed |= enableGLCBit(MI);
2212 break;
2213 case SIAtomicScope::WAVEFRONT:
2214 case SIAtomicScope::SINGLETHREAD:
2215 // No cache to bypass.
2216 break;
2217 default:
2218 llvm_unreachable("Unsupported synchronization scope");
2219 }
2220 }
2221
2222 /// The scratch address space does not need the global memory caches
2223 /// to be bypassed as all memory operations by the same thread are
2224 /// sequentially consistent, and no other thread can access scratch
2225 /// memory.
2226
2227 /// Other address spaces do not have a cache.
2228
2229 return Changed;
2230}
2231
2232bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal(
2233 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
2234 bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
2235
2236 // Only handle load and store, not atomic read-modify-write instructions. The
2237 // latter use glc to indicate if the atomic returns a result and so must not
2238 // be used for cache control.
2239 assert(MI->mayLoad() ^ MI->mayStore());
2240
2241 // Only update load and store, not LLVM IR atomic read-modify-write
2242 // instructions. The latter are always marked as volatile, so we cannot sensibly
2243 // handle them, as we do not want to pessimize all atomics. Also they do not support
2244 // the nontemporal attribute.
2245 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
2246
2247 bool Changed = false;
2248
2249 if (IsVolatile) {
2250 // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
2251 // and MISS_LRU for store instructions.
2252 // Note: there is no L2 cache coherent bypass control at the ISA level.
2253 if (Op == SIMemOp::LOAD)
2254 Changed |= enableGLCBit(MI);
2255
2256 // Set MALL NOALLOC for load and store instructions.
2257 Changed |= enableDLCBit(MI);
2258
2259 // Ensure operation has completed at system scope to cause all volatile
2260 // operations to be visible outside the program in a global order. Do not
2261 // request cross address space as only the global address space can be
2262 // observable outside the program, so no need to cause a waitcnt for LDS
2263 // address space operations.
2264 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
2265 Position::AFTER, AtomicOrdering::Unordered);
2266 return Changed;
2267 }
2268
2269 if (IsNonTemporal) {
2270 // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
2271 // and L2 cache policy to STREAM.
2272 // For stores setting both GLC and SLC configures L0 and L1 cache policy
2273 // to MISS_EVICT and the L2 cache policy to STREAM.
2274 if (Op == SIMemOp::STORE)
2275 Changed |= enableGLCBit(MI);
2276 Changed |= enableSLCBit(MI);
2277
2278 // Set MALL NOALLOC for load and store instructions.
2279 Changed |= enableDLCBit(MI);
2280 return Changed;
2281 }
2282
2283 return Changed;
2284}
2285
2286bool SIGfx12CacheControl::setTH(const MachineBasicBlock::iterator MI,
2287 AMDGPU::CPol::CPol Value) const {
2288 MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
2289 if (!CPol)
2290 return false;
2291
2292 uint64_t NewTH = Value & AMDGPU::CPol::TH;
2293 if ((CPol->getImm() & AMDGPU::CPol::TH) != NewTH) {
2294 CPol->setImm((CPol->getImm() & ~AMDGPU::CPol::TH) | NewTH);
2295 return true;
2296 }
2297
2298 return false;
2299}
2300
2301bool SIGfx12CacheControl::setScope(const MachineBasicBlock::iterator MI,
2302 AMDGPU::CPol::CPol Value) const {
2303 MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
2304 if (!CPol)
2305 return false;
2306
2307 uint64_t NewScope = Value & AMDGPU::CPol::SCOPE;
2308 if ((CPol->getImm() & AMDGPU::CPol::SCOPE) != NewScope) {
2309 CPol->setImm((CPol->getImm() & ~AMDGPU::CPol::SCOPE) | NewScope);
2310 return true;
2311 }
2312
2313 return false;
2314}
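
// Worked example (illustrative only): setTH and setScope each rewrite one field
// of the cpol immediate while preserving the other. For instance, if a GFX12
// load currently carries cpol = TH_NT | SCOPE_SE and volatile handling calls
// setScope(MI, AMDGPU::CPol::SCOPE_SYS), the update above computes
//
//   NewImm = (OldImm & ~AMDGPU::CPol::SCOPE) | AMDGPU::CPol::SCOPE_SYS;
//
// leaving the temporal hint (TH field) untouched and changing only the SCOPE
// field. The early return keeps the Changed result accurate when the field
// already holds the requested value.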
2315
2316bool SIGfx12CacheControl::insertWaitsBeforeSystemScopeStore(
2317 const MachineBasicBlock::iterator MI) const {
2318 // TODO: implement flag for frontend to give us a hint not to insert waits.
2319
2320 MachineBasicBlock &MBB = *MI->getParent();
2321 const DebugLoc &DL = MI->getDebugLoc();
2322
2323 BuildMI(MBB, MI, DL, TII->get(S_WAIT_LOADCNT_soft)).addImm(0);
2324 if (ST.hasImageInsts()) {
2325 BuildMI(MBB, MI, DL, TII->get(S_WAIT_SAMPLECNT_soft)).addImm(0);
2326 BuildMI(MBB, MI, DL, TII->get(S_WAIT_BVHCNT_soft)).addImm(0);
2327 }
2328 BuildMI(MBB, MI, DL, TII->get(S_WAIT_KMCNT_soft)).addImm(0);
2329 BuildMI(MBB, MI, DL, TII->get(S_WAIT_STORECNT_soft)).addImm(0);
2330
2331 return true;
2332}
2333
2334bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
2335 SIAtomicScope Scope,
2336 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
2337 bool IsCrossAddrSpaceOrdering,
2338 Position Pos, AtomicOrdering Order) const {
2339 bool Changed = false;
2340
2341 MachineBasicBlock &MBB = *MI->getParent();
2342 DebugLoc DL = MI->getDebugLoc();
2343
2344 bool LOADCnt = false;
2345 bool DSCnt = false;
2346 bool STORECnt = false;
2347
2348 if (Pos == Position::AFTER)
2349 ++MI;
2350
2351 if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
2352 SIAtomicAddrSpace::NONE) {
2353 switch (Scope) {
2354 case SIAtomicScope::SYSTEM:
2355 case SIAtomicScope::AGENT:
2356 case SIAtomicScope::CLUSTER:
2357 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
2358 LOADCnt |= true;
2359 if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
2360 STORECnt |= true;
2361 break;
2362 case SIAtomicScope::WORKGROUP:
2363 // GFX12.0:
2364 // In WGP mode the waves of a work-group can be executing on either CU
2365 // of the WGP. Therefore need to wait for operations to complete to
2366 // ensure they are visible to waves in the other CU as the L0 is per CU.
2367 // Otherwise, in CU mode, all waves of a work-group are on the same CU
2368 // which shares the same L0.
2369 //
2370 // GFX12.5:
2371 // TODO DOCS
2372 if (!ST.isCuModeEnabled() || ST.hasGFX1250Insts()) {
2373 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
2374 LOADCnt |= true;
2375 if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
2376 STORECnt |= true;
2377 }
2378 break;
2379 case SIAtomicScope::WAVEFRONT:
2380 case SIAtomicScope::SINGLETHREAD:
2381 // The L0 cache keeps all memory operations in order for
2382 // work-items in the same wavefront.
2383 break;
2384 default:
2385 llvm_unreachable("Unsupported synchronization scope");
2386 }
2387 }
2388
2389 if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
2390 switch (Scope) {
2391 case SIAtomicScope::SYSTEM:
2392 case SIAtomicScope::AGENT:
2393 case SIAtomicScope::CLUSTER:
2394 case SIAtomicScope::WORKGROUP:
2395 // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
2396 // not needed as LDS operations for all waves are executed in a total
2397 // global ordering as observed by all waves. Required if also
2398 // synchronizing with global/GDS memory as LDS operations could be
2399 // reordered with respect to later global/GDS memory operations of the
2400 // same wave.
2401 DSCnt |= IsCrossAddrSpaceOrdering;
2402 break;
2403 case SIAtomicScope::WAVEFRONT:
2404 case SIAtomicScope::SINGLETHREAD:
2405 // The LDS keeps all memory operations in order for
2406 // the same wavefront.
2407 break;
2408 default:
2409 llvm_unreachable("Unsupported synchronization scope");
2410 }
2411 }
2412
2413 if (LOADCnt) {
2414 // Acquire sequences only need to wait on the previous atomic operation.
2415 // e.g. a typical sequence looks like
2416 // atomic load
2417 // (wait)
2418 // global_inv
2419 //
2420 // We do not have BVH or SAMPLE atomics, so the atomic load is always going
2421 // to be tracked using loadcnt.
2422 //
2423 // This also applies to fences. Fences cannot pair with an instruction
2424 // tracked with bvh/samplecnt as we don't have any atomics that do that.
2425 if (Order != AtomicOrdering::Acquire && ST.hasImageInsts()) {
2426 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_BVHCNT_soft)).addImm(0);
2427 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_SAMPLECNT_soft)).addImm(0);
2428 }
2429 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_soft)).addImm(0);
2430 Changed = true;
2431 }
2432
2433 if (STORECnt) {
2434 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_STORECNT_soft)).addImm(0);
2435 Changed = true;
2436 }
2437
2438 if (DSCnt) {
2439 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_DSCNT_soft)).addImm(0);
2440 Changed = true;
2441 }
2442
2443 if (Pos == Position::AFTER)
2444 --MI;
2445
2446 return Changed;
2447}
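
// Illustrative sketch (not part of the original source): for a sequentially
// consistent operation at agent scope that orders both loads and stores and
// also orders LDS with cross-address-space ordering, the code above emits
// roughly
//
//   S_WAIT_BVHCNT_soft 0      ; only on targets with image instructions,
//   S_WAIT_SAMPLECNT_soft 0   ;   and only when the ordering is not acquire
//   S_WAIT_LOADCNT_soft 0
//   S_WAIT_STORECNT_soft 0
//   S_WAIT_DSCNT_soft 0
//
// The _soft forms leave room for later waitcnt passes to tighten or drop them.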
2448
2449bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
2450 SIAtomicScope Scope,
2451 SIAtomicAddrSpace AddrSpace,
2452 Position Pos) const {
2453 if (!InsertCacheInv)
2454 return false;
2455
2456 MachineBasicBlock &MBB = *MI->getParent();
2457 DebugLoc DL = MI->getDebugLoc();
2458
2459 /// The scratch address space does not need the global memory cache
2460 /// to be flushed as all memory operations by the same thread are
2461 /// sequentially consistent, and no other thread can access scratch
2462 /// memory.
2463
2464 /// Other address spaces do not have a cache.
2465 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) == SIAtomicAddrSpace::NONE)
2466 return false;
2467
2468 AMDGPU::CPol::CPol ScopeImm = AMDGPU::CPol::SCOPE_DEV;
2469 switch (Scope) {
2470 case SIAtomicScope::SYSTEM:
2471 ScopeImm = AMDGPU::CPol::SCOPE_SYS;
2472 break;
2473 case SIAtomicScope::AGENT:
2474 ScopeImm = AMDGPU::CPol::SCOPE_DEV;
2475 break;
2476 case SIAtomicScope::CLUSTER:
2477 ScopeImm = AMDGPU::CPol::SCOPE_SE;
2478 break;
2479 case SIAtomicScope::WORKGROUP:
2480 // GFX12.0:
2481 // In WGP mode the waves of a work-group can be executing on either CU of
2482 // the WGP. Therefore we need to invalidate the L0 which is per CU.
2483 // Otherwise in CU mode all waves of a work-group are on the same CU, and
2484 // so the L0 does not need to be invalidated.
2485 //
2486 // GFX12.5
2487 // TODO DOCS
2488 if (ST.isCuModeEnabled())
2489 return false;
2490
2491 ScopeImm = AMDGPU::CPol::SCOPE_SE;
2492 break;
2493 case SIAtomicScope::WAVEFRONT:
2494 case SIAtomicScope::SINGLETHREAD:
2495 // No cache to invalidate.
2496 return false;
2497 default:
2498 llvm_unreachable("Unsupported synchronization scope");
2499 }
2500
2501 if (Pos == Position::AFTER)
2502 ++MI;
2503
2504 BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_INV)).addImm(ScopeImm);
2505
2506 if (Pos == Position::AFTER)
2507 --MI;
2508
2509 return true;
2510}
2511
2512bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
2513 SIAtomicScope Scope,
2514 SIAtomicAddrSpace AddrSpace,
2515 bool IsCrossAddrSpaceOrdering,
2516 Position Pos) const {
2517 MachineBasicBlock &MBB = *MI->getParent();
2518 DebugLoc DL = MI->getDebugLoc();
2519
2520 // The scratch address space does not need the global memory cache
2521 // writeback as all memory operations by the same thread are
2522 // sequentially consistent, and no other thread can access scratch
2523 // memory.
2524
2525 // Other address spaces do not have a cache.
2526 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) == SIAtomicAddrSpace::NONE)
2527 return false;
2528
2529 if (Pos == Position::AFTER)
2530 ++MI;
2531
2532 // A global_wb is only necessary at system scope for GFX12.0; it is also
2533 // necessary at device scope for GFX12.5.
2534 //
2535 // Emitting it for lower scopes is a slow no-op, so we omit it
2536 // for performance.
2537 switch (Scope) {
2538 case SIAtomicScope::SYSTEM:
2539 BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_WB))
2540 .addImm(AMDGPU::CPol::SCOPE_SYS);
2541 break;
2542 case SIAtomicScope::AGENT:
2543 // TODO DOCS
2544 if (ST.hasGFX1250Insts()) {
2545 BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_WB))
2546 .addImm(AMDGPU::CPol::SCOPE_DEV);
2547 }
2548 break;
2549 case SIAtomicScope::CLUSTER:
2550 case SIAtomicScope::WORKGROUP:
2551 // No WB necessary, but we still have to wait.
2552 break;
2553 case SIAtomicScope::WAVEFRONT:
2554 case SIAtomicScope::SINGLETHREAD:
2555 // No WB or wait necessary here.
2556 return false;
2557 default:
2558 llvm_unreachable("Unsupported synchronization scope");
2559 }
2560
2561 if (Pos == Position::AFTER)
2562 --MI;
2563
2564 // We always have to wait for previous memory operations (load/store) to
2565 // complete, whether we inserted a WB or not. If we inserted a WB (storecnt),
2566 // we of course need to wait for that as well.
2567 insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
2568 IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release);
2569
2570 return true;
2571}
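
// Illustrative sketch (not part of the original source): a system-scope release
// of global memory on GFX12.0 therefore becomes, roughly,
//
//   GLOBAL_WB (scope SCOPE_SYS)   ; write dirty lines back towards memory
//   <soft load/store-count waits from the insertWait call above>
//
// Lower scopes skip the GLOBAL_WB but still perform the waits, and on targets
// with image instructions the wait sequence also covers the sample/BVH
// counters.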
2572
2573bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
2574 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
2575 bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
2576
2577 // Only handle load and store, not atomic read-modify-write instructions.
2578 assert(MI->mayLoad() ^ MI->mayStore());
2579
2580 // Only update load and store, not LLVM IR atomic read-modify-write
2581 // instructions. The latter are always marked as volatile, so we cannot sensibly
2582 // handle them, as we do not want to pessimize all atomics. Also they do not support
2583 // the nontemporal attribute.
2584 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
2585
2586 bool Changed = false;
2587
2588 if (IsLastUse) {
2589 // Set last-use hint.
2590 Changed |= setTH(MI, AMDGPU::CPol::TH_LU);
2591 } else if (IsNonTemporal) {
2592 // Set non-temporal hint for all cache levels.
2593 Changed |= setTH(MI, AMDGPU::CPol::TH_NT);
2594 }
2595
2596 if (IsVolatile) {
2597 Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS);
2598
2599 // Ensure operation has completed at system scope to cause all volatile
2600 // operations to be visible outside the program in a global order. Do not
2601 // request cross address space as only the global address space can be
2602 // observable outside the program, so no need to cause a waitcnt for LDS
2603 // address space operations.
2604 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
2605 Position::AFTER, AtomicOrdering::Unordered);
2606 }
2607
2608 return Changed;
2609}
2610
2611bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const {
2612 assert(MI.mayStore() && "Not a Store inst");
2613 const bool IsRMW = (MI.mayLoad() && MI.mayStore());
2614 bool Changed = false;
2615
2616 // GFX12.5 only: xcnt wait is needed before flat and global atomics
2617 // stores/rmw.
2618 if (Atomic && ST.requiresWaitXCntBeforeAtomicStores() && TII->isFLAT(MI)) {
2619 MachineBasicBlock &MBB = *MI.getParent();
2620 BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(S_WAIT_XCNT_soft)).addImm(0);
2621 Changed = true;
2622 }
2623
2624 // Remaining fixes do not apply to RMWs.
2625 if (IsRMW)
2626 return Changed;
2627
2628 MachineOperand *CPol = TII->getNamedOperand(MI, OpName::cpol);
2629 if (!CPol) // Some vmem operations do not have a scope and are not concerned.
2630 return Changed;
2631 const unsigned Scope = CPol->getImm() & CPol::SCOPE;
2632
2633 // GFX12.0 only: Extra waits needed before system scope stores.
2634 if (!ST.hasGFX1250Insts() && !Atomic && Scope == CPol::SCOPE_SYS)
2635 Changed |= insertWaitsBeforeSystemScopeStore(MI.getIterator());
2636
2637 return Changed;
2638}
2639
2640bool SIGfx12CacheControl::handleCooperativeAtomic(MachineInstr &MI) const {
2641 if (!ST.hasGFX1250Insts())
2642 return false;
2643
2644 // Cooperative atomics need to be SCOPE_DEV or higher.
2645 MachineOperand *CPol = TII->getNamedOperand(MI, OpName::cpol);
2646 assert(CPol && "No CPol operand?");
2647 const unsigned Scope = CPol->getImm() & CPol::SCOPE;
2648 if (Scope < CPol::SCOPE_DEV)
2649 return setScope(MI, CPol::SCOPE_DEV);
2650 return false;
2651}
2652
2653bool SIGfx12CacheControl::setAtomicScope(const MachineBasicBlock::iterator &MI,
2654 SIAtomicScope Scope,
2655 SIAtomicAddrSpace AddrSpace) const {
2656 bool Changed = false;
2657
2658 if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
2659 switch (Scope) {
2660 case SIAtomicScope::SYSTEM:
2661 Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS);
2662 break;
2663 case SIAtomicScope::AGENT:
2664 Changed |= setScope(MI, AMDGPU::CPol::SCOPE_DEV);
2665 break;
2666 case SIAtomicScope::CLUSTER:
2667 Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SE);
2668 break;
2669 case SIAtomicScope::WORKGROUP:
2670 // In workgroup mode, SCOPE_SE is needed as waves can execute on
2671 // different CUs that access different L0s.
2672 if (!ST.isCuModeEnabled())
2673 Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SE);
2674 break;
2675 case SIAtomicScope::WAVEFRONT:
2676 case SIAtomicScope::SINGLETHREAD:
2677 // No cache to bypass.
2678 break;
2679 default:
2680 llvm_unreachable("Unsupported synchronization scope");
2681 }
2682 }
2683
2684 // The scratch address space does not need the global memory caches
2685 // to be bypassed as all memory operations by the same thread are
2686 // sequentially consistent, and no other thread can access scratch
2687 // memory.
2688
2689 // Other address spaces do not have a cache.
2690
2691 return Changed;
2692}
2693
2694bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
2695 if (AtomicPseudoMIs.empty())
2696 return false;
2697
2698 for (auto &MI : AtomicPseudoMIs)
2699 MI->eraseFromParent();
2700
2701 AtomicPseudoMIs.clear();
2702 return true;
2703}
2704
2705 bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
2706 MachineBasicBlock::iterator &MI) {
2707 assert(MI->mayLoad() && !MI->mayStore());
2708
2709 bool Changed = false;
2710
2711 if (MOI.isAtomic()) {
2712 const AtomicOrdering Order = MOI.getOrdering();
2713 if (Order == AtomicOrdering::Monotonic ||
2714 Order == AtomicOrdering::Acquire ||
2715 Order == AtomicOrdering::SequentiallyConsistent) {
2716 Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(),
2717 MOI.getOrderingAddrSpace());
2718 }
2719
2720 // Handle cooperative atomics after cache bypass step, as it may override
2721 // the scope of the instruction to a greater scope.
2722 if (MOI.isCooperative())
2723 Changed |= CC->handleCooperativeAtomic(*MI);
2724
2725 if (Order == AtomicOrdering::SequentiallyConsistent)
2726 Changed |= CC->insertWait(MI, MOI.getScope(), MOI.getOrderingAddrSpace(),
2727 SIMemOp::LOAD | SIMemOp::STORE,
2728 MOI.getIsCrossAddressSpaceOrdering(),
2729 Position::BEFORE, Order);
2730
2731 if (Order == AtomicOrdering::Acquire ||
2732 Order == AtomicOrdering::SequentiallyConsistent) {
2733 Changed |= CC->insertWait(
2734 MI, MOI.getScope(), MOI.getInstrAddrSpace(), SIMemOp::LOAD,
2735 MOI.getIsCrossAddressSpaceOrdering(), Position::AFTER, Order);
2736 Changed |= CC->insertAcquire(MI, MOI.getScope(),
2737 MOI.getOrderingAddrSpace(),
2738 Position::AFTER);
2739 }
2740
2741 return Changed;
2742 }
2743
2744 // Atomic instructions already bypass caches to the scope specified by the
2745 // SyncScope operand. Only non-atomic volatile and nontemporal/last-use
2746 // instructions need additional treatment.
2747 Changed |= CC->enableVolatileAndOrNonTemporal(
2748 MI, MOI.getInstrAddrSpace(), SIMemOp::LOAD, MOI.isVolatile(),
2749 MOI.isNonTemporal(), MOI.isLastUse());
2750
2751 return Changed;
2752}
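
// Illustrative sketch (not part of the original source): for an IR-level
// sequentially consistent atomic load at agent scope, the expansion above
// amounts to
//
//   <wait for prior loads and stores>        // insertWait, Position::BEFORE
//   <the atomic load, cache-bypass bits set by enableLoadCacheBypass>
//   <wait for the load itself>               // insertWait, Position::AFTER
//   <cache invalidate, e.g. BUFFER_GL0_INV/BUFFER_GL1_INV or GLOBAL_INV>
//
// A plain acquire load skips the leading wait, and a monotonic load only gets
// the cache-bypass bits.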
2753
2754 bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
2755 MachineBasicBlock::iterator &MI) {
2756 assert(!MI->mayLoad() && MI->mayStore());
2757
2758 bool Changed = false;
2759 // FIXME: Necessary hack because iterator can lose track of the store.
2760 MachineInstr &StoreMI = *MI;
2761
2762 if (MOI.isAtomic()) {
2763 if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
2764 MOI.getOrdering() == AtomicOrdering::Release ||
2765 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2766 Changed |= CC->enableStoreCacheBypass(MI, MOI.getScope(),
2767 MOI.getOrderingAddrSpace());
2768 }
2769
2770 // Handle cooperative atomics after cache bypass step, as it may override
2771 // the scope of the instruction to a greater scope.
2772 if (MOI.isCooperative())
2773 Changed |= CC->handleCooperativeAtomic(*MI);
2774
2775 if (MOI.getOrdering() == AtomicOrdering::Release ||
2776 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2777 Changed |= CC->insertRelease(MI, MOI.getScope(),
2778 MOI.getOrderingAddrSpace(),
2779 MOI.getIsCrossAddressSpaceOrdering(),
2780 Position::BEFORE);
2781
2782 Changed |= CC->finalizeStore(StoreMI, /*Atomic=*/true);
2783 return Changed;
2784 }
2785
2786 // Atomic instructions already bypass caches to the scope specified by the
2787 // SyncScope operand. Only non-atomic volatile and nontemporal instructions
2788 // need additional treatment.
2789 Changed |= CC->enableVolatileAndOrNonTemporal(
2790 MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(),
2791 MOI.isNonTemporal());
2792
2793 // GFX12 specific: the scope (the desired coherence domain in the cache
2794 // hierarchy) is an instruction field; do not confuse it with the atomic scope.
2795 Changed |= CC->finalizeStore(StoreMI, /*Atomic=*/false);
2796 return Changed;
2797}
2798
2799 bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
2800 MachineBasicBlock::iterator &MI) {
2801 assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);
2802
2803 AtomicPseudoMIs.push_back(MI);
2804 bool Changed = false;
2805
2806 const SIAtomicAddrSpace OrderingAddrSpace = MOI.getOrderingAddrSpace();
2807
2808 if (MOI.isAtomic()) {
2809 const AtomicOrdering Order = MOI.getOrdering();
2810 if (Order == AtomicOrdering::Acquire) {
2811 Changed |= CC->insertWait(
2812 MI, MOI.getScope(), OrderingAddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
2813 MOI.getIsCrossAddressSpaceOrdering(), Position::BEFORE, Order);
2814 }
2815
2816 if (Order == AtomicOrdering::Release ||
2817 Order == AtomicOrdering::AcquireRelease ||
2818 Order == AtomicOrdering::SequentiallyConsistent)
2819 /// TODO: This relies on a barrier always generating a waitcnt
2820 /// for LDS to ensure it is not reordered with the completion of
2821 /// the proceeding LDS operations. If barrier had a memory
2822 /// ordering and memory scope, then library does not need to
2823 /// generate a fence. Could add support in this file for
2824 /// barrier. SIInsertWaitcnt.cpp could then stop unconditionally
2825 /// adding S_WAITCNT before a S_BARRIER.
2826 Changed |= CC->insertRelease(MI, MOI.getScope(), OrderingAddrSpace,
2827 MOI.getIsCrossAddressSpaceOrdering(),
2828 Position::BEFORE);
2829
2830 // TODO: If both release and invalidate are happening they could be combined
2831 // to use the single "BUFFER_WBINV*" instruction. This could be done by
2832 // reorganizing this code or as part of optimizing SIInsertWaitcnt pass to
2833 // track cache invalidate and write back instructions.
2834
2835 if (Order == AtomicOrdering::Acquire ||
2836 Order == AtomicOrdering::AcquireRelease ||
2837 Order == AtomicOrdering::SequentiallyConsistent)
2838 Changed |= CC->insertAcquire(MI, MOI.getScope(), OrderingAddrSpace,
2839 Position::BEFORE);
2840
2841 return Changed;
2842 }
2843
2844 return Changed;
2845}
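
// Illustrative sketch (not part of the original source): an IR fence with
// seq_cst ordering at agent scope is rewritten into, approximately,
//
//   <release half: waits, plus a writeback on targets that have one>  // insertRelease
//   <acquire half: cache invalidate for the ordered address spaces>   // insertAcquire
//
// inserted before the ATOMIC_FENCE pseudo, which is afterwards deleted by
// removeAtomicPseudoMIs(). A pure acquire fence emits the wait and the
// invalidate but no writeback; a pure release fence emits only the release
// half.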
2846
2847 bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
2848 MachineBasicBlock::iterator &MI) {
2849 assert(MI->mayLoad() && MI->mayStore());
2850
2851 bool Changed = false;
2852 MachineInstr &RMWMI = *MI;
2853
2854 if (MOI.isAtomic()) {
2855 const AtomicOrdering Order = MOI.getOrdering();
2856 if (Order == AtomicOrdering::Monotonic ||
2857 Order == AtomicOrdering::Acquire || Order == AtomicOrdering::Release ||
2858 Order == AtomicOrdering::AcquireRelease ||
2859 Order == AtomicOrdering::SequentiallyConsistent) {
2860 Changed |= CC->enableRMWCacheBypass(MI, MOI.getScope(),
2861 MOI.getInstrAddrSpace());
2862 }
2863
2864 if (Order == AtomicOrdering::Release ||
2865 Order == AtomicOrdering::AcquireRelease ||
2866 Order == AtomicOrdering::SequentiallyConsistent ||
2867 MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
2868 Changed |= CC->insertRelease(MI, MOI.getScope(),
2869 MOI.getOrderingAddrSpace(),
2870 MOI.getIsCrossAddressSpaceOrdering(),
2871 Position::BEFORE);
2872
2873 if (Order == AtomicOrdering::Acquire ||
2874 Order == AtomicOrdering::AcquireRelease ||
2875 Order == AtomicOrdering::SequentiallyConsistent ||
2876 MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
2877 MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
2878 Changed |= CC->insertWait(
2879 MI, MOI.getScope(), MOI.getInstrAddrSpace(),
2880 isAtomicRet(*MI) ? SIMemOp::LOAD : SIMemOp::STORE,
2881 MOI.getIsCrossAddressSpaceOrdering(), Position::AFTER, Order);
2882 Changed |= CC->insertAcquire(MI, MOI.getScope(),
2883 MOI.getOrderingAddrSpace(),
2884 Position::AFTER);
2885 }
2886
2887 Changed |= CC->finalizeStore(RMWMI, /*Atomic=*/true);
2888 return Changed;
2889 }
2890
2891 return Changed;
2892}
2893
2894bool SIMemoryLegalizerLegacy::runOnMachineFunction(MachineFunction &MF) {
2895 const MachineModuleInfo &MMI =
2896 getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
2897 return SIMemoryLegalizer(MMI).run(MF);
2898}
2899
2900 PreservedAnalyses
2901 SIMemoryLegalizerPass::run(MachineFunction &MF,
2902 MachineFunctionAnalysisManager &MFAM) {
2903 auto *MMI = MFAM.getResult<ModuleAnalysisManagerMachineFunctionProxy>(MF)
2904 .getCachedResult<MachineModuleAnalysis>(
2905 *MF.getFunction().getParent());
2906 assert(MMI && "MachineModuleAnalysis must be available");
2907 if (!SIMemoryLegalizer(MMI->getMMI()).run(MF))
2908 return PreservedAnalyses::all();
2909 return getMachineFunctionPassPreservedAnalyses().preserveSet<CFGAnalyses>();
2910 }
2911
2912bool SIMemoryLegalizer::run(MachineFunction &MF) {
2913 bool Changed = false;
2914
2915 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2916 SIMemOpAccess MOA(MMI.getObjFileInfo<AMDGPUMachineModuleInfo>(), ST);
2917 CC = SICacheControl::create(ST);
2918
2919 for (auto &MBB : MF) {
2920 for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {
2921
2922 // Unbundle instructions after the post-RA scheduler.
2923 if (MI->isBundle() && MI->mayLoadOrStore()) {
2924 MachineBasicBlock::instr_iterator II(MI->getIterator());
2925 for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end();
2926 I != E && I->isBundledWithPred(); ++I) {
2927 I->unbundleFromPred();
2928 for (MachineOperand &MO : I->operands())
2929 if (MO.isReg())
2930 MO.setIsInternalRead(false);
2931 }
2932
2933 MI->eraseFromParent();
2934 MI = II->getIterator();
2935 }
2936
2937 if (ST.getInstrInfo()->isBarrierStart(MI->getOpcode())) {
2938 Changed |= CC->insertBarrierStart(MI);
2939 continue;
2940 }
2941
2942 if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
2943 continue;
2944
2945 if (const auto &MOI = MOA.getLoadInfo(MI))
2946 Changed |= expandLoad(*MOI, MI);
2947 else if (const auto &MOI = MOA.getStoreInfo(MI)) {
2948 Changed |= expandStore(*MOI, MI);
2949 } else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
2950 Changed |= expandAtomicFence(*MOI, MI);
2951 else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
2952 Changed |= expandAtomicCmpxchgOrRmw(*MOI, MI);
2953 }
2954 }
2955
2956 Changed |= removeAtomicPseudoMIs();
2957 return Changed;
2958}
2959
2960INITIALIZE_PASS(SIMemoryLegalizerLegacy, DEBUG_TYPE, PASS_NAME, false, false)
2961
2962char SIMemoryLegalizerLegacy::ID = 0;
2963char &llvm::SIMemoryLegalizerID = SIMemoryLegalizerLegacy::ID;
2964
2965 FunctionPass *llvm::createSIMemoryLegalizerPass() {
2966 return new SIMemoryLegalizerLegacy();
2967}