// SIPeepholeSDWA.cpp — LLVM 22.0.0git (source listing extracted from doxygen).
1//===- SIPeepholeSDWA.cpp - Peephole optimization for SDWA instructions ---===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file This pass tries to apply several peephole SDWA patterns.
10///
11/// E.g. original:
12/// V_LSHRREV_B32_e32 %0, 16, %1
13/// V_ADD_CO_U32_e32 %2, %0, %3
14/// V_LSHLREV_B32_e32 %4, 16, %2
15///
16/// Replace:
17/// V_ADD_CO_U32_sdwa %4, %1, %3
18/// dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
19///
20//===----------------------------------------------------------------------===//
21
22#include "SIPeepholeSDWA.h"
23#include "AMDGPU.h"
24#include "GCNSubtarget.h"
26#include "llvm/ADT/MapVector.h"
27#include "llvm/ADT/Statistic.h"
29#include <optional>
30
31using namespace llvm;
32
33#define DEBUG_TYPE "si-peephole-sdwa"
34
// Pass-wide counters, reported when LLVM is run with -stats.
STATISTIC(NumSDWAPatternsFound, "Number of SDWA patterns found.");
STATISTIC(NumSDWAInstructionsPeepholed,
          "Number of instruction converted to SDWA.");
38
39namespace {
40
41bool isConvertibleToSDWA(MachineInstr &MI, const GCNSubtarget &ST,
42 const SIInstrInfo *TII);
43class SDWAOperand;
44class SDWADstOperand;
45
46using SDWAOperandsVector = SmallVector<SDWAOperand *, 4>;
48
49class SIPeepholeSDWA {
50private:
52 const SIRegisterInfo *TRI;
53 const SIInstrInfo *TII;
54
56 SDWAOperandsMap PotentialMatches;
57 SmallVector<MachineInstr *, 8> ConvertedInstructions;
58
59 std::optional<int64_t> foldToImm(const MachineOperand &Op) const;
60
61 void matchSDWAOperands(MachineBasicBlock &MBB);
62 std::unique_ptr<SDWAOperand> matchSDWAOperand(MachineInstr &MI);
63 void pseudoOpConvertToVOP2(MachineInstr &MI,
64 const GCNSubtarget &ST) const;
65 void convertVcndmaskToVOP2(MachineInstr &MI, const GCNSubtarget &ST) const;
66 MachineInstr *createSDWAVersion(MachineInstr &MI);
67 bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);
68 void legalizeScalarOperands(MachineInstr &MI, const GCNSubtarget &ST) const;
69
70public:
71 bool run(MachineFunction &MF);
72};
73
74class SIPeepholeSDWALegacy : public MachineFunctionPass {
75public:
76 static char ID;
77
78 SIPeepholeSDWALegacy() : MachineFunctionPass(ID) {}
79
80 StringRef getPassName() const override { return "SI Peephole SDWA"; }
81
82 bool runOnMachineFunction(MachineFunction &MF) override;
83
84 void getAnalysisUsage(AnalysisUsage &AU) const override {
85 AU.setPreservesCFG();
87 }
88};
89
90using namespace AMDGPU::SDWA;
91
92class SDWAOperand {
93private:
94 MachineOperand *Target; // Operand that would be used in converted instruction
95 MachineOperand *Replaced; // Operand that would be replace by Target
96
97 /// Returns true iff the SDWA selection of this SDWAOperand can be combined
98 /// with the SDWA selections of its uses in \p MI.
99 virtual bool canCombineSelections(const MachineInstr &MI,
100 const SIInstrInfo *TII) = 0;
101
102public:
103 SDWAOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp)
104 : Target(TargetOp), Replaced(ReplacedOp) {
105 assert(Target->isReg());
106 assert(Replaced->isReg());
107 }
108
109 virtual ~SDWAOperand() = default;
110
111 virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII,
112 const GCNSubtarget &ST,
113 SDWAOperandsMap *PotentialMatches = nullptr) = 0;
114 virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) = 0;
115
116 MachineOperand *getTargetOperand() const { return Target; }
117 MachineOperand *getReplacedOperand() const { return Replaced; }
118 MachineInstr *getParentInst() const { return Target->getParent(); }
119
120 MachineRegisterInfo *getMRI() const {
121 return &getParentInst()->getParent()->getParent()->getRegInfo();
122 }
123
124#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
125 virtual void print(raw_ostream& OS) const = 0;
126 void dump() const { print(dbgs()); }
127#endif
128};
129
/// A foldable *source* operand: the SDWA instruction reads a byte/word
/// slice (SrcSel) of the replaced register, optionally with float
/// (abs/neg) or integer (sext) source modifiers.
class SDWASrcOperand : public SDWAOperand {
private:
  SdwaSel SrcSel; // Which byte/word of the replaced register is read.
  bool Abs;       // Float |x| source modifier.
  bool Neg;       // Float negate source modifier.
  bool Sext;      // Integer sign-extend source modifier.

public:
  SDWASrcOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                 SdwaSel SrcSel_ = DWORD, bool Abs_ = false, bool Neg_ = false,
                 bool Sext_ = false)
      : SDWAOperand(TargetOp, ReplacedOp), SrcSel(SrcSel_), Abs(Abs_),
        Neg(Neg_), Sext(Sext_) {}

  MachineInstr *potentialToConvert(const SIInstrInfo *TII,
                                   const GCNSubtarget &ST,
                                   SDWAOperandsMap *PotentialMatches = nullptr) override;
  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
  bool canCombineSelections(const MachineInstr &MI,
                            const SIInstrInfo *TII) override;

  SdwaSel getSrcSel() const { return SrcSel; }
  bool getAbs() const { return Abs; }
  bool getNeg() const { return Neg; }
  bool getSext() const { return Sext; }

  /// Compute the src_modifiers immediate for \p SrcOp, merging the
  /// modifiers already present on its instruction with Abs/Neg/Sext.
  uint64_t getSrcMods(const SIInstrInfo *TII,
                      const MachineOperand *SrcOp) const;

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream& OS) const override;
#endif
};
163
/// A foldable *destination* operand: the SDWA instruction writes a
/// byte/word slice (DstSel) of the replaced register, with the unused
/// lanes handled per DstUn (pad/sext/preserve).
class SDWADstOperand : public SDWAOperand {
private:
  SdwaSel DstSel;  // Which byte/word of the destination is written.
  DstUnused DstUn; // Treatment of the lanes not selected by DstSel.

public:
  SDWADstOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                 SdwaSel DstSel_ = DWORD, DstUnused DstUn_ = UNUSED_PAD)
      : SDWAOperand(TargetOp, ReplacedOp), DstSel(DstSel_), DstUn(DstUn_) {}

  MachineInstr *potentialToConvert(const SIInstrInfo *TII,
                                   const GCNSubtarget &ST,
                                   SDWAOperandsMap *PotentialMatches = nullptr) override;
  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
  bool canCombineSelections(const MachineInstr &MI,
                            const SIInstrInfo *TII) override;

  SdwaSel getDstSel() const { return DstSel; }
  DstUnused getDstUnused() const { return DstUn; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream& OS) const override;
#endif
};
188
/// Destination operand for the dst_unused:UNUSED_PRESERVE pattern: the
/// lanes not written by DstSel are kept from another register
/// (Preserve), which becomes a tied implicit use on conversion.
class SDWADstPreserveOperand : public SDWADstOperand {
private:
  MachineOperand *Preserve; // Register supplying the untouched lanes.

public:
  SDWADstPreserveOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                         MachineOperand *PreserveOp, SdwaSel DstSel_ = DWORD)
      : SDWADstOperand(TargetOp, ReplacedOp, DstSel_, UNUSED_PRESERVE),
        Preserve(PreserveOp) {}

  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
  bool canCombineSelections(const MachineInstr &MI,
                            const SIInstrInfo *TII) override;

  MachineOperand *getPreservedOperand() const { return Preserve; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream& OS) const override;
#endif
};
209
210} // end anonymous namespace
211
// Register the legacy pass with the pass registry.
INITIALIZE_PASS(SIPeepholeSDWALegacy, DEBUG_TYPE, "SI Peephole SDWA", false,
                false)

// Unique identifier used by the legacy pass manager.
char SIPeepholeSDWALegacy::ID = 0;

// Exported handle so other code can refer to this pass by ID.
char &llvm::SIPeepholeSDWALegacyID = SIPeepholeSDWALegacy::ID;
218
220 return new SIPeepholeSDWALegacy();
221}
222
223#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
225 switch(Sel) {
226 case BYTE_0: OS << "BYTE_0"; break;
227 case BYTE_1: OS << "BYTE_1"; break;
228 case BYTE_2: OS << "BYTE_2"; break;
229 case BYTE_3: OS << "BYTE_3"; break;
230 case WORD_0: OS << "WORD_0"; break;
231 case WORD_1: OS << "WORD_1"; break;
232 case DWORD: OS << "DWORD"; break;
233 }
234 return OS;
235}
236
238 switch(Un) {
239 case UNUSED_PAD: OS << "UNUSED_PAD"; break;
240 case UNUSED_SEXT: OS << "UNUSED_SEXT"; break;
241 case UNUSED_PRESERVE: OS << "UNUSED_PRESERVE"; break;
242 }
243 return OS;
244}
245
247void SDWASrcOperand::print(raw_ostream& OS) const {
248 OS << "SDWA src: " << *getTargetOperand()
249 << " src_sel:" << getSrcSel()
250 << " abs:" << getAbs() << " neg:" << getNeg()
251 << " sext:" << getSext() << '\n';
252}
253
255void SDWADstOperand::print(raw_ostream& OS) const {
256 OS << "SDWA dst: " << *getTargetOperand()
257 << " dst_sel:" << getDstSel()
258 << " dst_unused:" << getDstUnused() << '\n';
259}
260
262void SDWADstPreserveOperand::print(raw_ostream& OS) const {
263 OS << "SDWA preserve dst: " << *getTargetOperand()
264 << " dst_sel:" << getDstSel()
265 << " preserve:" << *getPreservedOperand() << '\n';
266}
267
268#endif
269
270static void copyRegOperand(MachineOperand &To, const MachineOperand &From) {
271 assert(To.isReg() && From.isReg());
272 To.setReg(From.getReg());
273 To.setSubReg(From.getSubReg());
274 To.setIsUndef(From.isUndef());
275 if (To.isUse()) {
276 To.setIsKill(From.isKill());
277 } else {
278 To.setIsDead(From.isDead());
279 }
280}
281
282static bool isSameReg(const MachineOperand &LHS, const MachineOperand &RHS) {
283 return LHS.isReg() &&
284 RHS.isReg() &&
285 LHS.getReg() == RHS.getReg() &&
286 LHS.getSubReg() == RHS.getSubReg();
287}
288
290 const MachineRegisterInfo *MRI) {
291 if (!Reg->isReg() || !Reg->isDef())
292 return nullptr;
293
294 return MRI->getOneNonDBGUse(Reg->getReg());
295}
296
298 const MachineRegisterInfo *MRI) {
299 if (!Reg->isReg())
300 return nullptr;
301
302 return MRI->getOneDef(Reg->getReg());
303}
304
305/// Combine an SDWA instruction's existing SDWA selection \p Sel with
306/// the SDWA selection \p OperandSel of its operand. If the selections
307/// are compatible, return the combined selection, otherwise return a
308/// nullopt.
309/// For example, if we have Sel = BYTE_0 Sel and OperandSel = WORD_1:
310/// BYTE_0 Sel (WORD_1 Sel (%X)) -> BYTE_2 Sel (%X)
311static std::optional<SdwaSel> combineSdwaSel(SdwaSel Sel, SdwaSel OperandSel) {
312 if (Sel == SdwaSel::DWORD)
313 return OperandSel;
314
315 if (Sel == OperandSel || OperandSel == SdwaSel::DWORD)
316 return Sel;
317
318 if (Sel == SdwaSel::WORD_1 || Sel == SdwaSel::BYTE_2 ||
319 Sel == SdwaSel::BYTE_3)
320 return {};
321
322 if (OperandSel == SdwaSel::WORD_0)
323 return Sel;
324
325 if (OperandSel == SdwaSel::WORD_1) {
326 if (Sel == SdwaSel::BYTE_0)
327 return SdwaSel::BYTE_2;
328 if (Sel == SdwaSel::BYTE_1)
329 return SdwaSel::BYTE_3;
330 if (Sel == SdwaSel::WORD_0)
331 return SdwaSel::WORD_1;
332 }
333
334 return {};
335}
336
337uint64_t SDWASrcOperand::getSrcMods(const SIInstrInfo *TII,
338 const MachineOperand *SrcOp) const {
339 uint64_t Mods = 0;
340 const auto *MI = SrcOp->getParent();
341 if (TII->getNamedOperand(*MI, AMDGPU::OpName::src0) == SrcOp) {
342 if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src0_modifiers)) {
343 Mods = Mod->getImm();
344 }
345 } else if (TII->getNamedOperand(*MI, AMDGPU::OpName::src1) == SrcOp) {
346 if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src1_modifiers)) {
347 Mods = Mod->getImm();
348 }
349 }
350 if (Abs || Neg) {
351 assert(!Sext &&
352 "Float and integer src modifiers can't be set simultaneously");
353 Mods |= Abs ? SISrcMods::ABS : 0u;
354 Mods ^= Neg ? SISrcMods::NEG : 0u;
355 } else if (Sext) {
356 Mods |= SISrcMods::SEXT;
357 }
358
359 return Mods;
360}
361
362MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII,
363 const GCNSubtarget &ST,
364 SDWAOperandsMap *PotentialMatches) {
365 if (PotentialMatches != nullptr) {
366 // Fill out the map for all uses if all can be converted
367 MachineOperand *Reg = getReplacedOperand();
368 if (!Reg->isReg() || !Reg->isDef())
369 return nullptr;
370
371 for (MachineInstr &UseMI : getMRI()->use_nodbg_instructions(Reg->getReg()))
372 // Check that all instructions that use Reg can be converted
373 if (!isConvertibleToSDWA(UseMI, ST, TII) ||
374 !canCombineSelections(UseMI, TII))
375 return nullptr;
376
377 // Now that it's guaranteed all uses are legal, iterate over the uses again
378 // to add them for later conversion.
379 for (MachineOperand &UseMO : getMRI()->use_nodbg_operands(Reg->getReg())) {
380 // Should not get a subregister here
381 assert(isSameReg(UseMO, *Reg));
382
383 SDWAOperandsMap &potentialMatchesMap = *PotentialMatches;
384 MachineInstr *UseMI = UseMO.getParent();
385 potentialMatchesMap[UseMI].push_back(this);
386 }
387 return nullptr;
388 }
389
390 // For SDWA src operand potential instruction is one that use register
391 // defined by parent instruction
392 MachineOperand *PotentialMO = findSingleRegUse(getReplacedOperand(), getMRI());
393 if (!PotentialMO)
394 return nullptr;
395
396 MachineInstr *Parent = PotentialMO->getParent();
397
398 return canCombineSelections(*Parent, TII) ? Parent : nullptr;
399}
400
/// Fold this source operand into the SDWA instruction \p MI: find the
/// source slot (src0/src1, or the tied UNUSED_PRESERVE slot) that reads
/// the replaced register, rewrite it to the target register, and update
/// the corresponding src_sel/src_modifiers. Returns false if MI cannot
/// legally take this operand.
bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
  switch (MI.getOpcode()) {
  case AMDGPU::V_CVT_F32_FP8_sdwa:
  case AMDGPU::V_CVT_F32_BF8_sdwa:
  case AMDGPU::V_CVT_PK_F32_FP8_sdwa:
  case AMDGPU::V_CVT_PK_F32_BF8_sdwa:
    // Does not support input modifiers: noabs, noneg, nosext.
    return false;
  case AMDGPU::V_CNDMASK_B32_sdwa:
    // SISrcMods uses the same bitmask for SEXT and NEG modifiers and
    // hence the compiler can only support one type of modifier for
    // each SDWA instruction. For V_CNDMASK_B32_sdwa, this is NEG
    // since its operands get printed using
    // AMDGPUInstPrinter::printOperandAndFPInputMods which produces
    // the output intended for NEG if SEXT is set.
    //
    // The ISA does actually support both modifiers on most SDWA
    // instructions.
    //
    // FIXME Accept SEXT here after fixing this issue.
    if (Sext)
      return false;
    break;
  }

  // Find operand in instruction that matches source operand and replace it with
  // target operand. Set corresponding src_sel
  bool IsPreserveSrc = false;
  MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  MachineOperand *SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel);
  MachineOperand *SrcMods =
      TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
  assert(Src && (Src->isReg() || Src->isImm()));
  if (!isSameReg(*Src, *getReplacedOperand())) {
    // If this is not src0 then it could be src1
    Src = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel);
    SrcMods = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);

    if (!Src ||
        !isSameReg(*Src, *getReplacedOperand())) {
      // It's possible this Src is a tied operand for
      // UNUSED_PRESERVE, in which case we can either
      // abandon the peephole attempt, or if legal we can
      // copy the target operand into the tied slot
      // if the preserve operation will effectively cause the same
      // result by overwriting the rest of the dst.
      MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
      MachineOperand *DstUnused =
          TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);

      if (Dst &&
          DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) {
        // This will work if the tied src is accessing WORD_0, and the dst is
        // writing WORD_1. Modifiers don't matter because all the bits that
        // would be impacted are being overwritten by the dst.
        // Any other case will not work.
        SdwaSel DstSel = static_cast<SdwaSel>(
            TII->getNamedImmOperand(MI, AMDGPU::OpName::dst_sel));
        if (DstSel == AMDGPU::SDWA::SdwaSel::WORD_1 &&
            getSrcSel() == AMDGPU::SDWA::SdwaSel::WORD_0) {
          IsPreserveSrc = true;
          auto DstIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                                   AMDGPU::OpName::vdst);
          auto TiedIdx = MI.findTiedOperandIdx(DstIdx);
          Src = &MI.getOperand(TiedIdx);
          // The tied slot has no sel/modifier operands of its own.
          SrcSel = nullptr;
          SrcMods = nullptr;
        } else {
          // Not legal to convert this src
          return false;
        }
      }
    }
    assert(Src && Src->isReg());

    if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa ||
         MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa ||
         MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
         MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
        !isSameReg(*Src, *getReplacedOperand())) {
      // In case of v_mac_f16/32_sdwa this pass can try to apply src operand to
      // src2. This is not allowed.
      return false;
    }

    assert(isSameReg(*Src, *getReplacedOperand()) &&
           (IsPreserveSrc || (SrcSel && SrcMods)));
  }
  copyRegOperand(*Src, *getTargetOperand());
  if (!IsPreserveSrc) {
    // canCombineSelections() was checked earlier, so the combination is
    // guaranteed to have a value here.
    SdwaSel ExistingSel = static_cast<SdwaSel>(SrcSel->getImm());
    SrcSel->setImm(*combineSdwaSel(ExistingSel, getSrcSel()));
    SrcMods->setImm(getSrcMods(TII, Src));
  }
  getTargetOperand()->setIsKill(false);
  return true;
}
499
500/// Verify that the SDWA selection operand \p SrcSelOpName of the SDWA
501/// instruction \p MI can be combined with the selection \p OpSel.
502static bool canCombineOpSel(const MachineInstr &MI, const SIInstrInfo *TII,
503 AMDGPU::OpName SrcSelOpName, SdwaSel OpSel) {
504 assert(TII->isSDWA(MI.getOpcode()));
505
506 const MachineOperand *SrcSelOp = TII->getNamedOperand(MI, SrcSelOpName);
507 SdwaSel SrcSel = static_cast<SdwaSel>(SrcSelOp->getImm());
508
509 return combineSdwaSel(SrcSel, OpSel).has_value();
510}
511
512/// Verify that \p Op is the same register as the operand of the SDWA
513/// instruction \p MI named by \p SrcOpName and that the SDWA
514/// selection \p SrcSelOpName can be combined with the \p OpSel.
515static bool canCombineOpSel(const MachineInstr &MI, const SIInstrInfo *TII,
516 AMDGPU::OpName SrcOpName,
517 AMDGPU::OpName SrcSelOpName, MachineOperand *Op,
518 SdwaSel OpSel) {
519 assert(TII->isSDWA(MI.getOpcode()));
520
521 const MachineOperand *Src = TII->getNamedOperand(MI, SrcOpName);
522 if (!Src || !isSameReg(*Src, *Op))
523 return true;
524
525 return canCombineOpSel(MI, TII, SrcSelOpName, OpSel);
526}
527
528bool SDWASrcOperand::canCombineSelections(const MachineInstr &MI,
529 const SIInstrInfo *TII) {
530 if (!TII->isSDWA(MI.getOpcode()))
531 return true;
532
533 using namespace AMDGPU;
534
535 return canCombineOpSel(MI, TII, OpName::src0, OpName::src0_sel,
536 getReplacedOperand(), getSrcSel()) &&
537 canCombineOpSel(MI, TII, OpName::src1, OpName::src1_sel,
538 getReplacedOperand(), getSrcSel());
539}
540
541MachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII,
542 const GCNSubtarget &ST,
543 SDWAOperandsMap *PotentialMatches) {
544 // For SDWA dst operand potential instruction is one that defines register
545 // that this operand uses
546 MachineRegisterInfo *MRI = getMRI();
547 MachineInstr *ParentMI = getParentInst();
548
549 MachineOperand *PotentialMO = findSingleRegDef(getReplacedOperand(), MRI);
550 if (!PotentialMO)
551 return nullptr;
552
553 // Check that ParentMI is the only instruction that uses replaced register
554 for (MachineInstr &UseInst : MRI->use_nodbg_instructions(PotentialMO->getReg())) {
555 if (&UseInst != ParentMI)
556 return nullptr;
557 }
558
559 MachineInstr *Parent = PotentialMO->getParent();
560 return canCombineSelections(*Parent, TII) ? Parent : nullptr;
561}
562
563bool SDWADstOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
564 // Replace vdst operand in MI with target operand. Set dst_sel and dst_unused
565
566 if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa ||
567 MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa ||
568 MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
569 MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
570 getDstSel() != AMDGPU::SDWA::DWORD) {
571 // v_mac_f16/32_sdwa allow dst_sel to be equal only to DWORD
572 return false;
573 }
574
575 MachineOperand *Operand = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
576 assert(Operand &&
577 Operand->isReg() &&
578 isSameReg(*Operand, *getReplacedOperand()));
579 copyRegOperand(*Operand, *getTargetOperand());
580 MachineOperand *DstSel= TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel);
581 assert(DstSel);
582
583 SdwaSel ExistingSel = static_cast<SdwaSel>(DstSel->getImm());
584 DstSel->setImm(combineSdwaSel(ExistingSel, getDstSel()).value());
585
586 MachineOperand *DstUnused= TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
588 DstUnused->setImm(getDstUnused());
589
590 // Remove original instruction because it would conflict with our new
591 // instruction by register definition
592 getParentInst()->eraseFromParent();
593 return true;
594}
595
596bool SDWADstOperand::canCombineSelections(const MachineInstr &MI,
597 const SIInstrInfo *TII) {
598 if (!TII->isSDWA(MI.getOpcode()))
599 return true;
600
601 return canCombineOpSel(MI, TII, AMDGPU::OpName::dst_sel, getDstSel());
602}
603
604bool SDWADstPreserveOperand::convertToSDWA(MachineInstr &MI,
605 const SIInstrInfo *TII) {
606 // MI should be moved right before v_or_b32.
607 // For this we should clear all kill flags on uses of MI src-operands or else
608 // we can encounter problem with use of killed operand.
609 for (MachineOperand &MO : MI.uses()) {
610 if (!MO.isReg())
611 continue;
612 getMRI()->clearKillFlags(MO.getReg());
613 }
614
615 // Move MI before v_or_b32
616 MI.getParent()->remove(&MI);
617 getParentInst()->getParent()->insert(getParentInst(), &MI);
618
619 // Add Implicit use of preserved register
620 MachineInstrBuilder MIB(*MI.getMF(), MI);
621 MIB.addReg(getPreservedOperand()->getReg(),
623 getPreservedOperand()->getSubReg());
624
625 // Tie dst to implicit use
626 MI.tieOperands(AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst),
627 MI.getNumOperands() - 1);
628
629 // Convert MI as any other SDWADstOperand and remove v_or_b32
630 return SDWADstOperand::convertToSDWA(MI, TII);
631}
632
// A preserve destination combines selections exactly like a plain SDWA
// destination, so simply delegate to the base class.
bool SDWADstPreserveOperand::canCombineSelections(const MachineInstr &MI,
                                                  const SIInstrInfo *TII) {
  return SDWADstOperand::canCombineSelections(MI, TII);
}
637
638std::optional<int64_t>
639SIPeepholeSDWA::foldToImm(const MachineOperand &Op) const {
640 if (Op.isImm()) {
641 return Op.getImm();
642 }
643
644 // If this is not immediate then it can be copy of immediate value, e.g.:
645 // %1 = S_MOV_B32 255;
646 if (Op.isReg()) {
647 for (const MachineOperand &Def : MRI->def_operands(Op.getReg())) {
648 if (!isSameReg(Op, Def))
649 continue;
650
651 const MachineInstr *DefInst = Def.getParent();
652 if (!TII->isFoldableCopy(*DefInst))
653 return std::nullopt;
654
655 const MachineOperand &Copied = DefInst->getOperand(1);
656 if (!Copied.isImm())
657 return std::nullopt;
658
659 return Copied.getImm();
660 }
661 }
662
663 return std::nullopt;
664}
665
/// Try to recognize \p MI as an instruction whose effect can instead be
/// expressed by an SDWA source or destination selection on a neighboring
/// instruction (shifts by 16/24, 8-bit shifts, BFE, masking ANDs, and the
/// v_or_b32 preserve pattern). Returns the matched operand description,
/// or null if MI matches none of the patterns.
std::unique_ptr<SDWAOperand>
SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) {
  unsigned Opcode = MI.getOpcode();
  switch (Opcode) {
  case AMDGPU::V_LSHRREV_B32_e32:
  case AMDGPU::V_ASHRREV_I32_e32:
  case AMDGPU::V_LSHLREV_B32_e32:
  case AMDGPU::V_LSHRREV_B32_e64:
  case AMDGPU::V_ASHRREV_I32_e64:
  case AMDGPU::V_LSHLREV_B32_e64: {
    // from: v_lshrrev_b32_e32 v1, 16/24, v0
    // to SDWA src:v0 src_sel:WORD_1/BYTE_3

    // from: v_ashrrev_i32_e32 v1, 16/24, v0
    // to SDWA src:v0 src_sel:WORD_1/BYTE_3 sext:1

    // from: v_lshlrev_b32_e32 v1, 16/24, v0
    // to SDWA dst:v1 dst_sel:WORD_1/BYTE_3 dst_unused:UNUSED_PAD
    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    auto Imm = foldToImm(*Src0);
    if (!Imm)
      break;

    if (*Imm != 16 && *Imm != 24)
      break;

    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
    // Only virtual registers can be rewritten by this peephole.
    if (!Src1->isReg() || Src1->getReg().isPhysical() ||
        Dst->getReg().isPhysical())
      break;

    if (Opcode == AMDGPU::V_LSHLREV_B32_e32 ||
        Opcode == AMDGPU::V_LSHLREV_B32_e64) {
      return std::make_unique<SDWADstOperand>(
          Dst, Src1, *Imm == 16 ? WORD_1 : BYTE_3, UNUSED_PAD);
    }
    // Arithmetic right shifts become sign-extending source selections.
    return std::make_unique<SDWASrcOperand>(
        Src1, Dst, *Imm == 16 ? WORD_1 : BYTE_3, false, false,
        Opcode != AMDGPU::V_LSHRREV_B32_e32 &&
            Opcode != AMDGPU::V_LSHRREV_B32_e64);
    break;
  }

  case AMDGPU::V_LSHRREV_B16_e32:
  case AMDGPU::V_ASHRREV_I16_e32:
  case AMDGPU::V_LSHLREV_B16_e32:
  case AMDGPU::V_LSHRREV_B16_e64:
  case AMDGPU::V_LSHRREV_B16_opsel_e64:
  case AMDGPU::V_ASHRREV_I16_e64:
  case AMDGPU::V_LSHLREV_B16_opsel_e64:
  case AMDGPU::V_LSHLREV_B16_e64: {
    // from: v_lshrrev_b16_e32 v1, 8, v0
    // to SDWA src:v0 src_sel:BYTE_1

    // from: v_ashrrev_i16_e32 v1, 8, v0
    // to SDWA src:v0 src_sel:BYTE_1 sext:1

    // from: v_lshlrev_b16_e32 v1, 8, v0
    // to SDWA dst:v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD
    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    auto Imm = foldToImm(*Src0);
    if (!Imm || *Imm != 8)
      break;

    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

    // Only virtual registers can be rewritten by this peephole.
    if (!Src1->isReg() || Src1->getReg().isPhysical() ||
        Dst->getReg().isPhysical())
      break;

    if (Opcode == AMDGPU::V_LSHLREV_B16_e32 ||
        Opcode == AMDGPU::V_LSHLREV_B16_opsel_e64 ||
        Opcode == AMDGPU::V_LSHLREV_B16_e64)
      return std::make_unique<SDWADstOperand>(Dst, Src1, BYTE_1, UNUSED_PAD);
    // Arithmetic right shifts become sign-extending source selections.
    return std::make_unique<SDWASrcOperand>(
        Src1, Dst, BYTE_1, false, false,
        Opcode != AMDGPU::V_LSHRREV_B16_e32 &&
            Opcode != AMDGPU::V_LSHRREV_B16_opsel_e64 &&
            Opcode != AMDGPU::V_LSHRREV_B16_e64);
    break;
  }

  case AMDGPU::V_BFE_I32_e64:
  case AMDGPU::V_BFE_U32_e64: {
    // e.g.:
    // from: v_bfe_u32 v1, v0, 8, 8
    // to SDWA src:v0 src_sel:BYTE_1

    // offset | width | src_sel
    // ------------------------
    // 0      | 8     | BYTE_0
    // 0      | 16    | WORD_0
    // 0      | 32    | DWORD ?
    // 8      | 8     | BYTE_1
    // 16     | 8     | BYTE_2
    // 16     | 16    | WORD_1
    // 24     | 8     | BYTE_3

    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    auto Offset = foldToImm(*Src1);
    if (!Offset)
      break;

    MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
    auto Width = foldToImm(*Src2);
    if (!Width)
      break;

    SdwaSel SrcSel = DWORD;

    // Map the (offset, width) pair to a selection per the table above;
    // any other combination is not representable as an SDWA selection.
    if (*Offset == 0 && *Width == 8)
      SrcSel = BYTE_0;
    else if (*Offset == 0 && *Width == 16)
      SrcSel = WORD_0;
    else if (*Offset == 0 && *Width == 32)
      SrcSel = DWORD;
    else if (*Offset == 8 && *Width == 8)
      SrcSel = BYTE_1;
    else if (*Offset == 16 && *Width == 8)
      SrcSel = BYTE_2;
    else if (*Offset == 16 && *Width == 16)
      SrcSel = WORD_1;
    else if (*Offset == 24 && *Width == 8)
      SrcSel = BYTE_3;
    else
      break;

    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

    // Only virtual registers can be rewritten by this peephole.
    if (!Src0->isReg() || Src0->getReg().isPhysical() ||
        Dst->getReg().isPhysical())
      break;

    // The signed variant becomes a sign-extending selection.
    return std::make_unique<SDWASrcOperand>(
        Src0, Dst, SrcSel, false, false, Opcode != AMDGPU::V_BFE_U32_e64);
  }

  case AMDGPU::V_AND_B32_e32:
  case AMDGPU::V_AND_B32_e64: {
    // e.g.:
    // from: v_and_b32_e32 v1, 0x0000ffff/0x000000ff, v0
    // to SDWA src:v0 src_sel:WORD_0/BYTE_0

    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    // The mask may appear in either source slot; ValSrc is the other one.
    auto *ValSrc = Src1;
    auto Imm = foldToImm(*Src0);

    if (!Imm) {
      Imm = foldToImm(*Src1);
      ValSrc = Src0;
    }

    if (!Imm || (*Imm != 0x0000ffff && *Imm != 0x000000ff))
      break;

    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

    // Only virtual registers can be rewritten by this peephole.
    if (!ValSrc->isReg() || ValSrc->getReg().isPhysical() ||
        Dst->getReg().isPhysical())
      break;

    return std::make_unique<SDWASrcOperand>(
        ValSrc, Dst, *Imm == 0x0000ffff ? WORD_0 : BYTE_0);
  }

  case AMDGPU::V_OR_B32_e32:
  case AMDGPU::V_OR_B32_e64: {
    // Patterns for dst_unused:UNUSED_PRESERVE.
    // e.g., from:
    // v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD
    //                           src1_sel:WORD_1 src2_sel:WORD_1
    // v_add_f16_e32 v3, v1, v2
    // v_or_b32_e32 v4, v0, v3
    // to SDWA preserve dst:v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE preserve:v3

    // Check if one of operands of v_or_b32 is SDWA instruction
    using CheckRetType =
        std::optional<std::pair<MachineOperand *, MachineOperand *>>;
    // Returns the defs of (Op1, Op2) if Op1 is defined by an SDWA
    // instruction and Op2 has a single def; nullopt otherwise.
    auto CheckOROperandsForSDWA =
        [&](const MachineOperand *Op1, const MachineOperand *Op2) -> CheckRetType {
          if (!Op1 || !Op1->isReg() || !Op2 || !Op2->isReg())
            return CheckRetType(std::nullopt);

          MachineOperand *Op1Def = findSingleRegDef(Op1, MRI);
          if (!Op1Def)
            return CheckRetType(std::nullopt);

          MachineInstr *Op1Inst = Op1Def->getParent();
          if (!TII->isSDWA(*Op1Inst))
            return CheckRetType(std::nullopt);

          MachineOperand *Op2Def = findSingleRegDef(Op2, MRI);
          if (!Op2Def)
            return CheckRetType(std::nullopt);

          return CheckRetType(std::pair(Op1Def, Op2Def));
        };

    MachineOperand *OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    MachineOperand *OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    assert(OrSDWA && OrOther);
    auto Res = CheckOROperandsForSDWA(OrSDWA, OrOther);
    if (!Res) {
      // Retry with the operands swapped: the SDWA def may be on src1.
      OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
      OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
      assert(OrSDWA && OrOther);
      Res = CheckOROperandsForSDWA(OrSDWA, OrOther);
      if (!Res)
        break;
    }

    MachineOperand *OrSDWADef = Res->first;
    MachineOperand *OrOtherDef = Res->second;
    assert(OrSDWADef && OrOtherDef);

    MachineInstr *SDWAInst = OrSDWADef->getParent();
    MachineInstr *OtherInst = OrOtherDef->getParent();

    // Check that OtherInstr is actually bitwise compatible with SDWAInst = their
    // destination patterns don't overlap. Compatible instruction can be either
    // regular instruction with compatible bitness or SDWA instruction with
    // correct dst_sel
    // SDWAInst | OtherInst bitness / OtherInst dst_sel
    // -----------------------------------------------------
    // DWORD    | no                / no
    // WORD_0   | no                / BYTE_2/3, WORD_1
    // WORD_1   | 8/16-bit instructions / BYTE_0/1, WORD_0
    // BYTE_0   | no                / BYTE_1/2/3, WORD_1
    // BYTE_1   | 8-bit             / BYTE_0/2/3, WORD_1
    // BYTE_2   | 8/16-bit          / BYTE_0/1/3. WORD_0
    // BYTE_3   | 8/16/24-bit       / BYTE_0/1/2, WORD_0
    // E.g. if SDWAInst is v_add_f16_sdwa dst_sel:WORD_1 then v_add_f16 is OK
    // but v_add_f32 is not.

    // TODO: add support for non-SDWA instructions as OtherInst.
    // For now this only works with SDWA instructions. For regular instructions
    // there is no way to determine if the instruction writes only 8/16/24-bit
    // out of full register size and all registers are at min 32-bit wide.
    if (!TII->isSDWA(*OtherInst))
      break;

    SdwaSel DstSel = static_cast<SdwaSel>(
        TII->getNamedImmOperand(*SDWAInst, AMDGPU::OpName::dst_sel));
    SdwaSel OtherDstSel = static_cast<SdwaSel>(
        TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_sel));

    bool DstSelAgree = false;
    switch (DstSel) {
    case WORD_0: DstSelAgree = ((OtherDstSel == BYTE_2) ||
                                (OtherDstSel == BYTE_3) ||
                                (OtherDstSel == WORD_1));
      break;
    case WORD_1: DstSelAgree = ((OtherDstSel == BYTE_0) ||
                                (OtherDstSel == BYTE_1) ||
                                (OtherDstSel == WORD_0));
      break;
    case BYTE_0: DstSelAgree = ((OtherDstSel == BYTE_1) ||
                                (OtherDstSel == BYTE_2) ||
                                (OtherDstSel == BYTE_3) ||
                                (OtherDstSel == WORD_1));
      break;
    case BYTE_1: DstSelAgree = ((OtherDstSel == BYTE_0) ||
                                (OtherDstSel == BYTE_2) ||
                                (OtherDstSel == BYTE_3) ||
                                (OtherDstSel == WORD_1));
      break;
    case BYTE_2: DstSelAgree = ((OtherDstSel == BYTE_0) ||
                                (OtherDstSel == BYTE_1) ||
                                (OtherDstSel == BYTE_3) ||
                                (OtherDstSel == WORD_0));
      break;
    case BYTE_3: DstSelAgree = ((OtherDstSel == BYTE_0) ||
                                (OtherDstSel == BYTE_1) ||
                                (OtherDstSel == BYTE_2) ||
                                (OtherDstSel == WORD_0));
      break;
    default: DstSelAgree = false;
    }

    if (!DstSelAgree)
      break;

    // Also OtherInst dst_unused should be UNUSED_PAD
    DstUnused OtherDstUnused = static_cast<DstUnused>(
        TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_unused));
    if (OtherDstUnused != DstUnused::UNUSED_PAD)
      break;

    // Create DstPreserveOperand
    MachineOperand *OrDst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
    assert(OrDst && OrDst->isReg());

    return std::make_unique<SDWADstPreserveOperand>(
        OrDst, OrSDWADef, OrOtherDef, DstSel);

  }
  }

  return std::unique_ptr<SDWAOperand>(nullptr);
}
970
#if !defined(NDEBUG)
// Debug-only stream insertion for SDWAOperand: delegates to the operand's
// own print() so LLVM_DEBUG sites can use the usual "OS << Operand" form.
static raw_ostream& operator<<(raw_ostream &OS, const SDWAOperand &Operand) {
  Operand.print(OS);
  return OS;
}
#endif
977
978void SIPeepholeSDWA::matchSDWAOperands(MachineBasicBlock &MBB) {
979 for (MachineInstr &MI : MBB) {
980 if (auto Operand = matchSDWAOperand(MI)) {
981 LLVM_DEBUG(dbgs() << "Match: " << MI << "To: " << *Operand << '\n');
982 SDWAOperands[&MI] = std::move(Operand);
983 ++NumSDWAPatternsFound;
984 }
985 }
986}
987
988// Convert the V_ADD_CO_U32_e64 into V_ADD_CO_U32_e32. This allows
989// isConvertibleToSDWA to perform its transformation on V_ADD_CO_U32_e32 into
990// V_ADD_CO_U32_sdwa.
991//
992// We are transforming from a VOP3 into a VOP2 form of the instruction.
993// %19:vgpr_32 = V_AND_B32_e32 255,
994// killed %16:vgpr_32, implicit $exec
995// %47:vgpr_32, %49:sreg_64_xexec = V_ADD_CO_U32_e64
996// %26.sub0:vreg_64, %19:vgpr_32, implicit $exec
997// %48:vgpr_32, dead %50:sreg_64_xexec = V_ADDC_U32_e64
998// %26.sub1:vreg_64, %54:vgpr_32, killed %49:sreg_64_xexec, implicit $exec
999//
1000// becomes
1001// %47:vgpr_32 = V_ADD_CO_U32_sdwa
1002// 0, %26.sub0:vreg_64, 0, killed %16:vgpr_32, 0, 6, 0, 6, 0,
1003// implicit-def $vcc, implicit $exec
1004// %48:vgpr_32, dead %50:sreg_64_xexec = V_ADDC_U32_e64
1005// %26.sub1:vreg_64, %54:vgpr_32, killed $vcc, implicit $exec
1006void SIPeepholeSDWA::pseudoOpConvertToVOP2(MachineInstr &MI,
1007 const GCNSubtarget &ST) const {
1008 int Opc = MI.getOpcode();
1009 assert((Opc == AMDGPU::V_ADD_CO_U32_e64 || Opc == AMDGPU::V_SUB_CO_U32_e64) &&
1010 "Currently only handles V_ADD_CO_U32_e64 or V_SUB_CO_U32_e64");
1011
1012 // Can the candidate MI be shrunk?
1013 if (!TII->canShrink(MI, *MRI))
1014 return;
1016 // Find the related ADD instruction.
1017 const MachineOperand *Sdst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
1018 if (!Sdst)
1019 return;
1020 MachineOperand *NextOp = findSingleRegUse(Sdst, MRI);
1021 if (!NextOp)
1022 return;
1023 MachineInstr &MISucc = *NextOp->getParent();
1024
1025 // Make sure the carry in/out are subsequently unused.
1026 MachineOperand *CarryIn = TII->getNamedOperand(MISucc, AMDGPU::OpName::src2);
1027 if (!CarryIn)
1028 return;
1029 MachineOperand *CarryOut = TII->getNamedOperand(MISucc, AMDGPU::OpName::sdst);
1030 if (!CarryOut)
1031 return;
1032 if (!MRI->hasOneUse(CarryIn->getReg()) || !MRI->use_empty(CarryOut->getReg()))
1033 return;
1034 // Make sure VCC or its subregs are dead before MI.
1035 MachineBasicBlock &MBB = *MI.getParent();
1037 MBB.computeRegisterLiveness(TRI, AMDGPU::VCC, MI, 25);
1038 if (Liveness != MachineBasicBlock::LQR_Dead)
1039 return;
1040 // Check if VCC is referenced in range of (MI,MISucc].
1041 for (auto I = std::next(MI.getIterator()), E = MISucc.getIterator();
1042 I != E; ++I) {
1043 if (I->modifiesRegister(AMDGPU::VCC, TRI))
1044 return;
1045 }
1046
1047 // Replace MI with V_{SUB|ADD}_I32_e32
1048 BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(Opc))
1049 .add(*TII->getNamedOperand(MI, AMDGPU::OpName::vdst))
1050 .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0))
1051 .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src1))
1052 .setMIFlags(MI.getFlags());
1053
1054 MI.eraseFromParent();
1055
1056 // Since the carry output of MI is now VCC, update its use in MISucc.
1057
1058 MISucc.substituteRegister(CarryIn->getReg(), TRI->getVCC(), 0, *TRI);
1059}
1060
1061/// Try to convert an \p MI in VOP3 which takes an src2 carry-in
1062/// operand into the corresponding VOP2 form which expects the
1063/// argument in VCC. To this end, add an copy from the carry-in to
1064/// VCC. The conversion will only be applied if \p MI can be shrunk
1065/// to VOP2 and if VCC can be proven to be dead before \p MI.
1066void SIPeepholeSDWA::convertVcndmaskToVOP2(MachineInstr &MI,
1067 const GCNSubtarget &ST) const {
1068 assert(MI.getOpcode() == AMDGPU::V_CNDMASK_B32_e64);
1069
1070 LLVM_DEBUG(dbgs() << "Attempting VOP2 conversion: " << MI);
1071 if (!TII->canShrink(MI, *MRI)) {
1072 LLVM_DEBUG(dbgs() << "Cannot shrink instruction\n");
1073 return;
1074 }
1075
1076 const MachineOperand &CarryIn =
1077 *TII->getNamedOperand(MI, AMDGPU::OpName::src2);
1078 Register CarryReg = CarryIn.getReg();
1079 MachineInstr *CarryDef = MRI->getVRegDef(CarryReg);
1080 if (!CarryDef) {
1081 LLVM_DEBUG(dbgs() << "Missing carry-in operand definition\n");
1082 return;
1083 }
1084
1085 // Make sure VCC or its subregs are dead before MI.
1086 MCRegister Vcc = TRI->getVCC();
1087 MachineBasicBlock &MBB = *MI.getParent();
1090 if (Liveness != MachineBasicBlock::LQR_Dead) {
1091 LLVM_DEBUG(dbgs() << "VCC not known to be dead before instruction\n");
1092 return;
1093 }
1094
1095 BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY), Vcc).add(CarryIn);
1096
1097 auto Converted = BuildMI(MBB, MI, MI.getDebugLoc(),
1098 TII->get(AMDGPU::getVOPe32(MI.getOpcode())))
1099 .add(*TII->getNamedOperand(MI, AMDGPU::OpName::vdst))
1100 .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0))
1101 .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src1))
1102 .setMIFlags(MI.getFlags());
1103 TII->fixImplicitOperands(*Converted);
1104 LLVM_DEBUG(dbgs() << "Converted to VOP2: " << *Converted);
1105 (void)Converted;
1106 MI.eraseFromParent();
1107}
1108
1109namespace {
1110bool isConvertibleToSDWA(MachineInstr &MI,
1111 const GCNSubtarget &ST,
1112 const SIInstrInfo* TII) {
1113 // Check if this is already an SDWA instruction
1114 unsigned Opc = MI.getOpcode();
1115 if (TII->isSDWA(Opc))
1116 return true;
1117
1118 // Can only be handled after ealier conversion to
1119 // AMDGPU::V_CNDMASK_B32_e32 which is not always possible.
1120 if (Opc == AMDGPU::V_CNDMASK_B32_e64)
1121 return false;
1122
1123 // Check if this instruction has opcode that supports SDWA
1124 if (AMDGPU::getSDWAOp(Opc) == -1)
1126
1127 if (AMDGPU::getSDWAOp(Opc) == -1)
1128 return false;
1129
1130 if (!ST.hasSDWAOmod() && TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
1131 return false;
1132
1133 if (TII->isVOPC(Opc)) {
1134 if (!ST.hasSDWASdst()) {
1135 const MachineOperand *SDst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
1136 if (SDst && (SDst->getReg() != AMDGPU::VCC &&
1137 SDst->getReg() != AMDGPU::VCC_LO))
1138 return false;
1139 }
1140
1141 if (!ST.hasSDWAOutModsVOPC() &&
1142 (TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) ||
1143 TII->hasModifiersSet(MI, AMDGPU::OpName::omod)))
1144 return false;
1145
1146 } else if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst) ||
1147 !TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
1148 return false;
1149 }
1150
1151 if (!ST.hasSDWAMac() && (Opc == AMDGPU::V_FMAC_F16_e32 ||
1152 Opc == AMDGPU::V_FMAC_F32_e32 ||
1153 Opc == AMDGPU::V_MAC_F16_e32 ||
1154 Opc == AMDGPU::V_MAC_F32_e32))
1155 return false;
1156
1157 // Check if target supports this SDWA opcode
1158 if (TII->pseudoToMCOpcode(Opc) == -1)
1159 return false;
1160
1161 if (MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0)) {
1162 if (!Src0->isReg() && !Src0->isImm())
1163 return false;
1164 }
1165
1166 if (MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1)) {
1167 if (!Src1->isReg() && !Src1->isImm())
1168 return false;
1169 }
1170
1171 return true;
1172}
1173} // namespace
1174
// Build the SDWA counterpart of the non-SDWA instruction MI: select the
// matching _sdwa opcode, copy the original operands across, and initialize
// the SDWA-specific operands (dst_sel/dst_unused/src*_sel) to their neutral
// DWORD / UNUSED_PAD values. The new instruction is inserted before MI;
// MI itself is not modified or erased here.
MachineInstr *SIPeepholeSDWA::createSDWAVersion(MachineInstr &MI) {
  unsigned Opcode = MI.getOpcode();
  assert(!TII->isSDWA(Opcode));

  // If the opcode has no direct SDWA counterpart, fall back to the SDWA
  // form of its shrunk e32 variant.
  int SDWAOpcode = AMDGPU::getSDWAOp(Opcode);
  if (SDWAOpcode == -1)
    SDWAOpcode = AMDGPU::getSDWAOp(AMDGPU::getVOPe32(Opcode));
  assert(SDWAOpcode != -1);

  const MCInstrDesc &SDWADesc = TII->get(SDWAOpcode);

  // Create SDWA version of instruction MI and initialize its operands
  MachineInstrBuilder SDWAInst =
      BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), SDWADesc)
          .setMIFlags(MI.getFlags());

  // Copy dst, if it is present in original then should also be present in SDWA
  MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
  if (Dst) {
    assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::vdst));
    SDWAInst.add(*Dst);
  } else if ((Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst))) {
    assert(Dst && AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::sdst));
    SDWAInst.add(*Dst);
  } else {
    // Neither vdst nor sdst in the original: the SDWA form still defines
    // sdst, so define VCC explicitly.
    assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::sdst));
    SDWAInst.addReg(TRI->getVCC(), RegState::Define);
  }

  // Copy src0, initialize src0_modifiers. All sdwa instructions has src0 and
  // src0_modifiers (except for v_nop_sdwa, but it can't get here)
  MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  assert(Src0 && AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src0) &&
         AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src0_modifiers));
  if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers))
    SDWAInst.addImm(Mod->getImm());
  else
    SDWAInst.addImm(0);
  SDWAInst.add(*Src0);

  // Copy src1 if present, initialize src1_modifiers.
  MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
  if (Src1) {
    assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src1) &&
           AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src1_modifiers));
    if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers))
      SDWAInst.addImm(Mod->getImm());
    else
      SDWAInst.addImm(0);
    SDWAInst.add(*Src1);
  }

  if (SDWAOpcode == AMDGPU::V_FMAC_F16_sdwa ||
      SDWAOpcode == AMDGPU::V_FMAC_F32_sdwa ||
      SDWAOpcode == AMDGPU::V_MAC_F16_sdwa ||
      SDWAOpcode == AMDGPU::V_MAC_F32_sdwa) {
    // v_mac_f16/32 has additional src2 operand tied to vdst
    MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
    assert(Src2);
    SDWAInst.add(*Src2);
  }

  // Copy clamp if present, initialize otherwise
  assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::clamp));
  MachineOperand *Clamp = TII->getNamedOperand(MI, AMDGPU::OpName::clamp);
  if (Clamp) {
    SDWAInst.add(*Clamp);
  } else {
    SDWAInst.addImm(0);
  }

  // Copy omod if present, initialize otherwise if needed
  if (AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::omod)) {
    MachineOperand *OMod = TII->getNamedOperand(MI, AMDGPU::OpName::omod);
    if (OMod) {
      SDWAInst.add(*OMod);
    } else {
      SDWAInst.addImm(0);
    }
  }

  // Initialize SDWA specific operands
  if (AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::dst_sel))
    SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);

  if (AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::dst_unused))
    SDWAInst.addImm(AMDGPU::SDWA::DstUnused::UNUSED_PAD);

  assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src0_sel));
  SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);

  if (Src1) {
    assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src1_sel));
    SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
  }

  // Finalize: patch up implicit operands (e.g. vcc/exec) on the new
  // instruction before handing it back.
  MachineInstr *Ret = SDWAInst.getInstr();
  TII->fixImplicitOperands(*Ret);
  return Ret;
}
1276
1277bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
1278 const SDWAOperandsVector &SDWAOperands) {
1279 LLVM_DEBUG(dbgs() << "Convert instruction:" << MI);
1280
1281 MachineInstr *SDWAInst;
1282 if (TII->isSDWA(MI.getOpcode())) {
1283 // Clone the instruction to allow revoking changes
1284 // made to MI during the processing of the operands
1285 // if the conversion fails.
1286 SDWAInst = MI.getParent()->getParent()->CloneMachineInstr(&MI);
1287 MI.getParent()->insert(MI.getIterator(), SDWAInst);
1288 } else {
1289 SDWAInst = createSDWAVersion(MI);
1290 }
1291
1292 // Apply all sdwa operand patterns.
1293 bool Converted = false;
1294 for (auto &Operand : SDWAOperands) {
1295 LLVM_DEBUG(dbgs() << *SDWAInst << "\nOperand: " << *Operand);
1296 // There should be no intersection between SDWA operands and potential MIs
1297 // e.g.:
1298 // v_and_b32 v0, 0xff, v1 -> src:v1 sel:BYTE_0
1299 // v_and_b32 v2, 0xff, v0 -> src:v0 sel:BYTE_0
1300 // v_add_u32 v3, v4, v2
1301 //
1302 // In that example it is possible that we would fold 2nd instruction into
1303 // 3rd (v_add_u32_sdwa) and then try to fold 1st instruction into 2nd (that
1304 // was already destroyed). So if SDWAOperand is also a potential MI then do
1305 // not apply it.
1306 if (PotentialMatches.count(Operand->getParentInst()) == 0)
1307 Converted |= Operand->convertToSDWA(*SDWAInst, TII);
1308 }
1309
1310 if (!Converted) {
1311 SDWAInst->eraseFromParent();
1312 return false;
1313 }
1314
1315 ConvertedInstructions.push_back(SDWAInst);
1316 for (MachineOperand &MO : SDWAInst->uses()) {
1317 if (!MO.isReg())
1318 continue;
1319
1320 MRI->clearKillFlags(MO.getReg());
1321 }
1322 LLVM_DEBUG(dbgs() << "\nInto:" << *SDWAInst << '\n');
1323 ++NumSDWAInstructionsPeepholed;
1324
1325 MI.eraseFromParent();
1326 return true;
1327}
1328
1329// If an instruction was converted to SDWA it should not have immediates or SGPR
1330// operands (allowed one SGPR on GFX9). Copy its scalar operands into VGPRs.
1331void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI,
1332 const GCNSubtarget &ST) const {
1333 const MCInstrDesc &Desc = TII->get(MI.getOpcode());
1334 unsigned ConstantBusCount = 0;
1335 for (MachineOperand &Op : MI.explicit_uses()) {
1336 if (!Op.isImm() && !(Op.isReg() && !TRI->isVGPR(*MRI, Op.getReg())))
1337 continue;
1338
1339 unsigned I = Op.getOperandNo();
1340 if (Desc.operands()[I].RegClass == -1 ||
1341 !TRI->isVSSuperClass(TRI->getRegClass(Desc.operands()[I].RegClass)))
1342 continue;
1343
1344 if (ST.hasSDWAScalar() && ConstantBusCount == 0 && Op.isReg() &&
1345 TRI->isSGPRReg(*MRI, Op.getReg())) {
1346 ++ConstantBusCount;
1347 continue;
1348 }
1349
1350 Register VGPR = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1351 auto Copy = BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
1352 TII->get(AMDGPU::V_MOV_B32_e32), VGPR);
1353 if (Op.isImm())
1354 Copy.addImm(Op.getImm());
1355 else if (Op.isReg())
1356 Copy.addReg(Op.getReg(), Op.isKill() ? RegState::Kill : 0,
1357 Op.getSubReg());
1358 Op.ChangeToRegister(VGPR, false);
1359 }
1360}
1361
1362bool SIPeepholeSDWALegacy::runOnMachineFunction(MachineFunction &MF) {
1363 if (skipFunction(MF.getFunction()))
1364 return false;
1365
1366 return SIPeepholeSDWA().run(MF);
1367}
1368
1369bool SIPeepholeSDWA::run(MachineFunction &MF) {
1370 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1371
1372 if (!ST.hasSDWA())
1373 return false;
1374
1375 MRI = &MF.getRegInfo();
1376 TRI = ST.getRegisterInfo();
1377 TII = ST.getInstrInfo();
1378
1379 // Find all SDWA operands in MF.
1380 bool Ret = false;
1381 for (MachineBasicBlock &MBB : MF) {
1382 bool Changed = false;
1383 do {
1384 // Preprocess the ADD/SUB pairs so they could be SDWA'ed.
1385 // Look for a possible ADD or SUB that resulted from a previously lowered
1386 // V_{ADD|SUB}_U64_PSEUDO. The function pseudoOpConvertToVOP2
1387 // lowers the pair of instructions into e32 form.
1388 matchSDWAOperands(MBB);
1389 for (const auto &OperandPair : SDWAOperands) {
1390 const auto &Operand = OperandPair.second;
1391 MachineInstr *PotentialMI = Operand->potentialToConvert(TII, ST);
1392 if (!PotentialMI)
1393 continue;
1394
1395 switch (PotentialMI->getOpcode()) {
1396 case AMDGPU::V_ADD_CO_U32_e64:
1397 case AMDGPU::V_SUB_CO_U32_e64:
1398 pseudoOpConvertToVOP2(*PotentialMI, ST);
1399 break;
1400 case AMDGPU::V_CNDMASK_B32_e64:
1401 convertVcndmaskToVOP2(*PotentialMI, ST);
1402 break;
1403 };
1404 }
1405 SDWAOperands.clear();
1406
1407 // Generate potential match list.
1408 matchSDWAOperands(MBB);
1409
1410 for (const auto &OperandPair : SDWAOperands) {
1411 const auto &Operand = OperandPair.second;
1412 MachineInstr *PotentialMI =
1413 Operand->potentialToConvert(TII, ST, &PotentialMatches);
1414
1415 if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST, TII))
1416 PotentialMatches[PotentialMI].push_back(Operand.get());
1417 }
1418
1419 for (auto &PotentialPair : PotentialMatches) {
1420 MachineInstr &PotentialMI = *PotentialPair.first;
1421 convertToSDWA(PotentialMI, PotentialPair.second);
1422 }
1423
1424 PotentialMatches.clear();
1425 SDWAOperands.clear();
1426
1427 Changed = !ConvertedInstructions.empty();
1428
1429 if (Changed)
1430 Ret = true;
1431 while (!ConvertedInstructions.empty())
1432 legalizeScalarOperands(*ConvertedInstructions.pop_back_val(), ST);
1433 } while (Changed);
1434 }
1435
1436 return Ret;
1437}
1438
unsigned const MachineRegisterInfo * MRI
MachineInstrBuilder & UseMI
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Provides AMDGPU specific target descriptions.
MachineBasicBlock & MBB
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition Compiler.h:638
AMD GCN specific subclass of TargetSubtarget.
#define DEBUG_TYPE
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
#define I(x, y, z)
Definition MD5.cpp:58
Register Reg
Register const TargetRegisterInfo * TRI
This file implements a map that provides insertion order iteration.
Promote Memory to Register
Definition Mem2Reg.cpp:110
static unsigned getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
if(auto Err=PB.parsePassPipeline(MPM, Passes)) return wrap(std MPM run * Mod
#define INITIALIZE_PASS(passName, arg, name, cfg, analysis)
Definition PassSupport.h:56
static MachineOperand * findSingleRegDef(const MachineOperand *Reg, const MachineRegisterInfo *MRI)
static void copyRegOperand(MachineOperand &To, const MachineOperand &From)
static MachineOperand * findSingleRegUse(const MachineOperand *Reg, const MachineRegisterInfo *MRI)
static std::optional< SdwaSel > combineSdwaSel(SdwaSel Sel, SdwaSel OperandSel)
Combine an SDWA instruction's existing SDWA selection Sel with the SDWA selection OperandSel of its o...
static bool isSameReg(const MachineOperand &LHS, const MachineOperand &RHS)
static bool canCombineOpSel(const MachineInstr &MI, const SIInstrInfo *TII, AMDGPU::OpName SrcSelOpName, SdwaSel OpSel)
Verify that the SDWA selection operand SrcSelOpName of the SDWA instruction MI can be combined with t...
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:167
#define LLVM_DEBUG(...)
Definition Debug.h:114
Value * RHS
Value * LHS
LLVM_ABI void setPreservesCFG()
This function should be called by the pass, iff they do not:
Definition Pass.cpp:270
Represents analyses that only rely on functions' control flow.
Definition Analysis.h:73
FunctionPass class - This class is used to implement most global optimizations.
Definition Pass.h:314
bool hasOptNone() const
Do not optimize this function (-O0).
Definition Function.h:700
LLVM_ABI LivenessQueryResult computeRegisterLiveness(const TargetRegisterInfo *TRI, MCRegister Reg, const_iterator Before, unsigned Neighborhood=10) const
Return whether (physical) register Reg has been defined and not killed as of just before Before.
LivenessQueryResult
Possible outcome of a register liveness query to computeRegisterLiveness()
@ LQR_Dead
Register is known to be fully dead.
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & setMIFlags(unsigned Flags) const
MachineInstr * getInstr() const
If conversion operators fail, use this method to get the MachineInstr explicitly.
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
LLVM_ABI void substituteRegister(Register FromReg, Register ToReg, unsigned SubIdx, const TargetRegisterInfo &RegInfo)
Replace all occurrences of FromReg with ToReg:SubIdx, properly composing subreg indices where necessa...
mop_range uses()
Returns all operands which may be register uses.
LLVM_ABI void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
const MachineOperand & getOperand(unsigned i) const
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setIsDead(bool Val=true)
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
void setIsKill(bool Val=true)
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:36
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
Definition Analysis.h:151
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:78
PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
self_iterator getIterator()
Definition ilist_node.h:134
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
Changed
LLVM_READONLY int getVOPe32(uint16_t Opcode)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READONLY int getSDWAOp(uint16_t Opcode)
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ Define
Register definition.
@ Kill
The last use of a register.
NodeAddr< DefNode * > Def
Definition RDFGraph.h:384
This is an optimization pass for GlobalISel generic memory operations.
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
@ Offset
Definition DWP.cpp:477
Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST=nullptr, unsigned DynamicVGPRBlockSize=0)
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
AnalysisManager< MachineFunction > MachineFunctionAnalysisManager
Op::Description Desc
FunctionPass * createSIPeepholeSDWALegacyPass()
LLVM_ABI PreservedAnalyses getMachineFunctionPassPreservedAnalyses()
Returns the minimum set of Analyses that all machine function passes must preserve.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
DWARFExpression::Operation Op
raw_ostream & operator<<(raw_ostream &OS, const APFixedPoint &FX)
char & SIPeepholeSDWALegacyID