#define DEBUG_TYPE "si-peephole-sdwa"

STATISTIC(NumSDWAPatternsFound, "Number of SDWA patterns found.");
STATISTIC(NumSDWAInstructionsPeepholed,
          "Number of instructions converted to SDWA.");
  SDWAOperandsMap PotentialMatches;

  bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);
};
class SIPeepholeSDWALegacy : public MachineFunctionPass {
public:
  static char ID;

  SIPeepholeSDWALegacy() : MachineFunctionPass(ID) {}

  StringRef getPassName() const override { return "SI Peephole SDWA"; }

  bool runOnMachineFunction(MachineFunction &MF) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};
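// Helper class hierarchy for the matched conversion candidates. An SDWAOperand
// records which machine operand a converted instruction would use (Target) and
// which operand of the matched pattern it replaces (Replaced).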
class SDWAOperand {
private:
  MachineOperand *Target;   // Operand that would be used in converted instruction.
  MachineOperand *Replaced; // Operand that would be replaced by Target.

  /// Returns true iff the SDWA selection of this operand can be combined with
  /// the SDWA selections already present on \p MI.
  virtual bool canCombineSelections(const MachineInstr &MI,
                                    const SIInstrInfo *TII) = 0;
public:
  SDWAOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp)
      : Target(TargetOp), Replaced(ReplacedOp) {
    assert(Target->isReg());
    assert(Replaced->isReg());
  }

  virtual ~SDWAOperand() = default;
  virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII,
                                           const GCNSubtarget &ST,
                                           SDWAOperandsMap *PotentialMatches = nullptr) = 0;
  virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) = 0;
  MachineOperand *getTargetOperand() const { return Target; }
  MachineOperand *getReplacedOperand() const { return Replaced; }
  MachineInstr *getParentInst() const { return Target->getParent(); }

  MachineRegisterInfo *getMRI() const {
    return &getParentInst()->getParent()->getParent()->getRegInfo();
  }
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  virtual void print(raw_ostream &OS) const = 0;
  void dump() const { print(dbgs()); }
#endif
};
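// A matched source: the pattern extracts a byte or word from a register, which
// an SDWA instruction can read directly via src_sel, optionally applying
// abs/neg (float) or sext (integer) source modifiers.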
class SDWASrcOperand : public SDWAOperand {
private:
  SdwaSel SrcSel;
  bool Abs;
  bool Neg;
  bool Sext;

public:
  SDWASrcOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                 SdwaSel SrcSel_ = DWORD, bool Abs_ = false, bool Neg_ = false,
                 bool Sext_ = false)
      : SDWAOperand(TargetOp, ReplacedOp), SrcSel(SrcSel_), Abs(Abs_),
        Neg(Neg_), Sext(Sext_) {}
  MachineInstr *potentialToConvert(const SIInstrInfo *TII,
                                   const GCNSubtarget &ST,
                                   SDWAOperandsMap *PotentialMatches = nullptr) override;
  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
  bool canCombineSelections(const MachineInstr &MI,
                            const SIInstrInfo *TII) override;
  SdwaSel getSrcSel() const { return SrcSel; }
  bool getAbs() const { return Abs; }
  bool getNeg() const { return Neg; }
  bool getSext() const { return Sext; }

  uint64_t getSrcMods(const SIInstrInfo *TII,
                      const MachineOperand *SrcOp) const;
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream &OS) const override;
#endif
};
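// A matched destination: only part of the written register is meaningful, so
// an SDWA instruction can write it via dst_sel and pad or preserve the
// remaining bits according to dst_unused.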
class SDWADstOperand : public SDWAOperand {
private:
  SdwaSel DstSel;
  DstUnused DstUn;

public:
  SDWADstOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                 SdwaSel DstSel_ = DWORD, DstUnused DstUn_ = UNUSED_PAD)
      : SDWAOperand(TargetOp, ReplacedOp), DstSel(DstSel_), DstUn(DstUn_) {}
  MachineInstr *potentialToConvert(const SIInstrInfo *TII,
                                   const GCNSubtarget &ST,
                                   SDWAOperandsMap *PotentialMatches = nullptr) override;
  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
  bool canCombineSelections(const MachineInstr &MI,
                            const SIInstrInfo *TII) override;
  SdwaSel getDstSel() const { return DstSel; }
  DstUnused getDstUnused() const { return DstUn; }
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream &OS) const override;
#endif
};
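// Destination variant for the UNUSED_PRESERVE pattern matched from v_or_b32:
// the bits not written through dst_sel must keep the value of the Preserve
// operand, which convertToSDWA ties to the result as an implicit use.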
class SDWADstPreserveOperand : public SDWADstOperand {
private:
  MachineOperand *Preserve;

public:
  SDWADstPreserveOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                         MachineOperand *PreserveOp, SdwaSel DstSel_ = DWORD)
      : SDWADstOperand(TargetOp, ReplacedOp, DstSel_, UNUSED_PRESERVE),
        Preserve(PreserveOp) {}
  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
  bool canCombineSelections(const MachineInstr &MI,
                            const SIInstrInfo *TII) override;

  MachineOperand *getPreservedOperand() const { return Preserve; }
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream &OS) const override;
#endif
};
char SIPeepholeSDWALegacy::ID = 0;

FunctionPass *llvm::createSIPeepholeSDWALegacyPass() {
  return new SIPeepholeSDWALegacy();
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
static raw_ostream &operator<<(raw_ostream &OS, SdwaSel Sel) {
  switch (Sel) {
  case BYTE_0: OS << "BYTE_0"; break;
  case BYTE_1: OS << "BYTE_1"; break;
  case BYTE_2: OS << "BYTE_2"; break;
  case BYTE_3: OS << "BYTE_3"; break;
  case WORD_0: OS << "WORD_0"; break;
  case WORD_1: OS << "WORD_1"; break;
  case DWORD:  OS << "DWORD";  break;
  }
  return OS;
}
LLVM_DUMP_METHOD
void SDWASrcOperand::print(raw_ostream &OS) const {
  OS << "SDWA src: " << *getTargetOperand()
     << " src_sel:" << getSrcSel()
     << " abs:" << getAbs() << " neg:" << getNeg()
     << " sext:" << getSext() << '\n';
}
LLVM_DUMP_METHOD
void SDWADstOperand::print(raw_ostream &OS) const {
  OS << "SDWA dst: " << *getTargetOperand()
     << " dst_sel:" << getDstSel()
     << " dst_unused:" << getDstUnused() << '\n';
}
LLVM_DUMP_METHOD
void SDWADstPreserveOperand::print(raw_ostream &OS) const {
  OS << "SDWA preserve dst: " << *getTargetOperand()
     << " dst_sel:" << getDstSel()
     << " preserve:" << *getPreservedOperand() << '\n';
}
#endif
static bool isSameReg(const MachineOperand &LHS, const MachineOperand &RHS) {
  return LHS.isReg() && RHS.isReg() && LHS.getReg() == RHS.getReg() &&
         LHS.getSubReg() == RHS.getSubReg();
}
static MachineOperand *findSingleRegUse(const MachineOperand *Reg,
                                        const MachineRegisterInfo *MRI) {
  if (!Reg->isReg() || !Reg->isDef())
    return nullptr;

  return MRI->getOneNonDBGUse(Reg->getReg());
}
static MachineOperand *findSingleRegDef(const MachineOperand *Reg,
                                        const MachineRegisterInfo *MRI) {
  if (!Reg->isReg())
    return nullptr;

  return MRI->getOneDef(Reg->getReg());
}
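// combineSdwaSel composes two selections: Sel, applied by an instruction to
// its operand, and OperandSel, which that operand itself applies to its source
// register. For example, taking BYTE_1 of a WORD_1 operand is BYTE_3 of the
// underlying register, while combinations with no single-selection equivalent
// (e.g. BYTE_2 of a WORD_1 operand) yield std::nullopt.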
static std::optional<SdwaSel> combineSdwaSel(SdwaSel Sel, SdwaSel OperandSel) {
  if (Sel == SdwaSel::DWORD)
    return OperandSel;

  if (Sel == OperandSel || OperandSel == SdwaSel::DWORD)
    return Sel;

  if (Sel == SdwaSel::WORD_1 || Sel == SdwaSel::BYTE_2 ||
      Sel == SdwaSel::BYTE_3)
    return {};

  if (OperandSel == SdwaSel::WORD_0)
    return Sel;

  if (OperandSel == SdwaSel::WORD_1) {
    if (Sel == SdwaSel::BYTE_0)
      return SdwaSel::BYTE_2;
    if (Sel == SdwaSel::BYTE_1)
      return SdwaSel::BYTE_3;
    if (Sel == SdwaSel::WORD_0)
      return SdwaSel::WORD_1;
  }

  return {};
}
uint64_t SDWASrcOperand::getSrcMods(const SIInstrInfo *TII,
                                    const MachineOperand *SrcOp) const {
  uint64_t Mods = 0;
  const auto *MI = SrcOp->getParent();
  if (TII->getNamedOperand(*MI, AMDGPU::OpName::src0) == SrcOp) {
    if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src0_modifiers)) {
      Mods = Mod->getImm();
    }
  } else if (TII->getNamedOperand(*MI, AMDGPU::OpName::src1) == SrcOp) {
    if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src1_modifiers)) {
      Mods = Mod->getImm();
    }
  }
  if (Abs || Neg) {
    assert(!Sext &&
           "Float and integer src modifiers can't be set simultaneously");
    Mods |= Abs ? SISrcMods::ABS : 0u;
    Mods ^= Neg ? SISrcMods::NEG : 0u;
  } else if (Sext) {
    Mods |= SISrcMods::SEXT;
  }

  return Mods;
}
MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII,
                                                 const GCNSubtarget &ST,
                                                 SDWAOperandsMap *PotentialMatches) {
  if (PotentialMatches != nullptr) {
    // Fill out the map for all uses if all of them can be converted.
    MachineOperand *Reg = getReplacedOperand();
    if (!Reg->isReg() || !Reg->isDef())
      return nullptr;

    for (MachineInstr &UseMI : getMRI()->use_nodbg_instructions(Reg->getReg()))
      // Check that all instructions that use Reg can be converted.
      if (!isConvertibleToSDWA(UseMI, ST, TII) ||
          !canCombineSelections(UseMI, TII))
        return nullptr;

    // Now that it is guaranteed all uses are legal, iterate over the uses
    // again to add them for later conversion.
    for (MachineOperand &UseMO : getMRI()->use_nodbg_operands(Reg->getReg())) {
      // Should not get a subregister here.
      assert(isSameReg(UseMO, *Reg));

      SDWAOperandsMap &potentialMatchesMap = *PotentialMatches;
      MachineInstr *UseMI = UseMO.getParent();
      potentialMatchesMap[UseMI].push_back(this);
    }
    return nullptr;
  }

  // For an SDWA src operand the potential instruction is one that uses the
  // register defined by the parent instruction.
  MachineOperand *PotentialMO = findSingleRegUse(getReplacedOperand(), getMRI());
  if (!PotentialMO)
    return nullptr;

  MachineInstr *Parent = PotentialMO->getParent();
  return canCombineSelections(*Parent, TII) ? Parent : nullptr;
}
bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
  switch (MI.getOpcode()) {
  case AMDGPU::V_CVT_F32_FP8_sdwa:
  case AMDGPU::V_CVT_F32_BF8_sdwa:
  case AMDGPU::V_CVT_PK_F32_FP8_sdwa:
  case AMDGPU::V_CVT_PK_F32_BF8_sdwa:
    // Does not support input modifiers: noabs, noneg, nosext.
    return false;
  case AMDGPU::V_CNDMASK_B32_sdwa:
    // SISrcMods uses the same bitmask for SEXT and NEG modifiers, so sext
    // cannot be applied here.
    if (Sext)
      return false;
    break;
  default:
    break;
  }

  // Find the operand in MI that matches the source operand and replace it with
  // the target operand. Set the corresponding src_sel and src_modifiers.
  bool IsPreserveSrc = false;
  MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  MachineOperand *SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel);
  MachineOperand *SrcMods =
      TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
  assert(Src && (Src->isReg() || Src->isImm()));
  if (!isSameReg(*Src, *getReplacedOperand())) {
    // If this is not src0 then it could be src1.
    Src = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel);
    SrcMods = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);

    if (!Src || !isSameReg(*Src, *getReplacedOperand())) {
      // It is possible this Src is a tied operand of an instruction that
      // preserves part of its destination (dst_unused:UNUSED_PRESERVE).
      MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
      MachineOperand *DstUnused =
          TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
      if (Dst &&
          DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) {
        // This works if the tied src reads WORD_0 while dst writes WORD_1.
        auto DstSel = static_cast<AMDGPU::SDWA::SdwaSel>(
            TII->getNamedImmOperand(MI, AMDGPU::OpName::dst_sel));
        if (DstSel == AMDGPU::SDWA::SdwaSel::WORD_1 &&
            getSrcSel() == AMDGPU::SDWA::SdwaSel::WORD_0) {
          IsPreserveSrc = true;
          auto DstIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                                   AMDGPU::OpName::vdst);
          auto TiedIdx = MI.findTiedOperandIdx(DstIdx);
          Src = &MI.getOperand(TiedIdx);
          SrcSel = nullptr;
          SrcMods = nullptr;
        } else {
          // Not legal to convert this instruction with this SDWA operand.
          return false;
        }
      } else {
        return false;
      }
    }
  }

  assert(Src && Src->isReg());

  if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa ||
       MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa ||
       MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
       MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
      !isSameReg(*Src, *getReplacedOperand())) {
    // In case of v_mac_f16/32_sdwa this pass can try to apply the src operand
    // to src2, which is not allowed.
    return false;
  }

  assert(isSameReg(*Src, *getReplacedOperand()) &&
         (IsPreserveSrc || (SrcSel && SrcMods)));

  copyRegOperand(*Src, *getTargetOperand());
  if (!IsPreserveSrc) {
    SdwaSel ExistingSel = static_cast<SdwaSel>(SrcSel->getImm());
    SrcSel->setImm(*combineSdwaSel(ExistingSel, getSrcSel()));
    SrcMods->setImm(getSrcMods(TII, Src));
  }
  getTargetOperand()->setIsKill(false);
  return true;
}
/// Verify that the SDWA selection operand \p SrcSelOpName of the SDWA
/// instruction \p MI can be combined with the selection \p OpSel.
static bool canCombineOpSel(const MachineInstr &MI, const SIInstrInfo *TII,
                            AMDGPU::OpName SrcSelOpName, SdwaSel OpSel) {
  assert(TII->isSDWA(MI.getOpcode()));

  SdwaSel SrcSel =
      static_cast<SdwaSel>(TII->getNamedImmOperand(MI, SrcSelOpName));
  return combineSdwaSel(SrcSel, OpSel).has_value();
}

/// Verify that \p OpSel can be combined with the SDWA selection of the source
/// operand \p SrcOpName of \p MI, if that operand reads \p SrcOp.
static bool canCombineOpSel(const MachineInstr &MI, const SIInstrInfo *TII,
                            AMDGPU::OpName SrcOpName,
                            const MachineOperand *SrcOp, SdwaSel OpSel) {
  const MachineOperand *Op = TII->getNamedOperand(MI, SrcOpName);
  if (!Op || !isSameReg(*Op, *SrcOp))
    return true;

  AMDGPU::OpName SrcSelOpName = SrcOpName == AMDGPU::OpName::src0
                                    ? AMDGPU::OpName::src0_sel
                                    : AMDGPU::OpName::src1_sel;
  return canCombineOpSel(MI, TII, SrcSelOpName, OpSel);
}

bool SDWASrcOperand::canCombineSelections(const MachineInstr &MI,
                                          const SIInstrInfo *TII) {
  if (!TII->isSDWA(MI.getOpcode()))
    return true;

  using namespace AMDGPU;

  return canCombineOpSel(MI, TII, OpName::src0,
                         getReplacedOperand(), getSrcSel()) &&
         canCombineOpSel(MI, TII, OpName::src1,
                         getReplacedOperand(), getSrcSel());
}
MachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII,
                                                 const GCNSubtarget &ST,
                                                 SDWAOperandsMap *PotentialMatches) {
  // For an SDWA dst operand the potential instruction is the one that defines
  // the register that this operand uses.
  MachineRegisterInfo *MRI = getMRI();
  MachineInstr *ParentMI = getParentInst();

  MachineOperand *PotentialMO = findSingleRegDef(getReplacedOperand(), MRI);
  if (!PotentialMO)
    return nullptr;

  // Check that ParentMI is the only instruction that uses the replaced
  // register.
  for (MachineInstr &UseInst : MRI->use_nodbg_instructions(PotentialMO->getReg())) {
    if (&UseInst != ParentMI)
      return nullptr;
  }

  MachineInstr *Parent = PotentialMO->getParent();
  return canCombineSelections(*Parent, TII) ? Parent : nullptr;
}
bool SDWADstOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
  // Replace the vdst operand in MI with the target operand. Set dst_sel and
  // dst_unused.

  if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa ||
       MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa ||
       MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
       MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
      getDstSel() != AMDGPU::SDWA::DWORD) {
    // v_mac_f16/32_sdwa allow dst_sel to be equal only to DWORD.
    return false;
  }

  MachineOperand *Operand = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
  assert(Operand && Operand->isReg() &&
         isSameReg(*Operand, *getReplacedOperand()));
  copyRegOperand(*Operand, *getTargetOperand());

  MachineOperand *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel);
  assert(DstSel);
  SdwaSel ExistingSel = static_cast<SdwaSel>(DstSel->getImm());
  DstSel->setImm(*combineSdwaSel(ExistingSel, getDstSel()));

  MachineOperand *DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
  assert(DstUnused);
  DstUnused->setImm(getDstUnused());

  // Remove the original instruction because it would conflict with our new
  // instruction by register definition.
  getParentInst()->eraseFromParent();
  return true;
}
bool SDWADstOperand::canCombineSelections(const MachineInstr &MI,
                                          const SIInstrInfo *TII) {
  if (!TII->isSDWA(MI.getOpcode()))
    return true;

  return canCombineOpSel(MI, TII, AMDGPU::OpName::dst_sel, getDstSel());
}
bool SDWADstPreserveOperand::convertToSDWA(MachineInstr &MI,
                                           const SIInstrInfo *TII) {
  // MI should be moved right before v_or_b32. For this we should clear all
  // kill flags on uses of MI src-operands or else we can encounter a problem
  // with the use of a killed operand.
  for (MachineOperand &MO : MI.uses()) {
    if (!MO.isReg())
      continue;
    getMRI()->clearKillFlags(MO.getReg());
  }

  // Move MI before v_or_b32.
  MI.getParent()->remove(&MI);
  getParentInst()->getParent()->insert(getParentInst(), &MI);

  // Add an implicit use of the preserved register.
  MachineInstrBuilder MIB(*MI.getMF(), MI);
  MIB.addReg(getPreservedOperand()->getReg(),
             RegState::ImplicitKill,
             getPreservedOperand()->getSubReg());

  // Tie dst to the implicit read of the preserved register.
  MI.tieOperands(AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst),
                 MI.getNumOperands() - 1);

  // Convert MI as any other SDWADstOperand and remove v_or_b32.
  return SDWADstOperand::convertToSDWA(MI, TII);
}
bool SDWADstPreserveOperand::canCombineSelections(const MachineInstr &MI,
                                                  const SIInstrInfo *TII) {
  return SDWADstOperand::canCombineSelections(MI, TII);
}
std::optional<int64_t>
SIPeepholeSDWA::foldToImm(const MachineOperand &Op) const {
  if (Op.isImm())
    return Op.getImm();

  // If this is not an immediate then it can be a copy of an immediate value,
  // e.g.:
  // %1 = S_MOV_B32 255;
  if (Op.isReg()) {
    for (const MachineOperand &Def : MRI->def_operands(Op.getReg())) {
      if (!isSameReg(Op, Def))
        continue;

      const MachineInstr *DefInst = Def.getParent();
      if (!TII->isFoldableCopy(*DefInst))
        return std::nullopt;

      const MachineOperand &Copied = DefInst->getOperand(1);
      if (!Copied.isImm())
        return std::nullopt;

      return Copied.getImm();
    }
  }

  return std::nullopt;
}
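// matchSDWAOperand recognizes scalar idioms that correspond to SDWA byte/word
// selections: 32-bit and 16-bit shifts by 8/16/24, v_bfe_u32/i32 with byte- or
// word-aligned offset and width, masking with 0xff/0xffff, and v_or_b32 of two
// non-overlapping SDWA results. For example:
//   v_lshrrev_b32_e32 v1, 16, v0   ==>   read of v0 with src_sel:WORD_1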
std::unique_ptr<SDWAOperand>
SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) {
  unsigned Opcode = MI.getOpcode();
  switch (Opcode) {
  case AMDGPU::V_LSHRREV_B32_e32:
  case AMDGPU::V_ASHRREV_I32_e32:
  case AMDGPU::V_LSHLREV_B32_e32:
  case AMDGPU::V_LSHRREV_B32_e64:
  case AMDGPU::V_ASHRREV_I32_e64:
  case AMDGPU::V_LSHLREV_B32_e64: {
    // from: v_lshrrev_b32_e32 v1, 16/24, v0
    // to SDWA src:v0 src_sel:WORD_1/BYTE_3

    // from: v_ashrrev_i32_e32 v1, 16/24, v0
    // to SDWA src:v0 src_sel:WORD_1/BYTE_3 sext:1

    // from: v_lshlrev_b32_e32 v1, 16/24, v0
    // to SDWA dst:v1 dst_sel:WORD_1/BYTE_3 dst_unused:UNUSED_PAD
    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    auto Imm = foldToImm(*Src0);
    if (!Imm)
      break;

    if (*Imm != 16 && *Imm != 24)
      break;

    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
    if (!Src1->isReg() || Src1->getReg().isPhysical() ||
        Dst->getReg().isPhysical())
      break;

    if (Opcode == AMDGPU::V_LSHLREV_B32_e32 ||
        Opcode == AMDGPU::V_LSHLREV_B32_e64) {
      return std::make_unique<SDWADstOperand>(
          Dst, Src1, *Imm == 16 ? WORD_1 : BYTE_3, UNUSED_PAD);
    }
    return std::make_unique<SDWASrcOperand>(
        Src1, Dst, *Imm == 16 ? WORD_1 : BYTE_3, false, false,
        Opcode != AMDGPU::V_LSHRREV_B32_e32 &&
        Opcode != AMDGPU::V_LSHRREV_B32_e64);
  }
  case AMDGPU::V_LSHRREV_B16_e32:
  case AMDGPU::V_ASHRREV_I16_e32:
  case AMDGPU::V_LSHLREV_B16_e32:
  case AMDGPU::V_LSHRREV_B16_e64:
  case AMDGPU::V_LSHRREV_B16_opsel_e64:
  case AMDGPU::V_ASHRREV_I16_e64:
  case AMDGPU::V_LSHLREV_B16_opsel_e64:
  case AMDGPU::V_LSHLREV_B16_e64: {
    // from: v_lshrrev_b16_e32 v1, 8, v0
    // to SDWA src:v0 src_sel:BYTE_1

    // from: v_ashrrev_i16_e32 v1, 8, v0
    // to SDWA src:v0 src_sel:BYTE_1 sext:1

    // from: v_lshlrev_b16_e32 v1, 8, v0
    // to SDWA dst:v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD
    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    auto Imm = foldToImm(*Src0);
    if (!Imm || *Imm != 8)
      break;

    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

    if (!Src1->isReg() || Src1->getReg().isPhysical() ||
        Dst->getReg().isPhysical())
      break;

    if (Opcode == AMDGPU::V_LSHLREV_B16_e32 ||
        Opcode == AMDGPU::V_LSHLREV_B16_opsel_e64 ||
        Opcode == AMDGPU::V_LSHLREV_B16_e64)
      return std::make_unique<SDWADstOperand>(Dst, Src1, BYTE_1, UNUSED_PAD);
    return std::make_unique<SDWASrcOperand>(
        Src1, Dst, BYTE_1, false, false,
        Opcode != AMDGPU::V_LSHRREV_B16_e32 &&
        Opcode != AMDGPU::V_LSHRREV_B16_opsel_e64 &&
        Opcode != AMDGPU::V_LSHRREV_B16_e64);
  }
  case AMDGPU::V_BFE_I32_e64:
  case AMDGPU::V_BFE_U32_e64: {
    // e.g.:
    // from: v_bfe_u32 v1, v0, 8, 8
    // to SDWA src:v0 src_sel:BYTE_1

    // offset | width | src_sel
    // ------------------------
    // 0      | 8     | BYTE_0
    // 0      | 16    | WORD_0
    // 0      | 32    | DWORD
    // 8      | 8     | BYTE_1
    // 16     | 8     | BYTE_2
    // 16     | 16    | WORD_1
    // 24     | 8     | BYTE_3

    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    auto Offset = foldToImm(*Src1);
    if (!Offset)
      break;

    MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
    auto Width = foldToImm(*Src2);
    if (!Width)
      break;

    SdwaSel SrcSel = DWORD;
    if (*Offset == 0 && *Width == 8)
      SrcSel = BYTE_0;
    else if (*Offset == 0 && *Width == 16)
      SrcSel = WORD_0;
    else if (*Offset == 0 && *Width == 32)
      SrcSel = DWORD;
    else if (*Offset == 8 && *Width == 8)
      SrcSel = BYTE_1;
    else if (*Offset == 16 && *Width == 8)
      SrcSel = BYTE_2;
    else if (*Offset == 16 && *Width == 16)
      SrcSel = WORD_1;
    else if (*Offset == 24 && *Width == 8)
      SrcSel = BYTE_3;
    else
      break;

    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

    if (!Src0->isReg() || Src0->getReg().isPhysical() ||
        Dst->getReg().isPhysical())
      break;

    return std::make_unique<SDWASrcOperand>(
        Src0, Dst, SrcSel, false, false, Opcode != AMDGPU::V_BFE_U32_e64);
  }
  case AMDGPU::V_AND_B32_e32:
  case AMDGPU::V_AND_B32_e64: {
    // e.g.:
    // from: v_and_b32_e32 v1, 0x0000ffff/0x000000ff, v0
    // to SDWA src:v0 src_sel:WORD_0/BYTE_0

    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    MachineOperand *ValSrc = Src1;
    auto Imm = foldToImm(*Src0);

    if (!Imm) {
      Imm = foldToImm(*Src1);
      ValSrc = Src0;
    }

    if (!Imm || (*Imm != 0x0000ffff && *Imm != 0x000000ff))
      break;

    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

    if (!ValSrc->isReg() || ValSrc->getReg().isPhysical() ||
        Dst->getReg().isPhysical())
      break;

    return std::make_unique<SDWASrcOperand>(
        ValSrc, Dst, *Imm == 0x0000ffff ? WORD_0 : BYTE_0);
  }
  case AMDGPU::V_OR_B32_e32:
  case AMDGPU::V_OR_B32_e64: {
    // Patterns for dst_unused:UNUSED_PRESERVE, e.g.:
    // from: v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD
    //       v_add_f16_e32 v3, v1, v2
    //       v_or_b32_e32 v4, v0, v3
    // to SDWA preserve dst:v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE
    //    preserve:v3

    // Check if one of the operands of v_or_b32 is an SDWA instruction.
    using CheckRetType =
        std::optional<std::pair<MachineOperand *, MachineOperand *>>;
    auto CheckOROperandsForSDWA =
        [&](const MachineOperand *Op1,
            const MachineOperand *Op2) -> CheckRetType {
      if (!Op1 || !Op1->isReg() || !Op2 || !Op2->isReg())
        return CheckRetType(std::nullopt);

      MachineOperand *Op1Def = findSingleRegDef(Op1, getMRI());
      if (!Op1Def)
        return CheckRetType(std::nullopt);

      MachineInstr *Op1Inst = Op1Def->getParent();
      if (!TII->isSDWA(*Op1Inst))
        return CheckRetType(std::nullopt);

      MachineOperand *Op2Def = findSingleRegDef(Op2, getMRI());
      if (!Op2Def)
        return CheckRetType(std::nullopt);

      return CheckRetType(std::pair(Op1Def, Op2Def));
    };

    MachineOperand *OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    MachineOperand *OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    assert(OrSDWA && OrOther);
    auto Res = CheckOROperandsForSDWA(OrSDWA, OrOther);
    if (!Res) {
      OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
      OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
      assert(OrSDWA && OrOther);
      Res = CheckOROperandsForSDWA(OrSDWA, OrOther);
      if (!Res)
        break;
    }

    MachineOperand *OrSDWADef = Res->first;
    MachineOperand *OrOtherDef = Res->second;
    assert(OrSDWADef && OrOtherDef);

    MachineInstr *SDWAInst = OrSDWADef->getParent();
    MachineInstr *OtherInst = OrOtherDef->getParent();

    // Check that OtherInst writes exactly the part of the register that
    // SDWAInst leaves unused: their dst_sel selections must not overlap.
    if (!TII->isSDWA(*OtherInst))
      break;

    SdwaSel DstSel = static_cast<SdwaSel>(
        TII->getNamedImmOperand(*SDWAInst, AMDGPU::OpName::dst_sel));
    SdwaSel OtherDstSel = static_cast<SdwaSel>(
        TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_sel));

    bool DstSelAgree = false;
    switch (DstSel) {
    case WORD_0:
      DstSelAgree = ((OtherDstSel == BYTE_2) ||
                     (OtherDstSel == BYTE_3) ||
                     (OtherDstSel == WORD_1));
      break;
    case WORD_1:
      DstSelAgree = ((OtherDstSel == BYTE_0) ||
                     (OtherDstSel == BYTE_1) ||
                     (OtherDstSel == WORD_0));
      break;
    case BYTE_0:
      DstSelAgree = ((OtherDstSel == BYTE_1) ||
                     (OtherDstSel == BYTE_2) ||
                     (OtherDstSel == BYTE_3) ||
                     (OtherDstSel == WORD_1));
      break;
    case BYTE_1:
      DstSelAgree = ((OtherDstSel == BYTE_0) ||
                     (OtherDstSel == BYTE_2) ||
                     (OtherDstSel == BYTE_3) ||
                     (OtherDstSel == WORD_1));
      break;
    case BYTE_2:
      DstSelAgree = ((OtherDstSel == BYTE_0) ||
                     (OtherDstSel == BYTE_1) ||
                     (OtherDstSel == BYTE_3) ||
                     (OtherDstSel == WORD_0));
      break;
    case BYTE_3:
      DstSelAgree = ((OtherDstSel == BYTE_0) ||
                     (OtherDstSel == BYTE_1) ||
                     (OtherDstSel == BYTE_2) ||
                     (OtherDstSel == WORD_0));
      break;
    default: DstSelAgree = false;
    }

    if (!DstSelAgree)
      break;

    // Also OtherInst dst_unused should be UNUSED_PAD.
    DstUnused OtherDstUnused = static_cast<DstUnused>(
        TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_unused));
    if (OtherDstUnused != DstUnused::UNUSED_PAD)
      break;

    // Create DstPreserveOperand.
    MachineOperand *OrDst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
    assert(OrDst && OrDst->isReg());

    return std::make_unique<SDWADstPreserveOperand>(
        OrDst, OrSDWADef, OrOtherDef, DstSel);
  }
  }

  return std::unique_ptr<SDWAOperand>(nullptr);
}
void SIPeepholeSDWA::matchSDWAOperands(MachineBasicBlock &MBB) {
  for (MachineInstr &MI : MBB) {
    if (auto Operand = matchSDWAOperand(MI)) {
      LLVM_DEBUG(dbgs() << "Match: " << MI << "To: " << *Operand << '\n');
      SDWAOperands[&MI] = std::move(Operand);
      ++NumSDWAPatternsFound;
    }
  }
}
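// V_ADD_CO_U32_e64 / V_SUB_CO_U32_e64 write their carry-out to an arbitrary
// SGPR pair and therefore have no SDWA form. When the carry is consumed only
// by the matching ADDC/SUBB and VCC is free, the pair can be shrunk to the
// VOP2 forms that use VCC implicitly, which do support SDWA.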
void SIPeepholeSDWA::pseudoOpConvertToVOP2(MachineInstr &MI,
                                           const GCNSubtarget &ST) const {
  int Opc = MI.getOpcode();
  assert((Opc == AMDGPU::V_ADD_CO_U32_e64 || Opc == AMDGPU::V_SUB_CO_U32_e64) &&
         "Currently only handles V_ADD_CO_U32_e64 or V_SUB_CO_U32_e64");

  // Can the candidate MI be shrunk?
  if (!TII->canShrink(MI, *MRI))
    return;

  Opc = AMDGPU::getVOPe32(Opc);

  // Find the related ADD instruction.
  const MachineOperand *Sdst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
  if (!Sdst)
    return;
  MachineOperand *NextOp = findSingleRegUse(Sdst, MRI);
  if (!NextOp)
    return;
  MachineInstr &MISucc = *NextOp->getParent();

  // Make sure the carry in/out are subsequently unused.
  MachineOperand *CarryIn = TII->getNamedOperand(MISucc, AMDGPU::OpName::src2);
  if (!CarryIn)
    return;
  MachineOperand *CarryOut = TII->getNamedOperand(MISucc, AMDGPU::OpName::sdst);
  if (!CarryOut)
    return;
  if (!MRI->hasOneUse(CarryIn->getReg()) || !MRI->use_empty(CarryOut->getReg()))
    return;

  // Make sure VCC or its subregs are dead before MI.
  MachineBasicBlock &MBB = *MI.getParent();
  MachineBasicBlock::LivenessQueryResult Liveness =
      MBB.computeRegisterLiveness(TRI, AMDGPU::VCC, MI, 25);
  if (Liveness != MachineBasicBlock::LQR_Dead)
    return;

  // Check if VCC is referenced in the range of (MI, MISucc].
  for (auto I = std::next(MI.getIterator()), E = MISucc.getIterator();
       I != E; ++I) {
    if (I->modifiesRegister(AMDGPU::VCC, TRI))
      return;
  }

  // Replace MI with V_{SUB|ADD}_I32_e32.
  BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(Opc))
      .add(*TII->getNamedOperand(MI, AMDGPU::OpName::vdst))
      .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0))
      .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src1))
      .setMIFlags(MI.getFlags());

  MI.eraseFromParent();

  // Since the carry output of MI is now VCC, update its use in MISucc.
  MISucc.substituteRegister(CarryIn->getReg(), TRI->getVCC(), 0, *TRI);
}
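// V_CNDMASK_B32_e64 takes its mask in an arbitrary SGPR pair; only the VOP2
// form V_CNDMASK_B32_e32, which reads the mask from VCC, has an SDWA encoding.
// Rewrite the mask to VCC first when that is provably safe.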
void SIPeepholeSDWA::convertVcndmaskToVOP2(MachineInstr &MI,
                                           const GCNSubtarget &ST) const {
  assert(MI.getOpcode() == AMDGPU::V_CNDMASK_B32_e64);

  LLVM_DEBUG(dbgs() << "Attempting VOP2 conversion: " << MI);

  const MachineOperand &CarryIn =
      *TII->getNamedOperand(MI, AMDGPU::OpName::src2);
  Register CarryReg = CarryIn.getReg();
  MachineInstr *CarryDef = MRI->getVRegDef(CarryReg);
  if (!CarryDef) {
    LLVM_DEBUG(dbgs() << "Missing carry-in definition\n");
    return;
  }

  // Make sure VCC or its subregs are dead before MI.
  MCRegister Vcc = TRI->getVCC();
  MachineBasicBlock &MBB = *MI.getParent();
  MachineBasicBlock::LivenessQueryResult Liveness =
      MBB.computeRegisterLiveness(TRI, Vcc, MI);
  if (Liveness != MachineBasicBlock::LQR_Dead) {
    LLVM_DEBUG(dbgs() << "VCC not known to be dead before instruction\n");
    return;
  }

  // Rewrite the carry-in definition to write VCC directly.
  CarryDef->substituteRegister(CarryReg, Vcc, 0, *TRI);

  MachineInstr *Converted =
      BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(AMDGPU::V_CNDMASK_B32_e32))
          .add(*TII->getNamedOperand(MI, AMDGPU::OpName::vdst))
          .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0))
          .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src1))
          .setMIFlags(MI.getFlags())
          .getInstr();
  TII->fixImplicitOperands(*Converted);
  LLVM_DEBUG(dbgs() << "Converted to VOP2: " << *Converted);

  MI.eraseFromParent();
}
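// An instruction qualifies for SDWA conversion when an SDWA counterpart of
// its opcode exists and the subtarget-specific restrictions checked below
// (omod, sdst, v_mac, operand kinds) are satisfied.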
bool isConvertibleToSDWA(MachineInstr &MI,
                         const GCNSubtarget &ST,
                         const SIInstrInfo *TII) {
  // Check if this is already an SDWA instruction.
  unsigned Opc = MI.getOpcode();
  if (TII->isSDWA(Opc))
    return true;

  // Can only be handled after earlier conversion to V_CNDMASK_B32_e32, which
  // is not always possible.
  if (Opc == AMDGPU::V_CNDMASK_B32_e64)
    return false;

  // Check if this instruction has an opcode that supports SDWA.
  if (AMDGPU::getSDWAOp(Opc) == -1)
    Opc = AMDGPU::getVOPe32(Opc);

  if (AMDGPU::getSDWAOp(Opc) == -1)
    return false;

  if (!ST.hasSDWAOmod() && TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
    return false;

  if (TII->isVOPC(Opc)) {
    if (!ST.hasSDWASdst()) {
      const MachineOperand *SDst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
      if (SDst && (SDst->getReg() != AMDGPU::VCC &&
                   SDst->getReg() != AMDGPU::VCC_LO))
        return false;
    }

    if (!ST.hasSDWAOutModsVOPC() &&
        (TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) ||
         TII->hasModifiersSet(MI, AMDGPU::OpName::omod)))
      return false;

  } else if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst) ||
             !TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
    return false;
  }

  if (!ST.hasSDWAMac() && (Opc == AMDGPU::V_FMAC_F16_e32 ||
                           Opc == AMDGPU::V_FMAC_F32_e32 ||
                           Opc == AMDGPU::V_MAC_F16_e32 ||
                           Opc == AMDGPU::V_MAC_F32_e32))
    return false;

  // Check if the target supports this SDWA opcode.
  if (TII->pseudoToMCOpcode(Opc) == -1)
    return false;

  if (MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0)) {
    if (!Src0->isReg() && !Src0->isImm())
      return false;
  }

  if (MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1)) {
    if (!Src1->isReg() && !Src1->isImm())
      return false;
  }

  return true;
}
MachineInstr *SIPeepholeSDWA::createSDWAVersion(MachineInstr &MI) {
  unsigned Opcode = MI.getOpcode();
  assert(!TII->isSDWA(Opcode));

  int SDWAOpcode = AMDGPU::getSDWAOp(Opcode);
  if (SDWAOpcode == -1)
    SDWAOpcode = AMDGPU::getSDWAOp(AMDGPU::getVOPe32(Opcode));
  assert(SDWAOpcode != -1);

  const MCInstrDesc &SDWADesc = TII->get(SDWAOpcode);

  // Create the SDWA version of instruction MI and initialize its operands.
  MachineInstrBuilder SDWAInst =
      BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), SDWADesc)
          .setMIFlags(MI.getFlags());

  // Copy dst; if it is present in the original then it should also be present
  // in the SDWA version.
  MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
  if (Dst) {
    assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::vdst));
    SDWAInst.add(*Dst);
  } else if ((Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst))) {
    assert(Dst && AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::sdst));
    SDWAInst.add(*Dst);
  } else {
    assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::sdst));
    SDWAInst.addReg(TRI->getVCC(), RegState::Define);
  }

  // Copy src0 and initialize src0_modifiers; all SDWA instructions have them.
  MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  assert(Src0 && AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src0) &&
         AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src0_modifiers));
  if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers))
    SDWAInst.addImm(Mod->getImm());
  else
    SDWAInst.addImm(0);
  SDWAInst.add(*Src0);

  // Copy src1 if present, initialize src1_modifiers.
  MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
  if (Src1) {
    assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src1) &&
           AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src1_modifiers));
    if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers))
      SDWAInst.addImm(Mod->getImm());
    else
      SDWAInst.addImm(0);
    SDWAInst.add(*Src1);
  }

  if (SDWAOpcode == AMDGPU::V_FMAC_F16_sdwa ||
      SDWAOpcode == AMDGPU::V_FMAC_F32_sdwa ||
      SDWAOpcode == AMDGPU::V_MAC_F16_sdwa ||
      SDWAOpcode == AMDGPU::V_MAC_F32_sdwa) {
    // v_mac_f16/32 has an additional src2 operand tied to vdst.
    MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
    assert(Src2);
    SDWAInst.add(*Src2);
  }

  // Copy clamp if present, initialize otherwise.
  MachineOperand *Clamp = TII->getNamedOperand(MI, AMDGPU::OpName::clamp);
  if (Clamp)
    SDWAInst.add(*Clamp);
  else
    SDWAInst.addImm(0);

  // Copy omod if present, initialize otherwise if needed.
  MachineOperand *OMod = TII->getNamedOperand(MI, AMDGPU::OpName::omod);
  if (OMod)
    SDWAInst.add(*OMod);
  else if (AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::omod))
    SDWAInst.addImm(0);

  // Initialize SDWA-specific operands with default values.
  if (AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::dst_sel))
    SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);

  if (AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::dst_unused))
    SDWAInst.addImm(AMDGPU::SDWA::DstUnused::UNUSED_PAD);

  // Initialize src0_sel and, if present, src1_sel.
  SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
  if (Src1)
    SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);

  MachineInstr *Ret = SDWAInst.getInstr();
  TII->fixImplicitOperands(*Ret);
  return Ret;
}
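// Apply all matched operands to the new (or cloned) SDWA instruction; the
// conversion is rolled back if no operand could be applied.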
bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
                                   const SDWAOperandsVector &SDWAOperands) {
  LLVM_DEBUG(dbgs() << "Convert instruction:" << MI);

  MachineInstr *SDWAInst;
  if (TII->isSDWA(MI.getOpcode())) {
    // Clone the instruction to allow revoking changes made to MI during the
    // processing of the operands if the conversion fails.
    SDWAInst = MI.getParent()->getParent()->CloneMachineInstr(&MI);
    MI.getParent()->insert(MI.getIterator(), SDWAInst);
  } else {
    SDWAInst = createSDWAVersion(MI);
  }

  // Apply all SDWA operand patterns.
  bool Converted = false;
  for (auto &Operand : SDWAOperands) {
    LLVM_DEBUG(dbgs() << *SDWAInst << "Operand: " << *Operand);
    // There should be no intersection between SDWA operands and potential MIs:
    // an operand whose parent instruction is itself a conversion candidate may
    // already have been destroyed, so do not apply it.
    if (PotentialMatches.count(Operand->getParentInst()) == 0)
      Converted |= Operand->convertToSDWA(*SDWAInst, TII);
  }

  if (!Converted) {
    SDWAInst->eraseFromParent();
    return false;
  }

  ConvertedInstructions.push_back(SDWAInst);
  for (MachineOperand &MO : SDWAInst->uses()) {
    if (!MO.isReg())
      continue;
    MRI->clearKillFlags(MO.getReg());
  }
  ++NumSDWAInstructionsPeepholed;

  MI.eraseFromParent();
  return true;
}
// If an instruction was converted to SDWA it should not have immediates or
// SGPR operands (allowed one SGPR on GFX9). Copy its scalar operands into
// VGPRs.
void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI,
                                            const GCNSubtarget &ST) const {
  const MCInstrDesc &Desc = TII->get(MI.getOpcode());
  unsigned ConstantBusCount = 0;
  for (MachineOperand &Op : MI.explicit_uses()) {
    if (!Op.isImm() && !(Op.isReg() && !TRI->isVGPR(*MRI, Op.getReg())))
      continue;

    unsigned I = Op.getOperandNo();
    if (Desc.operands()[I].RegClass == -1 ||
        !TRI->isVSSuperClass(TRI->getRegClass(Desc.operands()[I].RegClass)))
      continue;

    if (ST.hasSDWAScalar() && ConstantBusCount == 0 && Op.isReg() &&
        TRI->isSGPRReg(*MRI, Op.getReg())) {
      ++ConstantBusCount;
      continue;
    }

    Register VGPR = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    auto Copy = BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
                        TII->get(AMDGPU::V_MOV_B32_e32), VGPR);
    if (Op.isImm())
      Copy.addImm(Op.getImm());
    else if (Op.isReg())
      Copy.addReg(Op.getReg(), Op.isKill() ? RegState::Kill : 0,
                  Op.getSubReg());
    Op.ChangeToRegister(VGPR, false);
  }
}
bool SIPeepholeSDWALegacy::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  return SIPeepholeSDWA().run(MF);
}
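// Driver: per block, first rewrite carry pseudos and v_cndmask to their VOP2
// forms, then re-match SDWA operands, collect convertible instructions in
// PotentialMatches, and convert them, repeating until no further change.
// Scalar operands of converted instructions are legalized at the end.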
bool SIPeepholeSDWA::run(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();

  if (!ST.hasSDWA())
    return false;

  MRI = &MF.getRegInfo();
  TRI = ST.getRegisterInfo();
  TII = ST.getInstrInfo();

  // Find all SDWA operands in MF.
  bool Ret = false;
  for (MachineBasicBlock &MBB : MF) {
    bool Changed = false;
    do {
      // Preprocess the ADD/SUB pairs so they could be SDWA'ed: look for an ADD
      // or SUB that resulted from a previously lowered V_{ADD|SUB}_U64_PSEUDO
      // and shrink the pair into e32 form. V_CNDMASK_B32_e64 is likewise
      // rewritten to its VOP2 form first.
      matchSDWAOperands(MBB);
      for (const auto &OperandPair : SDWAOperands) {
        const auto &Operand = OperandPair.second;
        MachineInstr *PotentialMI = Operand->potentialToConvert(TII, ST);
        if (!PotentialMI)
          continue;

        switch (PotentialMI->getOpcode()) {
        case AMDGPU::V_ADD_CO_U32_e64:
        case AMDGPU::V_SUB_CO_U32_e64:
          pseudoOpConvertToVOP2(*PotentialMI, ST);
          break;
        case AMDGPU::V_CNDMASK_B32_e64:
          convertVcndmaskToVOP2(*PotentialMI, ST);
          break;
        }
      }
      SDWAOperands.clear();

      // Generate the potential match list.
      matchSDWAOperands(MBB);

      for (const auto &OperandPair : SDWAOperands) {
        const auto &Operand = OperandPair.second;
        MachineInstr *PotentialMI =
            Operand->potentialToConvert(TII, ST, &PotentialMatches);

        if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST, TII))
          PotentialMatches[PotentialMI].push_back(Operand.get());
      }

      Changed = false;
      for (auto &PotentialPair : PotentialMatches) {
        MachineInstr &PotentialMI = *PotentialPair.first;
        Changed |= convertToSDWA(PotentialMI, PotentialPair.second);
      }

      PotentialMatches.clear();
      SDWAOperands.clear();

      Ret |= Changed;
    } while (Changed);
  }

  while (!ConvertedInstructions.empty())
    legalizeScalarOperands(*ConvertedInstructions.pop_back_val(), ST);

  return Ret;
}