1//===-- RISCVTargetTransformInfo.cpp - RISC-V specific TTI ----------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
11#include "llvm/ADT/STLExtras.h"
19#include <cmath>
20#include <optional>
21using namespace llvm;
22using namespace llvm::PatternMatch;
23
24#define DEBUG_TYPE "riscvtti"
25
27 "riscv-v-register-bit-width-lmul",
29 "The LMUL to use for getRegisterBitWidth queries. Affects LMUL used "
30 "by autovectorized code. Fractional LMULs are not supported."),
32
34 "riscv-v-slp-max-vf",
36 "Overrides result used for getMaximumVF query which is used "
37 "exclusively by SLP vectorizer."),
39
41 RVVMinTripCount("riscv-v-min-trip-count",
42 cl::desc("Set the lower bound of a trip count to decide on "
43 "vectorization while tail-folding."),
45
47RISCVTTIImpl::getRISCVInstructionCost(ArrayRef<unsigned> OpCodes, MVT VT,
49 // Check if the type is valid for all CostKind
50 if (!VT.isVector())
52 size_t NumInstr = OpCodes.size();
54 return NumInstr;
55 InstructionCost LMULCost = TLI->getLMULCost(VT);
57 return LMULCost * NumInstr;
58 InstructionCost Cost = 0;
59 for (auto Op : OpCodes) {
60 switch (Op) {
61 case RISCV::VRGATHER_VI:
62 Cost += TLI->getVRGatherVICost(VT);
63 break;
64 case RISCV::VRGATHER_VV:
65 Cost += TLI->getVRGatherVVCost(VT);
66 break;
67 case RISCV::VSLIDEUP_VI:
68 case RISCV::VSLIDEDOWN_VI:
69 Cost += TLI->getVSlideVICost(VT);
70 break;
71 case RISCV::VSLIDEUP_VX:
72 case RISCV::VSLIDEDOWN_VX:
73 Cost += TLI->getVSlideVXCost(VT);
74 break;
75 case RISCV::VREDMAX_VS:
76 case RISCV::VREDMIN_VS:
77 case RISCV::VREDMAXU_VS:
78 case RISCV::VREDMINU_VS:
79 case RISCV::VREDSUM_VS:
80 case RISCV::VREDAND_VS:
81 case RISCV::VREDOR_VS:
82 case RISCV::VREDXOR_VS:
83 case RISCV::VFREDMAX_VS:
84 case RISCV::VFREDMIN_VS:
85 case RISCV::VFREDUSUM_VS: {
86 unsigned VL = VT.getVectorMinNumElements();
87 if (!VT.isFixedLengthVector())
88 VL *= *getVScaleForTuning();
89 Cost += Log2_32_Ceil(VL);
90 break;
91 }
92 case RISCV::VFREDOSUM_VS: {
93 unsigned VL = VT.getVectorMinNumElements();
94 if (!VT.isFixedLengthVector())
95 VL *= *getVScaleForTuning();
96 Cost += VL;
97 break;
98 }
99 case RISCV::VMV_X_S:
100 case RISCV::VMV_S_X:
101 case RISCV::VFMV_F_S:
102 case RISCV::VFMV_S_F:
103 case RISCV::VMOR_MM:
104 case RISCV::VMXOR_MM:
105 case RISCV::VMAND_MM:
106 case RISCV::VMANDN_MM:
107 case RISCV::VMNAND_MM:
108 case RISCV::VCPOP_M:
109 case RISCV::VFIRST_M:
110 Cost += 1;
111 break;
112 default:
113 Cost += LMULCost;
114 }
115 }
116 return Cost;
117}
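// Worked example (illustrative only, assuming getVScaleForTuning() == 2):
// for RISCV::VREDSUM_VS on nxv4i32, VL = 4 * 2 = 8, so the tree-reduction
// model above adds Log2_32_Ceil(8) = 3; the ordered RISCV::VFREDOSUM_VS on
// the same type instead adds VL = 8, reflecting its serial dependency chain.
// Any opcode not listed falls through to the generic LMUL-based cost.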
118
120 const RISCVSubtarget *ST,
121 const APInt &Imm, Type *Ty,
123 bool FreeZeroes) {
124 assert(Ty->isIntegerTy() &&
125 "getIntImmCost can only estimate cost of materialising integers");
126
127 // We have a Zero register, so 0 is always free.
128 if (Imm == 0)
129 return TTI::TCC_Free;
130
131 // Otherwise, we check how many instructions it will take to materialise.
132 return RISCVMatInt::getIntMatCost(Imm, DL.getTypeSizeInBits(Ty), *ST,
133 /*CompressionCost=*/false, FreeZeroes);
134}
135
139 return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind, false);
140}
141
142// Look for patterns of shift followed by AND that can be turned into a pair of
143// shifts. We won't need to materialize an immediate for the AND so these can
144// be considered free.
145static bool canUseShiftPair(Instruction *Inst, const APInt &Imm) {
146 uint64_t Mask = Imm.getZExtValue();
147 auto *BO = dyn_cast<BinaryOperator>(Inst->getOperand(0));
148 if (!BO || !BO->hasOneUse())
149 return false;
150
151 if (BO->getOpcode() != Instruction::Shl)
152 return false;
153
154 if (!isa<ConstantInt>(BO->getOperand(1)))
155 return false;
156
157 unsigned ShAmt = cast<ConstantInt>(BO->getOperand(1))->getZExtValue();
158 // (and (shl x, c2), c1) will be matched to (srli (slli x, c2+c3), c3) if c1
159 // is a mask shifted by c2 bits with c3 leading zeros.
160 if (isShiftedMask_64(Mask)) {
161 unsigned Trailing = llvm::countr_zero(Mask);
162 if (ShAmt == Trailing)
163 return true;
164 }
165
166 return false;
167}
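// Worked example (illustrative): for (and (shl x, 4), 0xFF0), Mask = 0xFF0 is
// a shifted mask with countr_zero(Mask) == 4 == ShAmt, so this returns true.
// Codegen can then use (srli (slli x, 56), 52) instead of materialising 0xFF0.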
168
170 const APInt &Imm, Type *Ty,
172 Instruction *Inst) const {
173 assert(Ty->isIntegerTy() &&
174 "getIntImmCost can only estimate cost of materialising integers");
175
176 // We have a Zero register, so 0 is always free.
177 if (Imm == 0)
178 return TTI::TCC_Free;
179
180 // Some instructions in RISC-V can take a 12-bit immediate. Some of these are
181 // commutative, in others the immediate comes from a specific argument index.
182 bool Takes12BitImm = false;
183 unsigned ImmArgIdx = ~0U;
184
185 switch (Opcode) {
186 case Instruction::GetElementPtr:
187 // Never hoist any arguments to a GetElementPtr. CodeGenPrepare will
188 // split up large offsets in GEP into better parts than ConstantHoisting
189 // can.
190 return TTI::TCC_Free;
191 case Instruction::Store: {
192 // Use the materialization cost regardless of whether it's the address or
193 // the value that is constant, except when the store is misaligned and
194 // misaligned accesses are not legal (experience shows constant hoisting
195 // can sometimes be harmful in such cases).
196 if (Idx == 1 || !Inst)
197 return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind,
198 /*FreeZeroes=*/true);
199
200 StoreInst *ST = cast<StoreInst>(Inst);
201 if (!getTLI()->allowsMemoryAccessForAlignment(
202 Ty->getContext(), DL, getTLI()->getValueType(DL, Ty),
203 ST->getPointerAddressSpace(), ST->getAlign()))
204 return TTI::TCC_Free;
205
206 return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind,
207 /*FreeZeroes=*/true);
208 }
209 case Instruction::Load:
210 // If the address is a constant, use the materialization cost.
211 return getIntImmCost(Imm, Ty, CostKind);
212 case Instruction::And:
213 // zext.h
214 if (Imm == UINT64_C(0xffff) && ST->hasStdExtZbb())
215 return TTI::TCC_Free;
216 // zext.w
217 if (Imm == UINT64_C(0xffffffff) &&
218 ((ST->hasStdExtZba() && ST->isRV64()) || ST->isRV32()))
219 return TTI::TCC_Free;
220 // bclri
221 if (ST->hasStdExtZbs() && (~Imm).isPowerOf2())
222 return TTI::TCC_Free;
223 if (Inst && Idx == 1 && Imm.getBitWidth() <= ST->getXLen() &&
224 canUseShiftPair(Inst, Imm))
225 return TTI::TCC_Free;
226 Takes12BitImm = true;
227 break;
228 case Instruction::Add:
229 Takes12BitImm = true;
230 break;
231 case Instruction::Or:
232 case Instruction::Xor:
233 // bseti/binvi
234 if (ST->hasStdExtZbs() && Imm.isPowerOf2())
235 return TTI::TCC_Free;
236 Takes12BitImm = true;
237 break;
238 case Instruction::Mul:
239 // Power of 2 is a shift. Negated power of 2 is a shift and a negate.
240 if (Imm.isPowerOf2() || Imm.isNegatedPowerOf2())
241 return TTI::TCC_Free;
242 // One more or less than a power of 2 can use SLLI+ADD/SUB.
243 if ((Imm + 1).isPowerOf2() || (Imm - 1).isPowerOf2())
244 return TTI::TCC_Free;
245 // FIXME: There is no MULI instruction.
246 Takes12BitImm = true;
247 break;
248 case Instruction::Sub:
249 case Instruction::Shl:
250 case Instruction::LShr:
251 case Instruction::AShr:
252 Takes12BitImm = true;
253 ImmArgIdx = 1;
254 break;
255 default:
256 break;
257 }
258
259 if (Takes12BitImm) {
260 // Check immediate is the correct argument...
261 if (Instruction::isCommutative(Opcode) || Idx == ImmArgIdx) {
262 // ... and fits into the 12-bit immediate.
263 if (Imm.getSignificantBits() <= 64 &&
264 getTLI()->isLegalAddImmediate(Imm.getSExtValue())) {
265 return TTI::TCC_Free;
266 }
267 }
268
269 // Otherwise, use the full materialisation cost.
270 return getIntImmCost(Imm, Ty, CostKind);
271 }
272
273 // By default, prevent hoisting.
274 return TTI::TCC_Free;
275}
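// Illustrative examples of the rules above (not exhaustive): with Zbb,
// 'and x, 0xffff' maps to zext.h and the immediate is free; with Zbs,
// 'or x, 0x100' maps to bseti and is free; 'add x, 100' fits addi's 12-bit
// signed immediate and is free; 'add x, 4096' does not, so it is costed via
// full materialisation and becomes a constant-hoisting candidate.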
276
279 const APInt &Imm, Type *Ty,
281 // Prevent hoisting in unknown cases.
282 return TTI::TCC_Free;
283}
284
286 return ST->hasVInstructions();
287}
288
290RISCVTTIImpl::getPopcntSupport(unsigned TyWidth) const {
291 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
292 return ST->hasCPOPLike() ? TTI::PSK_FastHardware : TTI::PSK_Software;
293}
294
296 unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType,
298 TTI::PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp,
300
301 // zve32x is broken for partial_reduce_umla, but let's make sure we
302 // don't generate them.
303 if (!ST->hasStdExtZvqdotq() || ST->getELen() < 64 ||
304 Opcode != Instruction::Add || !BinOp || *BinOp != Instruction::Mul ||
305 InputTypeA != InputTypeB || !InputTypeA->isIntegerTy(8) ||
306 !AccumType->isIntegerTy(32) || !VF.isKnownMultipleOf(4))
308
309 Type *Tp = VectorType::get(AccumType, VF.divideCoefficientBy(4));
310 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
311 // Note: Assuming all vqdot* variants have equal cost
312 return LT.first *
313 getRISCVInstructionCost(RISCV::VQDOT_VV, LT.second, CostKind);
314}
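// Worked example (illustrative): for a fixed VF of 16 with i8 inputs and an
// i32 accumulator, Tp is <4 x i32>; assuming that type legalizes with
// LT.first == 1, the returned cost is simply the cost of one vqdot.vv at the
// legalized type.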
315
317 // Currently, the ExpandReductions pass can't expand scalable-vector
318 // reductions, but we still request expansion as RVV doesn't support certain
319 // reductions and the SelectionDAG can't legalize them either.
320 switch (II->getIntrinsicID()) {
321 default:
322 return false;
323 // These reductions have no equivalent in RVV
324 case Intrinsic::vector_reduce_mul:
325 case Intrinsic::vector_reduce_fmul:
326 return true;
327 }
328}
329
330std::optional<unsigned> RISCVTTIImpl::getMaxVScale() const {
331 if (ST->hasVInstructions())
332 return ST->getRealMaxVLen() / RISCV::RVVBitsPerBlock;
333 return BaseT::getMaxVScale();
334}
335
336std::optional<unsigned> RISCVTTIImpl::getVScaleForTuning() const {
337 if (ST->hasVInstructions())
338 if (unsigned MinVLen = ST->getRealMinVLen();
339 MinVLen >= RISCV::RVVBitsPerBlock)
340 return MinVLen / RISCV::RVVBitsPerBlock;
341 return BaseT::getVScaleForTuning();
342}
343
346 unsigned LMUL =
347 llvm::bit_floor(std::clamp<unsigned>(RVVRegisterWidthLMUL, 1, 8));
348 switch (K) {
350 return TypeSize::getFixed(ST->getXLen());
352 return TypeSize::getFixed(
353 ST->useRVVForFixedLengthVectors() ? LMUL * ST->getRealMinVLen() : 0);
356 (ST->hasVInstructions() &&
357 ST->getRealMinVLen() >= RISCV::RVVBitsPerBlock)
359 : 0);
360 }
361
362 llvm_unreachable("Unsupported register kind");
363}
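// Worked example (illustrative, assuming riscv-v-register-bit-width-lmul=2
// and a real minimum VLEN of 128): fixed-width vector queries report
// 2 * 128 = 256 bits when RVV is used for fixed-length vectors, while the
// scalable-vector answer is expressed in vscale units of RVVBitsPerBlock.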
364
366RISCVTTIImpl::getConstantPoolLoadCost(Type *Ty,
368 // Add a cost of address generation + the cost of the load. The address
369 // is expected to be a PC relative offset to a constant pool entry
370 // using auipc/addi.
371 return 2 + getMemoryOpCost(Instruction::Load, Ty, DL.getABITypeAlign(Ty),
372 /*AddressSpace=*/0, CostKind);
373}
374
375static bool isRepeatedConcatMask(ArrayRef<int> Mask, int &SubVectorSize) {
376 unsigned Size = Mask.size();
377 if (!isPowerOf2_32(Size))
378 return false;
379 for (unsigned I = 0; I != Size; ++I) {
380 if (static_cast<unsigned>(Mask[I]) == I)
381 continue;
382 if (Mask[I] != 0)
383 return false;
384 if (Size % I != 0)
385 return false;
386 for (unsigned J = I + 1; J != Size; ++J)
387 // Check the pattern is repeated.
388 if (static_cast<unsigned>(Mask[J]) != J % I)
389 return false;
390 SubVectorSize = I;
391 return true;
392 }
393 // Reaching here means Mask is the identity mask (e.g. <0, 1, 2, 3>),
394 // which is not a concatenation.
394 return false;
395}
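// Illustrative examples: <0,1,0,1,0,1,0,1> is four copies of a 2-element
// subvector, so SubVectorSize becomes 2 and this returns true; the identity
// mask <0,1,2,3> never re-enters the inner check and returns false.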
396
398 LLVMContext &C) {
399 assert((DataVT.getScalarSizeInBits() != 8 ||
400 DataVT.getVectorNumElements() <= 256) && "unhandled case in lowering");
401 MVT IndexVT = DataVT.changeTypeToInteger();
402 if (IndexVT.getScalarType().bitsGT(ST.getXLenVT()))
403 IndexVT = IndexVT.changeVectorElementType(MVT::i16);
404 return cast<VectorType>(EVT(IndexVT).getTypeForEVT(C));
405}
406
407/// Attempt to approximate the cost of a shuffle which will require splitting
408/// during legalization. Note that processShuffleMasks is not an exact proxy
409/// for the algorithm used in LegalizeVectorTypes, but hopefully it's a
410/// reasonably close upper bound.
412 MVT LegalVT, VectorType *Tp,
413 ArrayRef<int> Mask,
415 assert(LegalVT.isFixedLengthVector() && !Mask.empty() &&
416 "Expected fixed vector type and non-empty mask");
417 unsigned LegalNumElts = LegalVT.getVectorNumElements();
418 // Number of destination vectors after legalization:
419 unsigned NumOfDests = divideCeil(Mask.size(), LegalNumElts);
420 // We are going to permute multiple sources and the result will be in
421 // multiple destinations. We provide an accurate cost only for splits
422 // where the element type remains the same.
423 if (NumOfDests <= 1 ||
425 Tp->getElementType()->getPrimitiveSizeInBits() ||
426 LegalNumElts >= Tp->getElementCount().getFixedValue())
428
429 unsigned VecTySize = TTI.getDataLayout().getTypeStoreSize(Tp);
430 unsigned LegalVTSize = LegalVT.getStoreSize();
431 // Number of source vectors after legalization:
432 unsigned NumOfSrcs = divideCeil(VecTySize, LegalVTSize);
433
434 auto *SingleOpTy = FixedVectorType::get(Tp->getElementType(), LegalNumElts);
435
436 unsigned NormalizedVF = LegalNumElts * std::max(NumOfSrcs, NumOfDests);
437 unsigned NumOfSrcRegs = NormalizedVF / LegalNumElts;
438 unsigned NumOfDestRegs = NormalizedVF / LegalNumElts;
439 SmallVector<int> NormalizedMask(NormalizedVF, PoisonMaskElem);
440 assert(NormalizedVF >= Mask.size() &&
441 "Normalized mask expected to be not shorter than original mask.");
442 copy(Mask, NormalizedMask.begin());
443 InstructionCost Cost = 0;
444 SmallDenseSet<std::pair<ArrayRef<int>, unsigned>> ReusedSingleSrcShuffles;
446 NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {},
447 [&](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
448 if (ShuffleVectorInst::isIdentityMask(RegMask, RegMask.size()))
449 return;
450 if (!ReusedSingleSrcShuffles.insert(std::make_pair(RegMask, SrcReg))
451 .second)
452 return;
453 Cost += TTI.getShuffleCost(
455 FixedVectorType::get(SingleOpTy->getElementType(), RegMask.size()),
456 SingleOpTy, RegMask, CostKind, 0, nullptr);
457 },
458 [&](ArrayRef<int> RegMask, unsigned Idx1, unsigned Idx2, bool NewReg) {
459 Cost += TTI.getShuffleCost(
461 FixedVectorType::get(SingleOpTy->getElementType(), RegMask.size()),
462 SingleOpTy, RegMask, CostKind, 0, nullptr);
463 });
464 return Cost;
465}
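// Illustrative example: splitting an 8-element shuffle whose legal type holds
// 4 elements yields NumOfDests = 2; the normalized mask is processed one
// destination register at a time, identity sub-masks and repeated
// single-source sub-masks are skipped, and every remaining sub-shuffle is
// charged as a permute of one legal-width register.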
466
467/// Try to perform better estimation of the permutation.
468/// 1. Split the source/destination vectors into real registers.
469/// 2. Do the mask analysis to identify which real registers are
470/// permuted. If more than one source register is used to build the
471/// destination register, the cost for this destination register
472/// is (Number_of_source_register - 1) * Cost_PermuteTwoSrc. If only one
473/// source register is used, build mask and calculate the cost as a cost
474/// of PermuteSingleSrc.
475/// Also, for the single register permute we try to identify if the
476/// destination register is just a copy of the source register or the
477/// copy of the previous destination register (the cost is
478/// TTI::TCC_Basic). If the source register is just reused, the cost for
479/// this operation is 0.
480static InstructionCost
482 std::optional<unsigned> VLen, VectorType *Tp,
484 assert(LegalVT.isFixedLengthVector());
485 if (!VLen || Mask.empty())
487 MVT ElemVT = LegalVT.getVectorElementType();
488 unsigned ElemsPerVReg = *VLen / ElemVT.getFixedSizeInBits();
489 LegalVT = TTI.getTypeLegalizationCost(
490 FixedVectorType::get(Tp->getElementType(), ElemsPerVReg))
491 .second;
492 // Number of destination vectors after legalization:
493 InstructionCost NumOfDests =
494 divideCeil(Mask.size(), LegalVT.getVectorNumElements());
495 if (NumOfDests <= 1 ||
497 Tp->getElementType()->getPrimitiveSizeInBits() ||
498 LegalVT.getVectorNumElements() >= Tp->getElementCount().getFixedValue())
500
501 unsigned VecTySize = TTI.getDataLayout().getTypeStoreSize(Tp);
502 unsigned LegalVTSize = LegalVT.getStoreSize();
503 // Number of source vectors after legalization:
504 unsigned NumOfSrcs = divideCeil(VecTySize, LegalVTSize);
505
506 auto *SingleOpTy = FixedVectorType::get(Tp->getElementType(),
507 LegalVT.getVectorNumElements());
508
509 unsigned E = NumOfDests.getValue();
510 unsigned NormalizedVF =
511 LegalVT.getVectorNumElements() * std::max(NumOfSrcs, E);
512 unsigned NumOfSrcRegs = NormalizedVF / LegalVT.getVectorNumElements();
513 unsigned NumOfDestRegs = NormalizedVF / LegalVT.getVectorNumElements();
514 SmallVector<int> NormalizedMask(NormalizedVF, PoisonMaskElem);
515 assert(NormalizedVF >= Mask.size() &&
516 "Normalized mask expected to be not shorter than original mask.");
517 copy(Mask, NormalizedMask.begin());
518 InstructionCost Cost = 0;
519 int NumShuffles = 0;
520 SmallDenseSet<std::pair<ArrayRef<int>, unsigned>> ReusedSingleSrcShuffles;
522 NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {},
523 [&](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
524 if (ShuffleVectorInst::isIdentityMask(RegMask, RegMask.size()))
525 return;
526 if (!ReusedSingleSrcShuffles.insert(std::make_pair(RegMask, SrcReg))
527 .second)
528 return;
529 ++NumShuffles;
530 Cost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, SingleOpTy,
531 SingleOpTy, RegMask, CostKind, 0, nullptr);
532 },
533 [&](ArrayRef<int> RegMask, unsigned Idx1, unsigned Idx2, bool NewReg) {
534 Cost += TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy,
535 SingleOpTy, RegMask, CostKind, 0, nullptr);
536 NumShuffles += 2;
537 });
538 // Note: check that we do not emit too many shuffles here to prevent code
539 // size explosion.
540 // TODO: investigate, if it can be improved by extra analysis of the masks
541 // to check if the code is more profitable.
542 if ((NumOfDestRegs > 2 && NumShuffles <= static_cast<int>(NumOfDestRegs)) ||
543 (NumOfDestRegs <= 2 && NumShuffles < 4))
544 return Cost;
546}
547
548InstructionCost RISCVTTIImpl::getSlideCost(FixedVectorType *Tp,
549 ArrayRef<int> Mask,
551 // Avoid missing masks and length changing shuffles
552 if (Mask.size() <= 2 || Mask.size() != Tp->getNumElements())
554
555 int NumElts = Tp->getNumElements();
556 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
557 // Avoid scalarization cases
558 if (!LT.second.isFixedLengthVector())
560
561 // Moving elements between legalization parts would require additional
562 // unmodeled instructions.
563 if (LT.first != 1)
565
566 auto GetSlideOpcode = [&](int SlideAmt) {
567 assert(SlideAmt != 0);
568 bool IsVI = isUInt<5>(std::abs(SlideAmt));
569 if (SlideAmt < 0)
570 return IsVI ? RISCV::VSLIDEDOWN_VI : RISCV::VSLIDEDOWN_VX;
571 return IsVI ? RISCV::VSLIDEUP_VI : RISCV::VSLIDEUP_VX;
572 };
573
574 std::array<std::pair<int, int>, 2> SrcInfo;
575 if (!isMaskedSlidePair(Mask, NumElts, SrcInfo))
577
578 if (SrcInfo[1].second == 0)
579 std::swap(SrcInfo[0], SrcInfo[1]);
580
581 InstructionCost FirstSlideCost = 0;
582 if (SrcInfo[0].second != 0) {
583 unsigned Opcode = GetSlideOpcode(SrcInfo[0].second);
584 FirstSlideCost = getRISCVInstructionCost(Opcode, LT.second, CostKind);
585 }
586
587 if (SrcInfo[1].first == -1)
588 return FirstSlideCost;
589
590 InstructionCost SecondSlideCost = 0;
591 if (SrcInfo[1].second != 0) {
592 unsigned Opcode = GetSlideOpcode(SrcInfo[1].second);
593 SecondSlideCost = getRISCVInstructionCost(Opcode, LT.second, CostKind);
594 } else {
595 SecondSlideCost =
596 getRISCVInstructionCost(RISCV::VMERGE_VVM, LT.second, CostKind);
597 }
598
599 auto EC = Tp->getElementCount();
600 VectorType *MaskTy =
602 InstructionCost MaskCost = getConstantPoolLoadCost(MaskTy, CostKind);
603 return FirstSlideCost + SecondSlideCost + MaskCost;
604}
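// Illustrative example: a single-source mask such as <2,3,-1,-1> is a slide
// down by 2, so only FirstSlideCost (a vslidedown.vi) is charged. A true
// two-source slide pair additionally pays for either a second slide or a
// vmerge.vvm, plus a constant-pool load for the select mask.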
605
608 VectorType *SrcTy, ArrayRef<int> Mask,
609 TTI::TargetCostKind CostKind, int Index,
611 const Instruction *CxtI) const {
612 assert((Mask.empty() || DstTy->isScalableTy() ||
613 Mask.size() == DstTy->getElementCount().getKnownMinValue()) &&
614 "Expected the Mask to match the return size if given");
615 assert(SrcTy->getScalarType() == DstTy->getScalarType() &&
616 "Expected the same scalar types");
617
618 Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
619 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
620
621 // First, handle cases where having a fixed length vector enables us to
622 // give a more accurate cost than falling back to generic scalable codegen.
623 // TODO: Each of these cases hints at a modeling gap around scalable vectors.
624 if (auto *FVTp = dyn_cast<FixedVectorType>(SrcTy);
625 FVTp && ST->hasVInstructions() && LT.second.isFixedLengthVector()) {
627 *this, LT.second, ST->getRealVLen(),
628 Kind == TTI::SK_InsertSubvector ? DstTy : SrcTy, Mask, CostKind);
629 if (VRegSplittingCost.isValid())
630 return VRegSplittingCost;
631 switch (Kind) {
632 default:
633 break;
635 if (Mask.size() >= 2) {
636 MVT EltTp = LT.second.getVectorElementType();
637 // If the size of the element is < ELEN then shuffles of interleaves and
638 // deinterleaves of 2 vectors can be lowered into the following
639 // sequences
640 if (EltTp.getScalarSizeInBits() < ST->getELen()) {
641 // Example sequence:
642 // vsetivli zero, 4, e8, mf4, ta, ma (ignored)
643 // vwaddu.vv v10, v8, v9
644 // li a0, -1 (ignored)
645 // vwmaccu.vx v10, a0, v9
646 if (ShuffleVectorInst::isInterleaveMask(Mask, 2, Mask.size()))
647 return 2 * LT.first * TLI->getLMULCost(LT.second);
648
649 if (Mask[0] == 0 || Mask[0] == 1) {
650 auto DeinterleaveMask = createStrideMask(Mask[0], 2, Mask.size());
651 // Example sequence:
652 // vnsrl.wi v10, v8, 0
653 if (equal(DeinterleaveMask, Mask))
654 return LT.first * getRISCVInstructionCost(RISCV::VNSRL_WI,
655 LT.second, CostKind);
656 }
657 }
658 int SubVectorSize;
659 if (LT.second.getScalarSizeInBits() != 1 &&
660 isRepeatedConcatMask(Mask, SubVectorSize)) {
662 unsigned NumSlides = Log2_32(Mask.size() / SubVectorSize);
663 // The cost of extraction from a subvector is 0 if the index is 0.
664 for (unsigned I = 0; I != NumSlides; ++I) {
665 unsigned InsertIndex = SubVectorSize * (1 << I);
666 FixedVectorType *SubTp =
667 FixedVectorType::get(SrcTy->getElementType(), InsertIndex);
668 FixedVectorType *DestTp =
670 std::pair<InstructionCost, MVT> DestLT =
672 // Add the cost of whole vector register move because the
673 // destination vector register group for vslideup cannot overlap the
674 // source.
675 Cost += DestLT.first * TLI->getLMULCost(DestLT.second);
676 Cost += getShuffleCost(TTI::SK_InsertSubvector, DestTp, DestTp, {},
677 CostKind, InsertIndex, SubTp);
678 }
679 return Cost;
680 }
681 }
682
683 if (InstructionCost SlideCost = getSlideCost(FVTp, Mask, CostKind);
684 SlideCost.isValid())
685 return SlideCost;
686
687 // vrgather + cost of generating the mask constant.
688 // We model this for an unknown mask with a single vrgather.
689 if (LT.first == 1 && (LT.second.getScalarSizeInBits() != 8 ||
690 LT.second.getVectorNumElements() <= 256)) {
691 VectorType *IdxTy =
692 getVRGatherIndexType(LT.second, *ST, SrcTy->getContext());
693 InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
694 return IndexCost +
695 getRISCVInstructionCost(RISCV::VRGATHER_VV, LT.second, CostKind);
696 }
697 break;
698 }
701
702 if (InstructionCost SlideCost = getSlideCost(FVTp, Mask, CostKind);
703 SlideCost.isValid())
704 return SlideCost;
705
706 // 2 x (vrgather + cost of generating the mask constant) + cost of mask
707 // register for the second vrgather. We model this for an unknown
708 // (shuffle) mask.
709 if (LT.first == 1 && (LT.second.getScalarSizeInBits() != 8 ||
710 LT.second.getVectorNumElements() <= 256)) {
711 auto &C = SrcTy->getContext();
712 auto EC = SrcTy->getElementCount();
713 VectorType *IdxTy = getVRGatherIndexType(LT.second, *ST, C);
715 InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
716 InstructionCost MaskCost = getConstantPoolLoadCost(MaskTy, CostKind);
717 return 2 * IndexCost +
718 getRISCVInstructionCost({RISCV::VRGATHER_VV, RISCV::VRGATHER_VV},
719 LT.second, CostKind) +
720 MaskCost;
721 }
722 break;
723 }
724 }
725
726 auto shouldSplit = [](TTI::ShuffleKind Kind) {
727 switch (Kind) {
728 default:
729 return false;
733 return true;
734 }
735 };
736
737 if (!Mask.empty() && LT.first.isValid() && LT.first != 1 &&
738 shouldSplit(Kind)) {
739 InstructionCost SplitCost =
740 costShuffleViaSplitting(*this, LT.second, FVTp, Mask, CostKind);
741 if (SplitCost.isValid())
742 return SplitCost;
743 }
744 }
745
746 // Handle scalable vectors (and fixed vectors legalized to scalable vectors).
747 switch (Kind) {
748 default:
749 // Fallthrough to generic handling.
750 // TODO: Most of these cases will return getInvalid in generic code, and
751 // must be implemented here.
752 break;
754 // Extract at zero is always a subregister extract
755 if (Index == 0)
756 return TTI::TCC_Free;
757
758 // If we're extracting a subvector of at most m1 size at a sub-register
759 // boundary - which unfortunately we need exact vlen to identify - this is
760 // a subregister extract at worst and thus won't require a vslidedown.
761 // TODO: Extend for aligned m2, m4 subvector extracts
762 // TODO: Extend for misaligned (but contained) extracts
763 // TODO: Extend for scalable subvector types
764 if (std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
765 SubLT.second.isValid() && SubLT.second.isFixedLengthVector()) {
766 if (std::optional<unsigned> VLen = ST->getRealVLen();
767 VLen && SubLT.second.getScalarSizeInBits() * Index % *VLen == 0 &&
768 SubLT.second.getSizeInBits() <= *VLen)
769 return TTI::TCC_Free;
770 }
771
772 // Example sequence:
773 // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
774 // vslidedown.vi v8, v9, 2
775 return LT.first *
776 getRISCVInstructionCost(RISCV::VSLIDEDOWN_VI, LT.second, CostKind);
778 // Example sequence:
779 // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
780 // vslideup.vi v8, v9, 2
781 LT = getTypeLegalizationCost(DstTy);
782 return LT.first *
783 getRISCVInstructionCost(RISCV::VSLIDEUP_VI, LT.second, CostKind);
784 case TTI::SK_Select: {
785 // Example sequence:
786 // li a0, 90
787 // vsetivli zero, 8, e8, mf2, ta, ma (ignored)
788 // vmv.s.x v0, a0
789 // vmerge.vvm v8, v9, v8, v0
790 // We use 2 for the cost of the mask materialization as this is the true
791 // cost for small masks and most shuffles are small. At worst, this cost
792 // should be a very small constant for the constant pool load. As such,
793 // we may bias towards large selects slightly more than truly warranted.
794 return LT.first *
795 (1 + getRISCVInstructionCost({RISCV::VMV_S_X, RISCV::VMERGE_VVM},
796 LT.second, CostKind));
797 }
798 case TTI::SK_Broadcast: {
799 bool HasScalar = (Args.size() > 0) && (Operator::getOpcode(Args[0]) ==
800 Instruction::InsertElement);
801 if (LT.second.getScalarSizeInBits() == 1) {
802 if (HasScalar) {
803 // Example sequence:
804 // andi a0, a0, 1
805 // vsetivli zero, 2, e8, mf8, ta, ma (ignored)
806 // vmv.v.x v8, a0
807 // vmsne.vi v0, v8, 0
808 return LT.first *
809 (1 + getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI},
810 LT.second, CostKind));
811 }
812 // Example sequence:
813 // vsetivli zero, 2, e8, mf8, ta, mu (ignored)
814 // vmv.v.i v8, 0
815 // vmerge.vim v8, v8, 1, v0
816 // vmv.x.s a0, v8
817 // andi a0, a0, 1
818 // vmv.v.x v8, a0
819 // vmsne.vi v0, v8, 0
820
821 return LT.first *
822 (1 + getRISCVInstructionCost({RISCV::VMV_V_I, RISCV::VMERGE_VIM,
823 RISCV::VMV_X_S, RISCV::VMV_V_X,
824 RISCV::VMSNE_VI},
825 LT.second, CostKind));
826 }
827
828 if (HasScalar) {
829 // Example sequence:
830 // vmv.v.x v8, a0
831 return LT.first *
832 getRISCVInstructionCost(RISCV::VMV_V_X, LT.second, CostKind);
833 }
834
835 // Example sequence:
836 // vrgather.vi v9, v8, 0
837 return LT.first *
838 getRISCVInstructionCost(RISCV::VRGATHER_VI, LT.second, CostKind);
839 }
840 case TTI::SK_Splice: {
841 // vslidedown+vslideup.
842 // TODO: Multiplying by LT.first implies this legalizes into multiple copies
843 // of similar code, but I think we expand through memory.
844 unsigned Opcodes[2] = {RISCV::VSLIDEDOWN_VX, RISCV::VSLIDEUP_VX};
845 if (Index >= 0 && Index < 32)
846 Opcodes[0] = RISCV::VSLIDEDOWN_VI;
847 else if (Index < 0 && Index > -32)
848 Opcodes[1] = RISCV::VSLIDEUP_VI;
849 return LT.first * getRISCVInstructionCost(Opcodes, LT.second, CostKind);
850 }
851 case TTI::SK_Reverse: {
852
853 if (!LT.second.isVector())
855
856 // TODO: Cases to improve here:
857 // * Illegal vector types
858 // * i64 on RV32
859 if (SrcTy->getElementType()->isIntegerTy(1)) {
860 VectorType *WideTy =
861 VectorType::get(IntegerType::get(SrcTy->getContext(), 8),
862 cast<VectorType>(SrcTy)->getElementCount());
863 return getCastInstrCost(Instruction::ZExt, WideTy, SrcTy,
865 getShuffleCost(TTI::SK_Reverse, WideTy, WideTy, {}, CostKind, 0,
866 nullptr) +
867 getCastInstrCost(Instruction::Trunc, SrcTy, WideTy,
869 }
870
871 MVT ContainerVT = LT.second;
872 if (LT.second.isFixedLengthVector())
873 ContainerVT = TLI->getContainerForFixedLengthVector(LT.second);
874 MVT M1VT = RISCVTargetLowering::getM1VT(ContainerVT);
875 if (ContainerVT.bitsLE(M1VT)) {
876 // Example sequence:
877 // csrr a0, vlenb
878 // srli a0, a0, 3
879 // addi a0, a0, -1
880 // vsetvli a1, zero, e8, mf8, ta, mu (ignored)
881 // vid.v v9
882 // vrsub.vx v10, v9, a0
883 // vrgather.vv v9, v8, v10
884 InstructionCost LenCost = 3;
885 if (LT.second.isFixedLengthVector())
886 // vrsub.vi has a 5 bit immediate field, otherwise an li suffices
887 LenCost = isInt<5>(LT.second.getVectorNumElements() - 1) ? 0 : 1;
888 unsigned Opcodes[] = {RISCV::VID_V, RISCV::VRSUB_VX, RISCV::VRGATHER_VV};
889 if (LT.second.isFixedLengthVector() &&
890 isInt<5>(LT.second.getVectorNumElements() - 1))
891 Opcodes[1] = RISCV::VRSUB_VI;
892 InstructionCost GatherCost =
893 getRISCVInstructionCost(Opcodes, LT.second, CostKind);
894 return LT.first * (LenCost + GatherCost);
895 }
896
897 // At high LMUL, we split into a series of M1 reverses (see
898 // lowerVECTOR_REVERSE) and then do a single slide at the end to eliminate
899 // the resulting gap at the bottom (for fixed vectors only). The important
900 // bit is that the cost scales linearly, not quadratically with LMUL.
901 unsigned M1Opcodes[] = {RISCV::VID_V, RISCV::VRSUB_VX};
902 InstructionCost FixedCost =
903 getRISCVInstructionCost(M1Opcodes, M1VT, CostKind) + 3;
904 unsigned Ratio =
906 InstructionCost GatherCost =
907 getRISCVInstructionCost({RISCV::VRGATHER_VV}, M1VT, CostKind) * Ratio;
908 InstructionCost SlideCost = !LT.second.isFixedLengthVector() ? 0 :
909 getRISCVInstructionCost({RISCV::VSLIDEDOWN_VX}, LT.second, CostKind);
910 return FixedCost + LT.first * (GatherCost + SlideCost);
911 }
912 }
913 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
914 SubTp);
915}
916
917static unsigned isM1OrSmaller(MVT VT) {
919 return (LMUL == RISCVVType::VLMUL::LMUL_F8 ||
923}
924
926 VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
927 TTI::TargetCostKind CostKind, bool ForPoisonSrc,
928 ArrayRef<Value *> VL) const {
931
932 // A build_vector (which is m1 sized or smaller) can be done in no
933 // worse than one vslide1down.vx per element in the type. We could
934 // in theory do an explode_vector in the inverse manner, but our
935 // lowering today does not have a first class node for this pattern.
937 Ty, DemandedElts, Insert, Extract, CostKind);
938 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
939 if (Insert && !Extract && LT.first.isValid() && LT.second.isVector()) {
940 if (Ty->getScalarSizeInBits() == 1) {
941 auto *WideVecTy = cast<VectorType>(Ty->getWithNewBitWidth(8));
942 // Note: Implicit scalar anyextend is assumed to be free since the i1
943 // must be stored in a GPR.
944 return getScalarizationOverhead(WideVecTy, DemandedElts, Insert, Extract,
945 CostKind) +
946 getCastInstrCost(Instruction::Trunc, Ty, WideVecTy,
948 }
949
950 assert(LT.second.isFixedLengthVector());
951 MVT ContainerVT = TLI->getContainerForFixedLengthVector(LT.second);
952 if (isM1OrSmaller(ContainerVT)) {
953 InstructionCost BV =
954 cast<FixedVectorType>(Ty)->getNumElements() *
955 getRISCVInstructionCost(RISCV::VSLIDE1DOWN_VX, LT.second, CostKind);
956 if (BV < Cost)
957 Cost = BV;
958 }
959 }
960 return Cost;
961}
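// Worked example (illustrative): inserting all 4 elements of a <4 x i32>
// build_vector whose container is m1 or smaller is capped at
// 4 * cost(vslide1down.vx), which replaces the generic scalarization
// estimate whenever it is cheaper.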
962
964RISCVTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
965 unsigned AddressSpace,
967 if (!isLegalMaskedLoadStore(Src, Alignment) ||
969 return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
970 CostKind);
971
972 return getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);
973}
974
976 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
977 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
978 bool UseMaskForCond, bool UseMaskForGaps) const {
979
980 // The interleaved memory access pass will lower (de)interleave ops combined
981 // with an adjacent appropriate memory operation to vlseg/vsseg intrinsics.
982 // vlseg/vsseg only support masking per-iteration (i.e. condition), not
983 // per-segment (i.e. gap).
984 if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
985 auto *VTy = cast<VectorType>(VecTy);
986 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VTy);
988 // Need to make sure the type hasn't been scalarized
988 if (LT.second.isVector()) {
989 auto *SubVecTy =
990 VectorType::get(VTy->getElementType(),
991 VTy->getElementCount().divideCoefficientBy(Factor));
992 if (VTy->getElementCount().isKnownMultipleOf(Factor) &&
993 TLI->isLegalInterleavedAccessType(SubVecTy, Factor, Alignment,
994 AddressSpace, DL)) {
995
996 // Some processors optimize segment loads/stores as one wide memory op +
997 // Factor * LMUL shuffle ops.
998 if (ST->hasOptimizedSegmentLoadStore(Factor)) {
1000 getMemoryOpCost(Opcode, VTy, Alignment, AddressSpace, CostKind);
1001 MVT SubVecVT = getTLI()->getValueType(DL, SubVecTy).getSimpleVT();
1002 Cost += Factor * TLI->getLMULCost(SubVecVT);
1003 return LT.first * Cost;
1004 }
1005
1006 // Otherwise, the cost is proportional to the number of elements (VL *
1007 // Factor ops).
1008 InstructionCost MemOpCost =
1009 getMemoryOpCost(Opcode, VTy->getElementType(), Alignment, 0,
1010 CostKind, {TTI::OK_AnyValue, TTI::OP_None});
1011 unsigned NumLoads = getEstimatedVLFor(VTy);
1012 return NumLoads * MemOpCost;
1013 }
1014 }
1015 }
1016
1017 // TODO: Return the cost of interleaved accesses for scalable vector when
1018 // unable to convert to segment accesses instructions.
1019 if (isa<ScalableVectorType>(VecTy))
1021
1022 auto *FVTy = cast<FixedVectorType>(VecTy);
1023 InstructionCost MemCost =
1024 getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, CostKind);
1025 unsigned VF = FVTy->getNumElements() / Factor;
1026
1027 // An interleaved load will look like this for Factor=3:
1028 // %wide.vec = load <12 x i32>, ptr %3, align 4
1029 // %strided.vec = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
1030 // %strided.vec1 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
1031 // %strided.vec2 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
1032 if (Opcode == Instruction::Load) {
1033 InstructionCost Cost = MemCost;
1034 for (unsigned Index : Indices) {
1035 FixedVectorType *VecTy =
1036 FixedVectorType::get(FVTy->getElementType(), VF * Factor);
1037 auto Mask = createStrideMask(Index, Factor, VF);
1038 Mask.resize(VF * Factor, -1);
1039 InstructionCost ShuffleCost =
1041 Mask, CostKind, 0, nullptr, {});
1042 Cost += ShuffleCost;
1043 }
1044 return Cost;
1045 }
1046
1047 // TODO: Model for NF > 2
1048 // We'll need to enhance getShuffleCost to model shuffles that are just
1049 // inserts and extracts into subvectors, since they won't have the full cost
1050 // of a vrgather.
1051 // An interleaved store for 3 vectors of 4 lanes will look like
1052 // %11 = shufflevector <4 x i32> %4, <4 x i32> %6, <8 x i32> <0...7>
1053 // %12 = shufflevector <4 x i32> %9, <4 x i32> poison, <8 x i32> <0...3>
1054 // %13 = shufflevector <8 x i32> %11, <8 x i32> %12, <12 x i32> <0...11>
1055 // %interleaved.vec = shufflevector %13, poison, <12 x i32> <interleave mask>
1056 // store <12 x i32> %interleaved.vec, ptr %10, align 4
1057 if (Factor != 2)
1058 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
1059 Alignment, AddressSpace, CostKind,
1060 UseMaskForCond, UseMaskForGaps);
1061
1062 assert(Opcode == Instruction::Store && "Opcode must be a store");
1063 // For an interleaving store of 2 vectors, we perform one large interleaving
1064 // shuffle that goes into the wide store
1065 auto Mask = createInterleaveMask(VF, Factor);
1066 InstructionCost ShuffleCost =
1068 CostKind, 0, nullptr, {});
1069 return MemCost + ShuffleCost;
1070}
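// Worked example (illustrative): storing two interleaved <4 x i32> vectors
// (Factor = 2) that are not lowered to vsseg is costed as one wide <8 x i32>
// store plus a single interleaving shuffle built from createInterleaveMask(4, 2).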
1071
1073 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
1074 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) const {
1076 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
1077 Alignment, CostKind, I);
1078
1079 if ((Opcode == Instruction::Load &&
1080 !isLegalMaskedGather(DataTy, Align(Alignment))) ||
1081 (Opcode == Instruction::Store &&
1082 !isLegalMaskedScatter(DataTy, Align(Alignment))))
1083 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
1084 Alignment, CostKind, I);
1085
1086 // Cost is proportional to the number of memory operations implied. For
1087 // scalable vectors, we use an estimate on that number since we don't
1088 // know exactly what VL will be.
1089 auto &VTy = *cast<VectorType>(DataTy);
1090 InstructionCost MemOpCost =
1091 getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
1092 {TTI::OK_AnyValue, TTI::OP_None}, I);
1093 unsigned NumLoads = getEstimatedVLFor(&VTy);
1094 return NumLoads * MemOpCost;
1095}
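// Worked example (illustrative, assuming getVScaleForTuning() == 2): a gather
// of <vscale x 4 x i32> is modelled as roughly 4 * 2 = 8 scalar i32 loads,
// i.e. the estimated VL times the per-element memory-op cost.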
1096
1098 unsigned Opcode, Type *DataTy, bool VariableMask, Align Alignment,
1099 TTI::TargetCostKind CostKind, const Instruction *I) const {
1100 bool IsLegal = (Opcode == Instruction::Store &&
1101 isLegalMaskedCompressStore(DataTy, Alignment)) ||
1102 (Opcode == Instruction::Load &&
1103 isLegalMaskedExpandLoad(DataTy, Alignment));
1104 if (!IsLegal || CostKind != TTI::TCK_RecipThroughput)
1105 return BaseT::getExpandCompressMemoryOpCost(Opcode, DataTy, VariableMask,
1106 Alignment, CostKind, I);
1107 // Example compressstore sequence:
1108 // vsetivli zero, 8, e32, m2, ta, ma (ignored)
1109 // vcompress.vm v10, v8, v0
1110 // vcpop.m a1, v0
1111 // vsetvli zero, a1, e32, m2, ta, ma
1112 // vse32.v v10, (a0)
1113 // Example expandload sequence:
1114 // vsetivli zero, 8, e8, mf2, ta, ma (ignored)
1115 // vcpop.m a1, v0
1116 // vsetvli zero, a1, e32, m2, ta, ma
1117 // vle32.v v10, (a0)
1118 // vsetivli zero, 8, e32, m2, ta, ma
1119 // viota.m v12, v0
1120 // vrgather.vv v8, v10, v12, v0.t
1121 auto MemOpCost =
1122 getMemoryOpCost(Opcode, DataTy, Alignment, /*AddressSpace*/ 0, CostKind);
1123 auto LT = getTypeLegalizationCost(DataTy);
1124 SmallVector<unsigned, 4> Opcodes{RISCV::VSETVLI};
1125 if (VariableMask)
1126 Opcodes.push_back(RISCV::VCPOP_M);
1127 if (Opcode == Instruction::Store)
1128 Opcodes.append({RISCV::VCOMPRESS_VM});
1129 else
1130 Opcodes.append({RISCV::VSETIVLI, RISCV::VIOTA_M, RISCV::VRGATHER_VV});
1131 return MemOpCost +
1132 LT.first * getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1133}
1134
1136 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
1137 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) const {
1138 if (((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
1139 !isLegalStridedLoadStore(DataTy, Alignment)) ||
1140 (Opcode != Instruction::Load && Opcode != Instruction::Store))
1141 return BaseT::getStridedMemoryOpCost(Opcode, DataTy, Ptr, VariableMask,
1142 Alignment, CostKind, I);
1143
1145 return TTI::TCC_Basic;
1146
1147 // Cost is proportional to the number of memory operations implied. For
1148 // scalable vectors, we use an estimate on that number since we don't
1149 // know exactly what VL will be.
1150 auto &VTy = *cast<VectorType>(DataTy);
1151 InstructionCost MemOpCost =
1152 getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
1153 {TTI::OK_AnyValue, TTI::OP_None}, I);
1154 unsigned NumLoads = getEstimatedVLFor(&VTy);
1155 return NumLoads * MemOpCost;
1156}
1157
1160 // FIXME: This is a property of the default vector convention, not
1161 // all possible calling conventions. Fixing that will require
1162 // some TTI API and SLP rework.
1165 for (auto *Ty : Tys) {
1166 if (!Ty->isVectorTy())
1167 continue;
1168 Align A = DL.getPrefTypeAlign(Ty);
1169 Cost += getMemoryOpCost(Instruction::Store, Ty, A, 0, CostKind) +
1170 getMemoryOpCost(Instruction::Load, Ty, A, 0, CostKind);
1171 }
1172 return Cost;
1173}
1174
1175// Currently, these represent both throughput and codesize costs
1176// for the respective intrinsics. The costs in this table are simply
1177// instruction counts with the following adjustments made:
1178// * One vsetvli is considered free.
1180 {Intrinsic::floor, MVT::f32, 9},
1181 {Intrinsic::floor, MVT::f64, 9},
1182 {Intrinsic::ceil, MVT::f32, 9},
1183 {Intrinsic::ceil, MVT::f64, 9},
1184 {Intrinsic::trunc, MVT::f32, 7},
1185 {Intrinsic::trunc, MVT::f64, 7},
1186 {Intrinsic::round, MVT::f32, 9},
1187 {Intrinsic::round, MVT::f64, 9},
1188 {Intrinsic::roundeven, MVT::f32, 9},
1189 {Intrinsic::roundeven, MVT::f64, 9},
1190 {Intrinsic::rint, MVT::f32, 7},
1191 {Intrinsic::rint, MVT::f64, 7},
1192 {Intrinsic::nearbyint, MVT::f32, 9},
1193 {Intrinsic::nearbyint, MVT::f64, 9},
1194 {Intrinsic::bswap, MVT::i16, 3},
1195 {Intrinsic::bswap, MVT::i32, 12},
1196 {Intrinsic::bswap, MVT::i64, 31},
1197 {Intrinsic::vp_bswap, MVT::i16, 3},
1198 {Intrinsic::vp_bswap, MVT::i32, 12},
1199 {Intrinsic::vp_bswap, MVT::i64, 31},
1200 {Intrinsic::vp_fshl, MVT::i8, 7},
1201 {Intrinsic::vp_fshl, MVT::i16, 7},
1202 {Intrinsic::vp_fshl, MVT::i32, 7},
1203 {Intrinsic::vp_fshl, MVT::i64, 7},
1204 {Intrinsic::vp_fshr, MVT::i8, 7},
1205 {Intrinsic::vp_fshr, MVT::i16, 7},
1206 {Intrinsic::vp_fshr, MVT::i32, 7},
1207 {Intrinsic::vp_fshr, MVT::i64, 7},
1208 {Intrinsic::bitreverse, MVT::i8, 17},
1209 {Intrinsic::bitreverse, MVT::i16, 24},
1210 {Intrinsic::bitreverse, MVT::i32, 33},
1211 {Intrinsic::bitreverse, MVT::i64, 52},
1212 {Intrinsic::vp_bitreverse, MVT::i8, 17},
1213 {Intrinsic::vp_bitreverse, MVT::i16, 24},
1214 {Intrinsic::vp_bitreverse, MVT::i32, 33},
1215 {Intrinsic::vp_bitreverse, MVT::i64, 52},
1216 {Intrinsic::ctpop, MVT::i8, 12},
1217 {Intrinsic::ctpop, MVT::i16, 19},
1218 {Intrinsic::ctpop, MVT::i32, 20},
1219 {Intrinsic::ctpop, MVT::i64, 21},
1220 {Intrinsic::ctlz, MVT::i8, 19},
1221 {Intrinsic::ctlz, MVT::i16, 28},
1222 {Intrinsic::ctlz, MVT::i32, 31},
1223 {Intrinsic::ctlz, MVT::i64, 35},
1224 {Intrinsic::cttz, MVT::i8, 16},
1225 {Intrinsic::cttz, MVT::i16, 23},
1226 {Intrinsic::cttz, MVT::i32, 24},
1227 {Intrinsic::cttz, MVT::i64, 25},
1228 {Intrinsic::vp_ctpop, MVT::i8, 12},
1229 {Intrinsic::vp_ctpop, MVT::i16, 19},
1230 {Intrinsic::vp_ctpop, MVT::i32, 20},
1231 {Intrinsic::vp_ctpop, MVT::i64, 21},
1232 {Intrinsic::vp_ctlz, MVT::i8, 19},
1233 {Intrinsic::vp_ctlz, MVT::i16, 28},
1234 {Intrinsic::vp_ctlz, MVT::i32, 31},
1235 {Intrinsic::vp_ctlz, MVT::i64, 35},
1236 {Intrinsic::vp_cttz, MVT::i8, 16},
1237 {Intrinsic::vp_cttz, MVT::i16, 23},
1238 {Intrinsic::vp_cttz, MVT::i32, 24},
1239 {Intrinsic::vp_cttz, MVT::i64, 25},
1240};
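// A table entry is used only when the legalized return type is a vector whose
// element type matches the entry's MVT; the per-entry cost is then scaled by
// LT.first. For example (illustrative), ctpop on <4 x i32> without Zvbb looks
// up {ctpop, MVT::i32, 20} and returns 20 when the type legalizes in one part.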
1241
1245 auto *RetTy = ICA.getReturnType();
1246 switch (ICA.getID()) {
1247 case Intrinsic::lrint:
1248 case Intrinsic::llrint:
1249 case Intrinsic::lround:
1250 case Intrinsic::llround: {
1251 auto LT = getTypeLegalizationCost(RetTy);
1252 Type *SrcTy = ICA.getArgTypes().front();
1253 auto SrcLT = getTypeLegalizationCost(SrcTy);
1254 if (ST->hasVInstructions() && LT.second.isVector()) {
1256 unsigned SrcEltSz = DL.getTypeSizeInBits(SrcTy->getScalarType());
1257 unsigned DstEltSz = DL.getTypeSizeInBits(RetTy->getScalarType());
1258 if (LT.second.getVectorElementType() == MVT::bf16) {
1259 if (!ST->hasVInstructionsBF16Minimal())
1261 if (DstEltSz == 32)
1262 Ops = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFCVT_X_F_V};
1263 else
1264 Ops = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFWCVT_X_F_V};
1265 } else if (LT.second.getVectorElementType() == MVT::f16 &&
1266 !ST->hasVInstructionsF16()) {
1267 if (!ST->hasVInstructionsF16Minimal())
1269 if (DstEltSz == 32)
1270 Ops = {RISCV::VFWCVT_F_F_V, RISCV::VFCVT_X_F_V};
1271 else
1272 Ops = {RISCV::VFWCVT_F_F_V, RISCV::VFWCVT_X_F_V};
1273
1274 } else if (SrcEltSz > DstEltSz) {
1275 Ops = {RISCV::VFNCVT_X_F_W};
1276 } else if (SrcEltSz < DstEltSz) {
1277 Ops = {RISCV::VFWCVT_X_F_V};
1278 } else {
1279 Ops = {RISCV::VFCVT_X_F_V};
1280 }
1281
1282 // We need to use the source LMUL in the case of a narrowing op, and the
1283 // destination LMUL otherwise.
1284 if (SrcEltSz > DstEltSz)
1285 return SrcLT.first *
1286 getRISCVInstructionCost(Ops, SrcLT.second, CostKind);
1287 return LT.first * getRISCVInstructionCost(Ops, LT.second, CostKind);
1288 }
1289 break;
1290 }
1291 case Intrinsic::ceil:
1292 case Intrinsic::floor:
1293 case Intrinsic::trunc:
1294 case Intrinsic::rint:
1295 case Intrinsic::round:
1296 case Intrinsic::roundeven: {
1297 // These all use the same code.
1298 auto LT = getTypeLegalizationCost(RetTy);
1299 if (!LT.second.isVector() && TLI->isOperationCustom(ISD::FCEIL, LT.second))
1300 return LT.first * 8;
1301 break;
1302 }
1303 case Intrinsic::umin:
1304 case Intrinsic::umax:
1305 case Intrinsic::smin:
1306 case Intrinsic::smax: {
1307 auto LT = getTypeLegalizationCost(RetTy);
1308 if (LT.second.isScalarInteger() && ST->hasStdExtZbb())
1309 return LT.first;
1310
1311 if (ST->hasVInstructions() && LT.second.isVector()) {
1312 unsigned Op;
1313 switch (ICA.getID()) {
1314 case Intrinsic::umin:
1315 Op = RISCV::VMINU_VV;
1316 break;
1317 case Intrinsic::umax:
1318 Op = RISCV::VMAXU_VV;
1319 break;
1320 case Intrinsic::smin:
1321 Op = RISCV::VMIN_VV;
1322 break;
1323 case Intrinsic::smax:
1324 Op = RISCV::VMAX_VV;
1325 break;
1326 }
1327 return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
1328 }
1329 break;
1330 }
1331 case Intrinsic::sadd_sat:
1332 case Intrinsic::ssub_sat:
1333 case Intrinsic::uadd_sat:
1334 case Intrinsic::usub_sat: {
1335 auto LT = getTypeLegalizationCost(RetTy);
1336 if (ST->hasVInstructions() && LT.second.isVector()) {
1337 unsigned Op;
1338 switch (ICA.getID()) {
1339 case Intrinsic::sadd_sat:
1340 Op = RISCV::VSADD_VV;
1341 break;
1342 case Intrinsic::ssub_sat:
1343 Op = RISCV::VSSUB_VV;
1344 break;
1345 case Intrinsic::uadd_sat:
1346 Op = RISCV::VSADDU_VV;
1347 break;
1348 case Intrinsic::usub_sat:
1349 Op = RISCV::VSSUBU_VV;
1350 break;
1351 }
1352 return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
1353 }
1354 break;
1355 }
1356 case Intrinsic::fma:
1357 case Intrinsic::fmuladd: {
1358 // TODO: handle promotion with f16/bf16 with zvfhmin/zvfbfmin
1359 auto LT = getTypeLegalizationCost(RetTy);
1360 if (ST->hasVInstructions() && LT.second.isVector())
1361 return LT.first *
1362 getRISCVInstructionCost(RISCV::VFMADD_VV, LT.second, CostKind);
1363 break;
1364 }
1365 case Intrinsic::fabs: {
1366 auto LT = getTypeLegalizationCost(RetTy);
1367 if (ST->hasVInstructions() && LT.second.isVector()) {
1368 // lui a0, 8
1369 // addi a0, a0, -1
1370 // vsetvli a1, zero, e16, m1, ta, ma
1371 // vand.vx v8, v8, a0
1372 // f16 with zvfhmin and bf16 with zvfbfmin
1373 if (LT.second.getVectorElementType() == MVT::bf16 ||
1374 (LT.second.getVectorElementType() == MVT::f16 &&
1375 !ST->hasVInstructionsF16()))
1376 return LT.first * getRISCVInstructionCost(RISCV::VAND_VX, LT.second,
1377 CostKind) +
1378 2;
1379 else
1380 return LT.first *
1381 getRISCVInstructionCost(RISCV::VFSGNJX_VV, LT.second, CostKind);
1382 }
1383 break;
1384 }
1385 case Intrinsic::sqrt: {
1386 auto LT = getTypeLegalizationCost(RetTy);
1387 if (ST->hasVInstructions() && LT.second.isVector()) {
1390 MVT ConvType = LT.second;
1391 MVT FsqrtType = LT.second;
1392 // f16 with zvfhmin and bf16 with zvfbfmin and the type of nxv32[b]f16
1393 // will be split.
1394 if (LT.second.getVectorElementType() == MVT::bf16) {
1395 if (LT.second == MVT::nxv32bf16) {
1396 ConvOp = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFWCVTBF16_F_F_V,
1397 RISCV::VFNCVTBF16_F_F_W, RISCV::VFNCVTBF16_F_F_W};
1398 FsqrtOp = {RISCV::VFSQRT_V, RISCV::VFSQRT_V};
1399 ConvType = MVT::nxv16f16;
1400 FsqrtType = MVT::nxv16f32;
1401 } else {
1402 ConvOp = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFNCVTBF16_F_F_W};
1403 FsqrtOp = {RISCV::VFSQRT_V};
1404 FsqrtType = TLI->getTypeToPromoteTo(ISD::FSQRT, FsqrtType);
1405 }
1406 } else if (LT.second.getVectorElementType() == MVT::f16 &&
1407 !ST->hasVInstructionsF16()) {
1408 if (LT.second == MVT::nxv32f16) {
1409 ConvOp = {RISCV::VFWCVT_F_F_V, RISCV::VFWCVT_F_F_V,
1410 RISCV::VFNCVT_F_F_W, RISCV::VFNCVT_F_F_W};
1411 FsqrtOp = {RISCV::VFSQRT_V, RISCV::VFSQRT_V};
1412 ConvType = MVT::nxv16f16;
1413 FsqrtType = MVT::nxv16f32;
1414 } else {
1415 ConvOp = {RISCV::VFWCVT_F_F_V, RISCV::VFNCVT_F_F_W};
1416 FsqrtOp = {RISCV::VFSQRT_V};
1417 FsqrtType = TLI->getTypeToPromoteTo(ISD::FSQRT, FsqrtType);
1418 }
1419 } else {
1420 FsqrtOp = {RISCV::VFSQRT_V};
1421 }
1422
1423 return LT.first * (getRISCVInstructionCost(FsqrtOp, FsqrtType, CostKind) +
1424 getRISCVInstructionCost(ConvOp, ConvType, CostKind));
1425 }
1426 break;
1427 }
1428 case Intrinsic::cttz:
1429 case Intrinsic::ctlz:
1430 case Intrinsic::ctpop: {
1431 auto LT = getTypeLegalizationCost(RetTy);
1432 if (ST->hasStdExtZvbb() && LT.second.isVector()) {
1433 unsigned Op;
1434 switch (ICA.getID()) {
1435 case Intrinsic::cttz:
1436 Op = RISCV::VCTZ_V;
1437 break;
1438 case Intrinsic::ctlz:
1439 Op = RISCV::VCLZ_V;
1440 break;
1441 case Intrinsic::ctpop:
1442 Op = RISCV::VCPOP_V;
1443 break;
1444 }
1445 return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
1446 }
1447 break;
1448 }
1449 case Intrinsic::abs: {
1450 auto LT = getTypeLegalizationCost(RetTy);
1451 if (ST->hasVInstructions() && LT.second.isVector()) {
1452 // vrsub.vi v10, v8, 0
1453 // vmax.vv v8, v8, v10
1454 return LT.first *
1455 getRISCVInstructionCost({RISCV::VRSUB_VI, RISCV::VMAX_VV},
1456 LT.second, CostKind);
1457 }
1458 break;
1459 }
1460 case Intrinsic::get_active_lane_mask: {
1461 if (ST->hasVInstructions()) {
1462 Type *ExpRetTy = VectorType::get(
1463 ICA.getArgTypes()[0], cast<VectorType>(RetTy)->getElementCount());
1464 auto LT = getTypeLegalizationCost(ExpRetTy);
1465
1466 // vid.v v8 // considered hoisted
1467 // vsaddu.vx v8, v8, a0
1468 // vmsltu.vx v0, v8, a1
1469 return LT.first *
1470 getRISCVInstructionCost({RISCV::VSADDU_VX, RISCV::VMSLTU_VX},
1471 LT.second, CostKind);
1472 }
1473 break;
1474 }
1475 // TODO: add more intrinsic
1476 case Intrinsic::stepvector: {
1477 auto LT = getTypeLegalizationCost(RetTy);
1478 // Legalisation of illegal types involves an `index' instruction plus
1479 // (LT.first - 1) vector adds.
1480 if (ST->hasVInstructions())
1481 return getRISCVInstructionCost(RISCV::VID_V, LT.second, CostKind) +
1482 (LT.first - 1) *
1483 getRISCVInstructionCost(RISCV::VADD_VX, LT.second, CostKind);
1484 return 1 + (LT.first - 1);
1485 }
1486 case Intrinsic::experimental_cttz_elts: {
1487 Type *ArgTy = ICA.getArgTypes()[0];
1488 EVT ArgType = TLI->getValueType(DL, ArgTy, true);
1489 if (getTLI()->shouldExpandCttzElements(ArgType))
1490 break;
1491 InstructionCost Cost = getRISCVInstructionCost(
1492 RISCV::VFIRST_M, getTypeLegalizationCost(ArgTy).second, CostKind);
1493
1494 // If zero_is_poison is false, then we will generate additional
1495 // cmp + select instructions to convert -1 to EVL.
1496 Type *BoolTy = Type::getInt1Ty(RetTy->getContext());
1497 if (ICA.getArgs().size() > 1 &&
1498 cast<ConstantInt>(ICA.getArgs()[1])->isZero())
1499 Cost += getCmpSelInstrCost(Instruction::ICmp, BoolTy, RetTy,
1501 getCmpSelInstrCost(Instruction::Select, RetTy, BoolTy,
1503
1504 return Cost;
1505 }
1506 case Intrinsic::experimental_vp_splat: {
1507 auto LT = getTypeLegalizationCost(RetTy);
1508 // TODO: Lower i1 experimental_vp_splat
1509 if (!ST->hasVInstructions() || LT.second.getScalarType() == MVT::i1)
1511 return LT.first * getRISCVInstructionCost(LT.second.isFloatingPoint()
1512 ? RISCV::VFMV_V_F
1513 : RISCV::VMV_V_X,
1514 LT.second, CostKind);
1515 }
1516 case Intrinsic::experimental_vp_splice: {
1517 // To support type-based queries from the vectorizer, set the index to 0.
1518 // Note that the index only changes the cost from vslide.vx to vslide.vi,
1519 // and in the current implementation they have the same cost.
1521 cast<VectorType>(ICA.getArgTypes()[0]), {}, CostKind,
1523 }
1524 case Intrinsic::fptoui_sat:
1525 case Intrinsic::fptosi_sat: {
1527 bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
1528 Type *SrcTy = ICA.getArgTypes()[0];
1529
1530 auto SrcLT = getTypeLegalizationCost(SrcTy);
1531 auto DstLT = getTypeLegalizationCost(RetTy);
1532 if (!SrcTy->isVectorTy())
1533 break;
1534
1535 if (!SrcLT.first.isValid() || !DstLT.first.isValid())
1537
1538 Cost +=
1539 getCastInstrCost(IsSigned ? Instruction::FPToSI : Instruction::FPToUI,
1540 RetTy, SrcTy, TTI::CastContextHint::None, CostKind);
1541
1542 // Handle NaN.
1543 // vmfne v0, v8, v8 # If v8[i] is NaN set v0[i] to 1.
1544 // vmerge.vim v8, v8, 0, v0 # Convert NaN to 0.
1545 Type *CondTy = RetTy->getWithNewBitWidth(1);
1546 Cost += getCmpSelInstrCost(BinaryOperator::FCmp, SrcTy, CondTy,
1548 Cost += getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1550 return Cost;
1551 }
1552 }
1553
1554 if (ST->hasVInstructions() && RetTy->isVectorTy()) {
1555 if (auto LT = getTypeLegalizationCost(RetTy);
1556 LT.second.isVector()) {
1557 MVT EltTy = LT.second.getVectorElementType();
1558 if (const auto *Entry = CostTableLookup(VectorIntrinsicCostTable,
1559 ICA.getID(), EltTy))
1560 return LT.first * Entry->Cost;
1561 }
1562 }
1563
1565}
1566
1569 const SCEV *Ptr,
1571 // Address computations for vector indexed load/store likely require an offset
1572 // and/or scaling.
1573 if (ST->hasVInstructions() && PtrTy->isVectorTy())
1574 return getArithmeticInstrCost(Instruction::Add, PtrTy, CostKind);
1575
1576 return BaseT::getAddressComputationCost(PtrTy, SE, Ptr, CostKind);
1577}
1578
1580 Type *Src,
1583 const Instruction *I) const {
1584 bool IsVectorType = isa<VectorType>(Dst) && isa<VectorType>(Src);
1585 if (!IsVectorType)
1586 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1587
1588 // FIXME: Need to compute legalizing cost for illegal types. The current
1589 // code handles only legal types and those which can be trivially
1590 // promoted to legal.
1591 if (!ST->hasVInstructions() || Src->getScalarSizeInBits() > ST->getELen() ||
1592 Dst->getScalarSizeInBits() > ST->getELen())
1593 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1594
1595 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1596 assert(ISD && "Invalid opcode");
1597 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
1598 std::pair<InstructionCost, MVT> DstLT = getTypeLegalizationCost(Dst);
1599
1600 // Handle i1 source and dest cases *before* calling logic in BasicTTI.
1601 // The shared implementation doesn't model vector widening during legalization
1602 // and instead assumes scalarization. In order to scalarize an <N x i1>
1603 // vector, we need to extend/trunc to/from i8. If we don't special case
1604 // this, we can get an infinite recursion cycle.
1605 switch (ISD) {
1606 default:
1607 break;
1608 case ISD::SIGN_EXTEND:
1609 case ISD::ZERO_EXTEND:
1610 if (Src->getScalarSizeInBits() == 1) {
1611 // We do not use vsext/vzext to extend from mask vector.
1612 // Instead we use the following instructions to extend from mask vector:
1613 // vmv.v.i v8, 0
1614 // vmerge.vim v8, v8, -1, v0 (repeated per split)
1615 return getRISCVInstructionCost(RISCV::VMV_V_I, DstLT.second, CostKind) +
1616 DstLT.first * getRISCVInstructionCost(RISCV::VMERGE_VIM,
1617 DstLT.second, CostKind) +
1618 DstLT.first - 1;
1619 }
1620 break;
1621 case ISD::TRUNCATE:
1622 if (Dst->getScalarSizeInBits() == 1) {
1623 // We do not use a chain of vncvt instructions to truncate to a mask
1624 // vector, so we cannot use PowDiff to calculate the cost.
1625 // Instead we use the following instructions to truncate to mask vector:
1626 // vand.vi v8, v8, 1
1627 // vmsne.vi v0, v8, 0
1628 return SrcLT.first *
1629 getRISCVInstructionCost({RISCV::VAND_VI, RISCV::VMSNE_VI},
1630 SrcLT.second, CostKind) +
1631 SrcLT.first - 1;
1632 }
1633 break;
1634 };
1635
1636 // Our actual lowering for the case where a wider legal type is available
1637 // uses promotion to the wider type. This is reflected in the result of
1638 // getTypeLegalizationCost, but BasicTTI assumes the widened cases are
1639 // scalarized if the legalized Src and Dst are not equal sized.
1640 const DataLayout &DL = this->getDataLayout();
1641 if (!SrcLT.second.isVector() || !DstLT.second.isVector() ||
1642 !SrcLT.first.isValid() || !DstLT.first.isValid() ||
1643 !TypeSize::isKnownLE(DL.getTypeSizeInBits(Src),
1644 SrcLT.second.getSizeInBits()) ||
1645 !TypeSize::isKnownLE(DL.getTypeSizeInBits(Dst),
1646 DstLT.second.getSizeInBits()))
1647 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1648
1649 // The split cost is handled by the base getCastInstrCost
1650 assert((SrcLT.first == 1) && (DstLT.first == 1) && "Illegal type");
1651
1652 int PowDiff = (int)Log2_32(DstLT.second.getScalarSizeInBits()) -
1653 (int)Log2_32(SrcLT.second.getScalarSizeInBits());
1654 switch (ISD) {
1655 case ISD::SIGN_EXTEND:
1656 case ISD::ZERO_EXTEND: {
1657 if ((PowDiff < 1) || (PowDiff > 3))
1658 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1659 unsigned SExtOp[] = {RISCV::VSEXT_VF2, RISCV::VSEXT_VF4, RISCV::VSEXT_VF8};
1660 unsigned ZExtOp[] = {RISCV::VZEXT_VF2, RISCV::VZEXT_VF4, RISCV::VZEXT_VF8};
1661 unsigned Op =
1662 (ISD == ISD::SIGN_EXTEND) ? SExtOp[PowDiff - 1] : ZExtOp[PowDiff - 1];
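// Illustrative example (not part of the lookup above): a zext from i8
// elements to i32 elements has PowDiff == 2 and is therefore costed as a
// single vzext.vf4 on the legalized destination type.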
1663 return getRISCVInstructionCost(Op, DstLT.second, CostKind);
1664 }
1665 case ISD::TRUNCATE:
1666 case ISD::FP_EXTEND:
1667 case ISD::FP_ROUND: {
1668 // Counts of narrow/widen instructions.
1669 unsigned SrcEltSize = SrcLT.second.getScalarSizeInBits();
1670 unsigned DstEltSize = DstLT.second.getScalarSizeInBits();
1671
1672 unsigned Op = (ISD == ISD::TRUNCATE) ? RISCV::VNSRL_WI
1673 : (ISD == ISD::FP_EXTEND) ? RISCV::VFWCVT_F_F_V
1674 : RISCV::VFNCVT_F_F_W;
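// The loop below counts one narrowing/widening instruction per halving or
// doubling of the element size; e.g. (illustrative) an f64 -> f16 fptrunc
// is modeled as two vfncvt.f.f.w steps (f64 -> f32 -> f16).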
1676 for (; SrcEltSize != DstEltSize;) {
1677 MVT ElementMVT = (ISD == ISD::TRUNCATE)
1678 ? MVT::getIntegerVT(DstEltSize)
1679 : MVT::getFloatingPointVT(DstEltSize);
1680 MVT DstMVT = DstLT.second.changeVectorElementType(ElementMVT);
1681 DstEltSize =
1682 (DstEltSize > SrcEltSize) ? DstEltSize >> 1 : DstEltSize << 1;
1683 Cost += getRISCVInstructionCost(Op, DstMVT, CostKind);
1684 }
1685 return Cost;
1686 }
1687 case ISD::FP_TO_SINT:
1688 case ISD::FP_TO_UINT: {
1689 unsigned IsSigned = ISD == ISD::FP_TO_SINT;
1690 unsigned FCVT = IsSigned ? RISCV::VFCVT_RTZ_X_F_V : RISCV::VFCVT_RTZ_XU_F_V;
1691 unsigned FWCVT =
1692 IsSigned ? RISCV::VFWCVT_RTZ_X_F_V : RISCV::VFWCVT_RTZ_XU_F_V;
1693 unsigned FNCVT =
1694 IsSigned ? RISCV::VFNCVT_RTZ_X_F_W : RISCV::VFNCVT_RTZ_XU_F_W;
1695 unsigned SrcEltSize = Src->getScalarSizeInBits();
1696 unsigned DstEltSize = Dst->getScalarSizeInBits();
1698 if ((SrcEltSize == 16) &&
1699 (!ST->hasVInstructionsF16() || ((DstEltSize / 2) > SrcEltSize))) {
1700 // If the target only supports zvfhmin or this is an fp16-to-i64 conversion,
1701 // pre-widen to f32 and then convert from f32 to the integer type.
1702 VectorType *VecF32Ty =
1703 VectorType::get(Type::getFloatTy(Dst->getContext()),
1704 cast<VectorType>(Dst)->getElementCount());
1705 std::pair<InstructionCost, MVT> VecF32LT =
1706 getTypeLegalizationCost(VecF32Ty);
1707 Cost +=
1708 VecF32LT.first * getRISCVInstructionCost(RISCV::VFWCVT_F_F_V,
1709 VecF32LT.second, CostKind);
1710 Cost += getCastInstrCost(Opcode, Dst, VecF32Ty, CCH, CostKind, I);
1711 return Cost;
1712 }
1713 if (DstEltSize == SrcEltSize)
1714 Cost += getRISCVInstructionCost(FCVT, DstLT.second, CostKind);
1715 else if (DstEltSize > SrcEltSize)
1716 Cost += getRISCVInstructionCost(FWCVT, DstLT.second, CostKind);
1717 else { // (SrcEltSize > DstEltSize)
1718 // First do a narrowing conversion to an integer half the size, then
1719 // truncate if needed.
1720 MVT ElementVT = MVT::getIntegerVT(SrcEltSize / 2);
1721 MVT VecVT = DstLT.second.changeVectorElementType(ElementVT);
1722 Cost += getRISCVInstructionCost(FNCVT, VecVT, CostKind);
1723 if ((SrcEltSize / 2) > DstEltSize) {
1724 Type *VecTy = EVT(VecVT).getTypeForEVT(Dst->getContext());
1725 Cost +=
1726 getCastInstrCost(Instruction::Trunc, Dst, VecTy, CCH, CostKind, I);
1727 }
1728 }
1729 return Cost;
1730 }
1731 case ISD::SINT_TO_FP:
1732 case ISD::UINT_TO_FP: {
1733 unsigned IsSigned = ISD == ISD::SINT_TO_FP;
1734 unsigned FCVT = IsSigned ? RISCV::VFCVT_F_X_V : RISCV::VFCVT_F_XU_V;
1735 unsigned FWCVT = IsSigned ? RISCV::VFWCVT_F_X_V : RISCV::VFWCVT_F_XU_V;
1736 unsigned FNCVT = IsSigned ? RISCV::VFNCVT_F_X_W : RISCV::VFNCVT_F_XU_W;
1737 unsigned SrcEltSize = Src->getScalarSizeInBits();
1738 unsigned DstEltSize = Dst->getScalarSizeInBits();
1739
1741 if ((DstEltSize == 16) &&
1742 (!ST->hasVInstructionsF16() || ((SrcEltSize / 2) > DstEltSize))) {
1743 // If the target only supports zvfhmin or this is an i64-to-fp16 conversion,
1744 // the value is first converted to f32 and then narrowed to f16.
1745 VectorType *VecF32Ty =
1746 VectorType::get(Type::getFloatTy(Dst->getContext()),
1747 cast<VectorType>(Dst)->getElementCount());
1748 std::pair<InstructionCost, MVT> VecF32LT =
1749 getTypeLegalizationCost(VecF32Ty);
1750 Cost += getCastInstrCost(Opcode, VecF32Ty, Src, CCH, CostKind, I);
1751 Cost += VecF32LT.first * getRISCVInstructionCost(RISCV::VFNCVT_F_F_W,
1752 DstLT.second, CostKind);
1753 return Cost;
1754 }
1755
1756 if (DstEltSize == SrcEltSize)
1757 Cost += getRISCVInstructionCost(FCVT, DstLT.second, CostKind);
1758 else if (DstEltSize > SrcEltSize) {
1759 if ((DstEltSize / 2) > SrcEltSize) {
1760 VectorType *VecTy =
1761 VectorType::get(IntegerType::get(Dst->getContext(), DstEltSize / 2),
1762 cast<VectorType>(Dst)->getElementCount());
1763 unsigned Op = IsSigned ? Instruction::SExt : Instruction::ZExt;
1764 Cost += getCastInstrCost(Op, VecTy, Src, CCH, CostKind, I);
1765 }
1766 Cost += getRISCVInstructionCost(FWCVT, DstLT.second, CostKind);
1767 } else
1768 Cost += getRISCVInstructionCost(FNCVT, DstLT.second, CostKind);
1769 return Cost;
1770 }
1771 }
1772 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1773}
1774
1775unsigned RISCVTTIImpl::getEstimatedVLFor(VectorType *Ty) const {
1776 if (isa<ScalableVectorType>(Ty)) {
1777 const unsigned EltSize = DL.getTypeSizeInBits(Ty->getElementType());
1778 const unsigned MinSize = DL.getTypeSizeInBits(Ty).getKnownMinValue();
1779 const unsigned VectorBits = *getVScaleForTuning() * RISCV::RVVBitsPerBlock;
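// Worked example (assuming getVScaleForTuning() == 2, i.e. a 128-bit tuning
// VLEN): for <vscale x 4 x i32>, EltSize = 32 and MinSize = 128, so the
// estimated VL is 2 * 4 = 8 elements.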
1780 return RISCVTargetLowering::computeVLMAX(VectorBits, EltSize, MinSize);
1781 }
1782 return cast<FixedVectorType>(Ty)->getNumElements();
1783}
1784
1787 FastMathFlags FMF,
1789 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
1790 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
1791
1792 // Skip if scalar size of Ty is bigger than ELEN.
1793 if (Ty->getScalarSizeInBits() > ST->getELen())
1794 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
1795
1796 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1797 if (Ty->getElementType()->isIntegerTy(1)) {
1798 // SelectionDAGBuilder does the following transforms:
1799 // vector_reduce_{smin,umax}(<n x i1>) --> vector_reduce_or(<n x i1>)
1800 // vector_reduce_{smax,umin}(<n x i1>) --> vector_reduce_and(<n x i1>)
1801 if (IID == Intrinsic::umax || IID == Intrinsic::smin)
1802 return getArithmeticReductionCost(Instruction::Or, Ty, FMF, CostKind);
1803 else
1804 return getArithmeticReductionCost(Instruction::And, Ty, FMF, CostKind);
1805 }
1806
1807 if (IID == Intrinsic::maximum || IID == Intrinsic::minimum) {
1809 InstructionCost ExtraCost = 0;
1810 switch (IID) {
1811 case Intrinsic::maximum:
1812 if (FMF.noNaNs()) {
1813 Opcodes = {RISCV::VFREDMAX_VS, RISCV::VFMV_F_S};
1814 } else {
1815 Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMAX_VS,
1816 RISCV::VFMV_F_S};
1817 // Cost of materializing the canonical NaN + branch
1818 // lui a0, 523264
1819 // fmv.w.x fa0, a0
1820 Type *DstTy = Ty->getScalarType();
1821 const unsigned EltTyBits = DstTy->getScalarSizeInBits();
1822 Type *SrcTy = IntegerType::getIntNTy(DstTy->getContext(), EltTyBits);
1823 ExtraCost = 1 +
1824 getCastInstrCost(Instruction::UIToFP, DstTy, SrcTy,
1826 getCFInstrCost(Instruction::Br, CostKind);
1827 }
1828 break;
1829
1830 case Intrinsic::minimum:
1831 if (FMF.noNaNs()) {
1832 Opcodes = {RISCV::VFREDMIN_VS, RISCV::VFMV_F_S};
1833 } else {
1834 Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMIN_VS,
1835 RISCV::VFMV_F_S};
1836 // Cost of materializing the canonical NaN + branch
1837 // lui a0, 523264
1838 // fmv.w.x fa0, a0
1839 Type *DstTy = Ty->getScalarType();
1840 const unsigned EltTyBits = DL.getTypeSizeInBits(DstTy);
1841 Type *SrcTy = IntegerType::getIntNTy(DstTy->getContext(), EltTyBits);
1842 ExtraCost = 1 +
1843 getCastInstrCost(Instruction::UIToFP, DstTy, SrcTy,
1845 getCFInstrCost(Instruction::Br, CostKind);
1846 }
1847 break;
1848 }
1849 return ExtraCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1850 }
1851
1852 // An IR reduction is composed of one RVV reduction instruction and a vmv.
1853 unsigned SplitOp;
1855 switch (IID) {
1856 default:
1857 llvm_unreachable("Unsupported intrinsic");
1858 case Intrinsic::smax:
1859 SplitOp = RISCV::VMAX_VV;
1860 Opcodes = {RISCV::VREDMAX_VS, RISCV::VMV_X_S};
1861 break;
1862 case Intrinsic::smin:
1863 SplitOp = RISCV::VMIN_VV;
1864 Opcodes = {RISCV::VREDMIN_VS, RISCV::VMV_X_S};
1865 break;
1866 case Intrinsic::umax:
1867 SplitOp = RISCV::VMAXU_VV;
1868 Opcodes = {RISCV::VREDMAXU_VS, RISCV::VMV_X_S};
1869 break;
1870 case Intrinsic::umin:
1871 SplitOp = RISCV::VMINU_VV;
1872 Opcodes = {RISCV::VREDMINU_VS, RISCV::VMV_X_S};
1873 break;
1874 case Intrinsic::maxnum:
1875 SplitOp = RISCV::VFMAX_VV;
1876 Opcodes = {RISCV::VFREDMAX_VS, RISCV::VFMV_F_S};
1877 break;
1878 case Intrinsic::minnum:
1879 SplitOp = RISCV::VFMIN_VV;
1880 Opcodes = {RISCV::VFREDMIN_VS, RISCV::VFMV_F_S};
1881 break;
1882 }
1883 // Add a cost for data larger than LMUL8
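// A type that legalizes to LT.first > 1 register groups needs LT.first - 1
// pairwise SplitOp instructions (e.g. vmax.vv) to combine the groups before
// the single reduction, which is what SplitCost models below.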
1884 InstructionCost SplitCost =
1885 (LT.first > 1) ? (LT.first - 1) *
1886 getRISCVInstructionCost(SplitOp, LT.second, CostKind)
1887 : 0;
1888 return SplitCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1889}
1890
1893 std::optional<FastMathFlags> FMF,
1895 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
1896 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1897
1898 // Skip if scalar size of Ty is bigger than ELEN.
1899 if (Ty->getScalarSizeInBits() > ST->getELen())
1900 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1901
1902 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1903 assert(ISD && "Invalid opcode");
1904
1905 if (ISD != ISD::ADD && ISD != ISD::OR && ISD != ISD::XOR && ISD != ISD::AND &&
1906 ISD != ISD::FADD)
1907 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1908
1909 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1910 Type *ElementTy = Ty->getElementType();
1911 if (ElementTy->isIntegerTy(1)) {
1912 // Example sequences:
1913 // vfirst.m a0, v0
1914 // seqz a0, a0
1915 if (LT.second == MVT::v1i1)
1916 return getRISCVInstructionCost(RISCV::VFIRST_M, LT.second, CostKind) +
1917 getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
1919
1920 if (ISD == ISD::AND) {
1921 // Example sequences:
1922 // vmand.mm v8, v9, v8 ; needed every time type is split
1923 // vmnot.m v8, v0 ; alias for vmnand
1924 // vcpop.m a0, v8
1925 // seqz a0, a0
1926
1927 // See the discussion: https://github.com/llvm/llvm-project/pull/119160
1928 // For LMUL <= 8, there is no splitting,
1929 // the sequences are vmnot, vcpop and seqz.
1930 // When LMUL > 8 and split = 1,
1931 // the sequences are vmnand, vcpop and seqz.
1932 // When LMUL > 8 and split > 1,
1933 // the sequences are (LT.first-2) * vmand, vmnand, vcpop and seqz.
1934 return ((LT.first > 2) ? (LT.first - 2) : 0) *
1935 getRISCVInstructionCost(RISCV::VMAND_MM, LT.second, CostKind) +
1936 getRISCVInstructionCost(RISCV::VMNAND_MM, LT.second, CostKind) +
1937 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind) +
1938 getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
1940 } else if (ISD == ISD::XOR || ISD == ISD::ADD) {
1941 // Example sequences:
1942 // vsetvli a0, zero, e8, mf8, ta, ma
1943 // vmxor.mm v8, v0, v8 ; needed every time type is split
1944 // vcpop.m a0, v8
1945 // andi a0, a0, 1
1946 return (LT.first - 1) *
1947 getRISCVInstructionCost(RISCV::VMXOR_MM, LT.second, CostKind) +
1948 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind) + 1;
1949 } else {
1950 assert(ISD == ISD::OR);
1951 // Example sequences:
1952 // vsetvli a0, zero, e8, mf8, ta, ma
1953 // vmor.mm v8, v9, v8 ; needed every time type is split
1954 // vcpop.m a0, v0
1955 // snez a0, a0
1956 return (LT.first - 1) *
1957 getRISCVInstructionCost(RISCV::VMOR_MM, LT.second, CostKind) +
1958 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind) +
1959 getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
1961 }
1962 }
1963
1964 // An IR reduction of or/and is composed of one vmv and one RVV reduction
1965 // instruction; the other operations are composed of two vmv and one RVV
1966 // reduction instruction.
1967 unsigned SplitOp;
1969 switch (ISD) {
1970 case ISD::ADD:
1971 SplitOp = RISCV::VADD_VV;
1972 Opcodes = {RISCV::VMV_S_X, RISCV::VREDSUM_VS, RISCV::VMV_X_S};
1973 break;
1974 case ISD::OR:
1975 SplitOp = RISCV::VOR_VV;
1976 Opcodes = {RISCV::VREDOR_VS, RISCV::VMV_X_S};
1977 break;
1978 case ISD::XOR:
1979 SplitOp = RISCV::VXOR_VV;
1980 Opcodes = {RISCV::VMV_S_X, RISCV::VREDXOR_VS, RISCV::VMV_X_S};
1981 break;
1982 case ISD::AND:
1983 SplitOp = RISCV::VAND_VV;
1984 Opcodes = {RISCV::VREDAND_VS, RISCV::VMV_X_S};
1985 break;
1986 case ISD::FADD:
1987 // We can't promote f16/bf16 fadd reductions.
1988 if ((LT.second.getScalarType() == MVT::f16 && !ST->hasVInstructionsF16()) ||
1989 LT.second.getScalarType() == MVT::bf16)
1990 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1992 Opcodes.push_back(RISCV::VFMV_S_F);
1993 for (unsigned i = 0; i < LT.first.getValue(); i++)
1994 Opcodes.push_back(RISCV::VFREDOSUM_VS);
1995 Opcodes.push_back(RISCV::VFMV_F_S);
1996 return getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1997 }
1998 SplitOp = RISCV::VFADD_VV;
1999 Opcodes = {RISCV::VFMV_S_F, RISCV::VFREDUSUM_VS, RISCV::VFMV_F_S};
2000 break;
2001 }
2002 // Add a cost for data larger than LMUL8
2003 InstructionCost SplitCost =
2004 (LT.first > 1) ? (LT.first - 1) *
2005 getRISCVInstructionCost(SplitOp, LT.second, CostKind)
2006 : 0;
2007 return SplitCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
2008}
2009
2011 unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy,
2012 std::optional<FastMathFlags> FMF, TTI::TargetCostKind CostKind) const {
2013 if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
2014 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
2015 FMF, CostKind);
2016
2017 // Skip if scalar size of ResTy is bigger than ELEN.
2018 if (ResTy->getScalarSizeInBits() > ST->getELen())
2019 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
2020 FMF, CostKind);
2021
2022 if (Opcode != Instruction::Add && Opcode != Instruction::FAdd)
2023 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
2024 FMF, CostKind);
2025
2026 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
2027
2028 if (IsUnsigned && Opcode == Instruction::Add &&
2029 LT.second.isFixedLengthVector() && LT.second.getScalarType() == MVT::i1) {
2030 // Represent vector_reduce_add(ZExt(<n x i1>)) as
2031 // ZExtOrTrunc(ctpop(bitcast <n x i1> to iN)).
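// E.g. (illustrative): vector_reduce_add(zext <8 x i1> %m to <8 x i32>) is
// costed as a single vcpop.m per legalized part.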
2032 return LT.first *
2033 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind);
2034 }
2035
2036 if (ResTy->getScalarSizeInBits() != 2 * LT.second.getScalarSizeInBits())
2037 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
2038 FMF, CostKind);
2039
2040 return (LT.first - 1) +
2041 getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
2042}
2043
2047 assert(OpInfo.isConstant() && "non constant operand?");
2048 if (!isa<VectorType>(Ty))
2049 // FIXME: We need to account for immediate materialization here, but doing
2050 // a decent job requires more knowledge about the immediate than we
2051 // currently have here.
2052 return 0;
2053
2054 if (OpInfo.isUniform())
2055 // vmv.v.i, vmv.v.x, or vfmv.v.f
2056 // We ignore the cost of the scalar constant materialization to be consistent
2057 // with how we treat scalar constants themselves just above.
2058 return 1;
2059
2060 return getConstantPoolLoadCost(Ty, CostKind);
2061}
2062
2064 Align Alignment,
2065 unsigned AddressSpace,
2067 TTI::OperandValueInfo OpInfo,
2068 const Instruction *I) const {
2069 EVT VT = TLI->getValueType(DL, Src, true);
2070 // Type legalization can't handle structs
2071 if (VT == MVT::Other)
2072 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
2073 CostKind, OpInfo, I);
2074
2076 if (Opcode == Instruction::Store && OpInfo.isConstant())
2077 Cost += getStoreImmCost(Src, OpInfo, CostKind);
2078
2079 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
2080
2081 InstructionCost BaseCost = [&]() {
2082 InstructionCost Cost = LT.first;
2084 return Cost;
2085
2086 // Our actual lowering for the case where a wider legal type is available
2087 // uses a VL-predicated load on the wider type. This is reflected in
2088 // the result of getTypeLegalizationCost, but BasicTTI assumes the
2089 // widened cases are scalarized.
2090 const DataLayout &DL = this->getDataLayout();
2091 if (Src->isVectorTy() && LT.second.isVector() &&
2092 TypeSize::isKnownLT(DL.getTypeStoreSizeInBits(Src),
2093 LT.second.getSizeInBits()))
2094 return Cost;
2095
2096 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
2097 CostKind, OpInfo, I);
2098 }();
2099
2100 // Assume memory op costs scale with the number of vector registers
2101 // possibly accessed by the instruction. Note that BasicTTI already
2102 // handles the LT.first term for us.
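// getLMULCost is assumed to grow roughly linearly with LMUL, so e.g. an m4
// access is modeled as about four times the cost of an m1 access.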
2103 if (LT.second.isVector() && CostKind != TTI::TCK_CodeSize)
2104 BaseCost *= TLI->getLMULCost(LT.second);
2105 return Cost + BaseCost;
2106}
2107
2109 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
2111 TTI::OperandValueInfo Op2Info, const Instruction *I) const {
2113 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2114 Op1Info, Op2Info, I);
2115
2116 if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
2117 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2118 Op1Info, Op2Info, I);
2119
2120 // Skip if scalar size of ValTy is bigger than ELEN.
2121 if (ValTy->isVectorTy() && ValTy->getScalarSizeInBits() > ST->getELen())
2122 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2123 Op1Info, Op2Info, I);
2124
2125 auto GetConstantMatCost =
2126 [&](TTI::OperandValueInfo OpInfo) -> InstructionCost {
2127 if (OpInfo.isUniform())
2128 // We return 0 because we currently ignore the cost of materializing
2129 // scalar constants in GPRs.
2130 return 0;
2131
2132 return getConstantPoolLoadCost(ValTy, CostKind);
2133 };
2134
2135 InstructionCost ConstantMatCost;
2136 if (Op1Info.isConstant())
2137 ConstantMatCost += GetConstantMatCost(Op1Info);
2138 if (Op2Info.isConstant())
2139 ConstantMatCost += GetConstantMatCost(Op2Info);
2140
2141 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
2142 if (Opcode == Instruction::Select && ValTy->isVectorTy()) {
2143 if (CondTy->isVectorTy()) {
2144 if (ValTy->getScalarSizeInBits() == 1) {
2145 // vmandn.mm v8, v8, v9
2146 // vmand.mm v9, v0, v9
2147 // vmor.mm v0, v9, v8
2148 return ConstantMatCost +
2149 LT.first *
2150 getRISCVInstructionCost(
2151 {RISCV::VMANDN_MM, RISCV::VMAND_MM, RISCV::VMOR_MM},
2152 LT.second, CostKind);
2153 }
2154 // vselect and max/min are supported natively.
2155 return ConstantMatCost +
2156 LT.first * getRISCVInstructionCost(RISCV::VMERGE_VVM, LT.second,
2157 CostKind);
2158 }
2159
2160 if (ValTy->getScalarSizeInBits() == 1) {
2161 // vmv.v.x v9, a0
2162 // vmsne.vi v9, v9, 0
2163 // vmandn.mm v8, v8, v9
2164 // vmand.mm v9, v0, v9
2165 // vmor.mm v0, v9, v8
2166 MVT InterimVT = LT.second.changeVectorElementType(MVT::i8);
2167 return ConstantMatCost +
2168 LT.first *
2169 getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI},
2170 InterimVT, CostKind) +
2171 LT.first * getRISCVInstructionCost(
2172 {RISCV::VMANDN_MM, RISCV::VMAND_MM, RISCV::VMOR_MM},
2173 LT.second, CostKind);
2174 }
2175
2176 // vmv.v.x v10, a0
2177 // vmsne.vi v0, v10, 0
2178 // vmerge.vvm v8, v9, v8, v0
2179 return ConstantMatCost +
2180 LT.first * getRISCVInstructionCost(
2181 {RISCV::VMV_V_X, RISCV::VMSNE_VI, RISCV::VMERGE_VVM},
2182 LT.second, CostKind);
2183 }
2184
2185 if ((Opcode == Instruction::ICmp) && ValTy->isVectorTy() &&
2186 CmpInst::isIntPredicate(VecPred)) {
2187 // Use VMSLT_VV to represent VMSEQ, VMSNE, VMSLTU, VMSLEU, VMSLT, VMSLE
2188 // provided they incur the same cost across all implementations
2189 return ConstantMatCost + LT.first * getRISCVInstructionCost(RISCV::VMSLT_VV,
2190 LT.second,
2191 CostKind);
2192 }
2193
2194 if ((Opcode == Instruction::FCmp) && ValTy->isVectorTy() &&
2195 CmpInst::isFPPredicate(VecPred)) {
2196
2197 // Use VMXOR_MM and VMXNOR_MM to generate all true/false mask
2198 if ((VecPred == CmpInst::FCMP_FALSE) || (VecPred == CmpInst::FCMP_TRUE))
2199 return ConstantMatCost +
2200 getRISCVInstructionCost(RISCV::VMXOR_MM, LT.second, CostKind);
2201
2202 // If we do not support the input floating point vector type, fall back to
2203 // the base implementation, which computes the cost as:
2204 // ScalarizeCost + Num * Cost for fixed vector,
2205 // InvalidCost for scalable vector.
2206 if ((ValTy->getScalarSizeInBits() == 16 && !ST->hasVInstructionsF16()) ||
2207 (ValTy->getScalarSizeInBits() == 32 && !ST->hasVInstructionsF32()) ||
2208 (ValTy->getScalarSizeInBits() == 64 && !ST->hasVInstructionsF64()))
2209 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2210 Op1Info, Op2Info, I);
2211
2212 // Assuming vector fp compare and mask instructions are all the same cost
2213 // until a need arises to differentiate them.
2214 switch (VecPred) {
2215 case CmpInst::FCMP_ONE: // vmflt.vv + vmflt.vv + vmor.mm
2216 case CmpInst::FCMP_ORD: // vmfeq.vv + vmfeq.vv + vmand.mm
2217 case CmpInst::FCMP_UNO: // vmfne.vv + vmfne.vv + vmor.mm
2218 case CmpInst::FCMP_UEQ: // vmflt.vv + vmflt.vv + vmnor.mm
2219 return ConstantMatCost +
2220 LT.first * getRISCVInstructionCost(
2221 {RISCV::VMFLT_VV, RISCV::VMFLT_VV, RISCV::VMOR_MM},
2222 LT.second, CostKind);
2223
2224 case CmpInst::FCMP_UGT: // vmfle.vv + vmnot.m
2225 case CmpInst::FCMP_UGE: // vmflt.vv + vmnot.m
2226 case CmpInst::FCMP_ULT: // vmfle.vv + vmnot.m
2227 case CmpInst::FCMP_ULE: // vmflt.vv + vmnot.m
2228 return ConstantMatCost +
2229 LT.first *
2230 getRISCVInstructionCost({RISCV::VMFLT_VV, RISCV::VMNAND_MM},
2231 LT.second, CostKind);
2232
2233 case CmpInst::FCMP_OEQ: // vmfeq.vv
2234 case CmpInst::FCMP_OGT: // vmflt.vv
2235 case CmpInst::FCMP_OGE: // vmfle.vv
2236 case CmpInst::FCMP_OLT: // vmflt.vv
2237 case CmpInst::FCMP_OLE: // vmfle.vv
2238 case CmpInst::FCMP_UNE: // vmfne.vv
2239 return ConstantMatCost +
2240 LT.first *
2241 getRISCVInstructionCost(RISCV::VMFLT_VV, LT.second, CostKind);
2242 default:
2243 break;
2244 }
2245 }
2246
2247 // With ShortForwardBranchOpt or ConditionalMoveFusion, scalar icmp + select
2248 // instructions will lower to SELECT_CC and then to PseudoCCMOVGPR, which will
2249 // generate a conditional branch + mv. The cost of scalar (icmp + select) will
2250 // be (0 + select instr cost).
2251 if (ST->hasConditionalMoveFusion() && I && isa<ICmpInst>(I) &&
2252 ValTy->isIntegerTy() && !I->user_empty()) {
2253 if (all_of(I->users(), [&](const User *U) {
2254 return match(U, m_Select(m_Specific(I), m_Value(), m_Value())) &&
2255 U->getType()->isIntegerTy() &&
2256 !isa<ConstantData>(U->getOperand(1)) &&
2257 !isa<ConstantData>(U->getOperand(2));
2258 }))
2259 return 0;
2260 }
2261
2262 // TODO: Add cost for scalar type.
2263
2264 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2265 Op1Info, Op2Info, I);
2266}
2267
2270 const Instruction *I) const {
2272 return Opcode == Instruction::PHI ? 0 : 1;
2273 // Branches are assumed to be predicted.
2274 return 0;
2275}
2276
2279 unsigned Index,
2280 const Value *Op0,
2281 const Value *Op1) const {
2282 assert(Val->isVectorTy() && "This must be a vector type");
2283
2284 if (Opcode != Instruction::ExtractElement &&
2285 Opcode != Instruction::InsertElement)
2286 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
2287
2288 // Legalize the type.
2289 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
2290
2291 // This type is legalized to a scalar type.
2292 if (!LT.second.isVector()) {
2293 auto *FixedVecTy = cast<FixedVectorType>(Val);
2294 // If Index is a known constant, cost is zero.
2295 if (Index != -1U)
2296 return 0;
2297 // Extract/InsertElement with non-constant index is very costly when
2298 // scalarized; estimate cost of loads/stores sequence via the stack:
2299 // ExtractElement cost: store vector to stack, load scalar;
2300 // InsertElement cost: store vector to stack, store scalar, load vector.
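// E.g. (illustrative): for a <4 x i32> on a configuration where it is
// legalized to scalars, an extract with an unknown index is modeled as
// 4 stores plus 1 load, and an insert as 4 (store + load) pairs plus one
// extra store.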
2301 Type *ElemTy = FixedVecTy->getElementType();
2302 auto NumElems = FixedVecTy->getNumElements();
2303 auto Align = DL.getPrefTypeAlign(ElemTy);
2304 InstructionCost LoadCost =
2305 getMemoryOpCost(Instruction::Load, ElemTy, Align, 0, CostKind);
2306 InstructionCost StoreCost =
2307 getMemoryOpCost(Instruction::Store, ElemTy, Align, 0, CostKind);
2308 return Opcode == Instruction::ExtractElement
2309 ? StoreCost * NumElems + LoadCost
2310 : (StoreCost + LoadCost) * NumElems + StoreCost;
2311 }
2312
2313 // For unsupported scalable vector.
2314 if (LT.second.isScalableVector() && !LT.first.isValid())
2315 return LT.first;
2316
2317 // Mask vector extract/insert is expanded via e8.
2318 if (Val->getScalarSizeInBits() == 1) {
2319 VectorType *WideTy =
2321 cast<VectorType>(Val)->getElementCount());
2322 if (Opcode == Instruction::ExtractElement) {
2323 InstructionCost ExtendCost
2324 = getCastInstrCost(Instruction::ZExt, WideTy, Val,
2326 InstructionCost ExtractCost
2327 = getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
2328 return ExtendCost + ExtractCost;
2329 }
2330 InstructionCost ExtendCost
2331 = getCastInstrCost(Instruction::ZExt, WideTy, Val,
2333 InstructionCost InsertCost
2334 = getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
2335 InstructionCost TruncCost
2336 = getCastInstrCost(Instruction::Trunc, Val, WideTy,
2338 return ExtendCost + InsertCost + TruncCost;
2339 }
2340
2341
2342 // In RVV, we could use vslidedown + vmv.x.s to extract element from vector
2343 // and vslideup + vmv.s.x to insert element to vector.
2344 unsigned BaseCost = 1;
2345 // For insertelement we also need an addi to add 1 to the index that is fed to vslideup.
2346 unsigned SlideCost = Opcode == Instruction::InsertElement ? 2 : 1;
2347
2348 if (Index != -1U) {
2349 // The type may be split. For fixed-width vectors we can normalize the
2350 // index to the new type.
2351 if (LT.second.isFixedLengthVector()) {
2352 unsigned Width = LT.second.getVectorNumElements();
2353 Index = Index % Width;
2354 }
2355
2356 // If exact VLEN is known, we will insert/extract into the appropriate
2357 // subvector with no additional subvector insert/extract cost.
2358 if (auto VLEN = ST->getRealVLen()) {
2359 unsigned EltSize = LT.second.getScalarSizeInBits();
2360 unsigned M1Max = *VLEN / EltSize;
2361 Index = Index % M1Max;
2362 }
2363
2364 if (Index == 0)
2365 // We can extract/insert the first element without vslidedown/vslideup.
2366 SlideCost = 0;
2367 else if (ST->hasVendorXRivosVisni() && isUInt<5>(Index) &&
2368 Val->getScalarType()->isIntegerTy())
2369 SlideCost = 0; // With ri.vinsert/ri.vextract there is no slide needed
2370 else if (Opcode == Instruction::InsertElement)
2371 SlideCost = 1; // With a constant index, we do not need to use addi.
2372 }
2373
2374 // When the vector needs to be split into multiple register groups and the
2375 // index exceeds a single vector register group, we need to insert/extract
2376 // the element via the stack.
2377 if (LT.first > 1 &&
2378 ((Index == -1U) || (Index >= LT.second.getVectorMinNumElements() &&
2379 LT.second.isScalableVector()))) {
2380 Type *ScalarType = Val->getScalarType();
2381 Align VecAlign = DL.getPrefTypeAlign(Val);
2382 Align SclAlign = DL.getPrefTypeAlign(ScalarType);
2383 // Extra addi for unknown index.
2384 InstructionCost IdxCost = Index == -1U ? 1 : 0;
2385
2386 // Store all split vectors into stack and load the target element.
2387 if (Opcode == Instruction::ExtractElement)
2388 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
2389 getMemoryOpCost(Instruction::Load, ScalarType, SclAlign, 0,
2390 CostKind) +
2391 IdxCost;
2392
2393 // Store all split vectors into stack and store the target element and load
2394 // vectors back.
2395 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
2396 getMemoryOpCost(Instruction::Load, Val, VecAlign, 0, CostKind) +
2397 getMemoryOpCost(Instruction::Store, ScalarType, SclAlign, 0,
2398 CostKind) +
2399 IdxCost;
2400 }
2401
2402 // Extracting an i64 element on a target with XLEN=32 needs more instructions.
2403 if (Val->getScalarType()->isIntegerTy() &&
2404 ST->getXLen() < Val->getScalarSizeInBits()) {
2405 // For extractelement, we need the following instructions:
2406 // vsetivli zero, 1, e64, m1, ta, mu (not count)
2407 // vslidedown.vx v8, v8, a0
2408 // vmv.x.s a0, v8
2409 // li a1, 32
2410 // vsrl.vx v8, v8, a1
2411 // vmv.x.s a1, v8
2412
2413 // For insertelement, we need the following instructions:
2414 // vsetivli zero, 2, e32, m4, ta, mu (not count)
2415 // vmv.v.i v12, 0
2416 // vslide1up.vx v16, v12, a1
2417 // vslide1up.vx v12, v16, a0
2418 // addi a0, a2, 1
2419 // vsetvli zero, a0, e64, m4, tu, mu (not count)
2420 // vslideup.vx v8, v12, a2
2421
2422 // TODO: should we count these special vsetvlis?
2423 BaseCost = Opcode == Instruction::InsertElement ? 3 : 4;
2424 }
2425 return BaseCost + SlideCost;
2426}
2427
2431 unsigned Index) const {
2432 if (isa<FixedVectorType>(Val))
2434 Index);
2435
2436 // TODO: This code replicates what LoopVectorize.cpp used to do when asking
2437 // for the cost of extracting the last lane of a scalable vector. It probably
2438 // needs a more accurate cost.
2439 ElementCount EC = cast<VectorType>(Val)->getElementCount();
2440 assert(Index < EC.getKnownMinValue() && "Unexpected reverse index");
2441 return getVectorInstrCost(Opcode, Val, CostKind,
2442 EC.getKnownMinValue() - 1 - Index, nullptr,
2443 nullptr);
2444}
2445
2447 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
2449 ArrayRef<const Value *> Args, const Instruction *CxtI) const {
2450
2451 // TODO: Handle more cost kinds.
2453 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2454 Args, CxtI);
2455
2456 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
2457 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2458 Args, CxtI);
2459
2460 // Skip if scalar size of Ty is bigger than ELEN.
2461 if (isa<VectorType>(Ty) && Ty->getScalarSizeInBits() > ST->getELen())
2462 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2463 Args, CxtI);
2464
2465 // Legalize the type.
2466 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
2467
2468 // TODO: Handle scalar type.
2469 if (!LT.second.isVector())
2470 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2471 Args, CxtI);
2472
2473 // f16 with zvfhmin and bf16 will be promoted to f32.
2474 // FIXME: nxv32[b]f16 will be custom lowered and split.
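// The promotion is costed below as: extend every vector argument to the
// promoted (f32) type, perform the operation on the promoted type, then
// truncate the result back.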
2475 unsigned ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
2476 InstructionCost CastCost = 0;
2477 if ((LT.second.getVectorElementType() == MVT::f16 ||
2478 LT.second.getVectorElementType() == MVT::bf16) &&
2479 TLI->getOperationAction(ISDOpcode, LT.second) ==
2481 MVT PromotedVT = TLI->getTypeToPromoteTo(ISDOpcode, LT.second);
2482 Type *PromotedTy = EVT(PromotedVT).getTypeForEVT(Ty->getContext());
2483 Type *LegalTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
2484 // Add cost of extending arguments
2485 CastCost += LT.first * Args.size() *
2486 getCastInstrCost(Instruction::FPExt, PromotedTy, LegalTy,
2488 // Add cost of truncating result
2489 CastCost +=
2490 LT.first * getCastInstrCost(Instruction::FPTrunc, LegalTy, PromotedTy,
2492 // Compute cost of op in promoted type
2493 LT.second = PromotedVT;
2494 }
2495
2496 auto getConstantMatCost =
2497 [&](unsigned Operand, TTI::OperandValueInfo OpInfo) -> InstructionCost {
2498 if (OpInfo.isUniform() && canSplatOperand(Opcode, Operand))
2499 // Two sub-cases:
2500 // * Has a 5 bit immediate operand which can be splatted.
2501 // * Has a larger immediate which must be materialized in scalar register
2502 // We return 0 for both as we currently ignore the cost of materializing
2503 // scalar constants in GPRs.
2504 return 0;
2505
2506 return getConstantPoolLoadCost(Ty, CostKind);
2507 };
2508
2509 // Add the cost of materializing any constant vectors required.
2510 InstructionCost ConstantMatCost = 0;
2511 if (Op1Info.isConstant())
2512 ConstantMatCost += getConstantMatCost(0, Op1Info);
2513 if (Op2Info.isConstant())
2514 ConstantMatCost += getConstantMatCost(1, Op2Info);
2515
2516 unsigned Op;
2517 switch (ISDOpcode) {
2518 case ISD::ADD:
2519 case ISD::SUB:
2520 Op = RISCV::VADD_VV;
2521 break;
2522 case ISD::SHL:
2523 case ISD::SRL:
2524 case ISD::SRA:
2525 Op = RISCV::VSLL_VV;
2526 break;
2527 case ISD::AND:
2528 case ISD::OR:
2529 case ISD::XOR:
2530 Op = (Ty->getScalarSizeInBits() == 1) ? RISCV::VMAND_MM : RISCV::VAND_VV;
2531 break;
2532 case ISD::MUL:
2533 case ISD::MULHS:
2534 case ISD::MULHU:
2535 Op = RISCV::VMUL_VV;
2536 break;
2537 case ISD::SDIV:
2538 case ISD::UDIV:
2539 Op = RISCV::VDIV_VV;
2540 break;
2541 case ISD::SREM:
2542 case ISD::UREM:
2543 Op = RISCV::VREM_VV;
2544 break;
2545 case ISD::FADD:
2546 case ISD::FSUB:
2547 Op = RISCV::VFADD_VV;
2548 break;
2549 case ISD::FMUL:
2550 Op = RISCV::VFMUL_VV;
2551 break;
2552 case ISD::FDIV:
2553 Op = RISCV::VFDIV_VV;
2554 break;
2555 case ISD::FNEG:
2556 Op = RISCV::VFSGNJN_VV;
2557 break;
2558 default:
2559 // Assuming all other instructions have the same cost until a need arises to
2560 // differentiate them.
2561 return CastCost + ConstantMatCost +
2562 BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2563 Args, CxtI);
2564 }
2565
2566 InstructionCost InstrCost = getRISCVInstructionCost(Op, LT.second, CostKind);
2567 // We use BasicTTIImpl to calculate scalar costs, which assumes floating point
2568 // ops are twice as expensive as integer ops. Do the same for vectors so
2569 // scalar floating point ops aren't cheaper than their vector equivalents.
2570 if (Ty->isFPOrFPVectorTy())
2571 InstrCost *= 2;
2572 return CastCost + ConstantMatCost + LT.first * InstrCost;
2573}
2574
2575// TODO: Deduplicate from TargetTransformInfoImplCRTPBase.
2577 ArrayRef<const Value *> Ptrs, const Value *Base,
2578 const TTI::PointersChainInfo &Info, Type *AccessTy,
2581 // In the basic model we only take GEP instructions into account (although
2582 // an alloca instruction, a plain value, constants and/or constant
2583 // expressions, PHIs, bitcasts, or anything else allowed to be used as a
2584 // pointer may appear here). Typically, if Base is not a GEP instruction and
2585 // all the pointers are relative to the same base address, the rest are
2586 // either GEP instructions, PHIs, bitcasts or constants. When we have the
2587 // same base, we just calculate the cost of each non-Base GEP as an ADD
2588 // operation if any of its indices is non-constant.
2589 // If there are no known dependencies between the pointers, the cost is
2590 // calculated as the sum of the costs of the GEP instructions.
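// E.g. (illustrative): for a unit-stride chain of i32 accesses off a common
// base, each non-base GEP whose offset folds into a legal BaseReg + 4*i
// addressing mode adds nothing; otherwise it is costed as a scalar ADD.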
2591 for (auto [I, V] : enumerate(Ptrs)) {
2592 const auto *GEP = dyn_cast<GetElementPtrInst>(V);
2593 if (!GEP)
2594 continue;
2595 if (Info.isSameBase() && V != Base) {
2596 if (GEP->hasAllConstantIndices())
2597 continue;
2598 // If the chain is unit-stride and BaseReg + stride*i is a legal
2599 // addressing mode, then presume the base GEP is sitting around in a
2600 // register somewhere and check if we can fold the offset relative to
2601 // it.
2602 unsigned Stride = DL.getTypeStoreSize(AccessTy);
2603 if (Info.isUnitStride() &&
2604 isLegalAddressingMode(AccessTy,
2605 /* BaseGV */ nullptr,
2606 /* BaseOffset */ Stride * I,
2607 /* HasBaseReg */ true,
2608 /* Scale */ 0,
2609 GEP->getType()->getPointerAddressSpace()))
2610 continue;
2611 Cost += getArithmeticInstrCost(Instruction::Add, GEP->getType(), CostKind,
2612 {TTI::OK_AnyValue, TTI::OP_None},
2613 {TTI::OK_AnyValue, TTI::OP_None}, {});
2614 } else {
2615 SmallVector<const Value *> Indices(GEP->indices());
2616 Cost += getGEPCost(GEP->getSourceElementType(), GEP->getPointerOperand(),
2617 Indices, AccessTy, CostKind);
2618 }
2619 }
2620 return Cost;
2621}
2622
2625 OptimizationRemarkEmitter *ORE) const {
2626 // TODO: More tuning on benchmarks and metrics, with changes applied as
2627 // needed, would be useful for all of the settings below.
2628
2629
2630 if (ST->enableDefaultUnroll())
2631 return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);
2632
2633 // Enable Upper bound unrolling universally, not dependent upon the conditions
2634 // below.
2635 UP.UpperBound = true;
2636
2637 // Disable loop unrolling for Oz and Os.
2638 UP.OptSizeThreshold = 0;
2640 if (L->getHeader()->getParent()->hasOptSize())
2641 return;
2642
2643 SmallVector<BasicBlock *, 4> ExitingBlocks;
2644 L->getExitingBlocks(ExitingBlocks);
2645 LLVM_DEBUG(dbgs() << "Loop has:\n"
2646 << "Blocks: " << L->getNumBlocks() << "\n"
2647 << "Exit blocks: " << ExitingBlocks.size() << "\n");
2648
2649 // Only allow another exit other than the latch. This acts as an early exit
2650 // as it mirrors the profitability calculation of the runtime unroller.
2651 if (ExitingBlocks.size() > 2)
2652 return;
2653
2654 // Limit the CFG of the loop body for targets with a branch predictor.
2655 // Allowing 4 blocks permits if-then-else diamonds in the body.
2656 if (L->getNumBlocks() > 4)
2657 return;
2658
2659 // Scan the loop: don't unroll loops with calls as this could prevent
2660 // inlining. Don't unroll auto-vectorized loops either, though do allow
2661 // unrolling of the scalar remainder.
2662 bool IsVectorized = getBooleanLoopAttribute(L, "llvm.loop.isvectorized");
2664 for (auto *BB : L->getBlocks()) {
2665 for (auto &I : *BB) {
2666 // Both auto-vectorized loops and the scalar remainder have the
2667 // isvectorized attribute, so differentiate between them by the presence
2668 // of vector instructions.
2669 if (IsVectorized && I.getType()->isVectorTy())
2670 return;
2671
2672 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
2673 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
2674 if (!isLoweredToCall(F))
2675 continue;
2676 }
2677 return;
2678 }
2679
2680 SmallVector<const Value *> Operands(I.operand_values());
2683 }
2684 }
2685
2686 LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
2687
2688 UP.Partial = true;
2689 UP.Runtime = true;
2690 UP.UnrollRemainder = true;
2691 UP.UnrollAndJam = true;
2692
2693 // Force-unrolling small loops can be very useful because of the
2694 // branch-taken cost of the backedge.
2695 if (Cost < 12)
2696 UP.Force = true;
2697}
2698
2703
2705 if (Ty->isVectorTy()) {
2706 // f16 with only zvfhmin and bf16 will be promoted to f32
2707 Type *EltTy = cast<VectorType>(Ty)->getElementType();
2708 if ((EltTy->isHalfTy() && !ST->hasVInstructionsF16()) ||
2709 EltTy->isBFloatTy())
2710 Ty = VectorType::get(Type::getFloatTy(Ty->getContext()),
2711 cast<VectorType>(Ty));
2712
2713 TypeSize Size = DL.getTypeSizeInBits(Ty);
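// E.g. (illustrative): a scalable <vscale x 8 x i32> has a known minimum
// size of 256 bits, i.e. divideCeil(256, RVVBitsPerBlock = 64) = 4 vector
// registers.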
2714 if (Size.isScalable() && ST->hasVInstructions())
2715 return divideCeil(Size.getKnownMinValue(), RISCV::RVVBitsPerBlock);
2716
2717 if (ST->useRVVForFixedLengthVectors())
2718 return divideCeil(Size, ST->getRealMinVLen());
2719 }
2720
2721 return BaseT::getRegUsageForType(Ty);
2722}
2723
2724unsigned RISCVTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
2725 if (SLPMaxVF.getNumOccurrences())
2726 return SLPMaxVF;
2727
2728 // Return how many elements can fit in getRegisterBitWidth. This is the
2729 // same routine as used in LoopVectorizer. We should probably be
2730 // accounting for whether we actually have instructions with the right
2731 // lane type, but we don't have enough information to do that without
2732 // some additional plumbing which hasn't been justified yet.
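// E.g. (illustrative, assuming getRegisterBitWidth reports 128 fixed bits):
// with ElemWidth == 32 the SLP vectorizer is offered a maximum VF of 4.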
2733 TypeSize RegWidth =
2735 // If no vector registers, or absurd element widths, disable
2736 // vectorization by returning 1.
2737 return std::max<unsigned>(1U, RegWidth.getFixedValue() / ElemWidth);
2738}
2739
2743
2745 return ST->enableUnalignedVectorMem();
2746}
2747
2750 ScalarEvolution *SE) const {
2751 if (ST->hasVendorXCVmem() && !ST->is64Bit())
2752 return TTI::AMK_PostIndexed;
2753
2755}
2756
2758 const TargetTransformInfo::LSRCost &C2) const {
2759 // The RISC-V specific behavior here is "instruction count has first priority".
2760 // If we need to emit adds inside the loop to add up base registers, then
2761 // we need at least one extra temporary register.
2762 unsigned C1NumRegs = C1.NumRegs + (C1.NumBaseAdds != 0);
2763 unsigned C2NumRegs = C2.NumRegs + (C2.NumBaseAdds != 0);
2764 return std::tie(C1.Insns, C1NumRegs, C1.AddRecCost,
2765 C1.NumIVMuls, C1.NumBaseAdds,
2766 C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
2767 std::tie(C2.Insns, C2NumRegs, C2.AddRecCost,
2768 C2.NumIVMuls, C2.NumBaseAdds,
2769 C2.ScaleCost, C2.ImmCost, C2.SetupCost);
2770}
2771
2773 Align Alignment) const {
2774 auto *VTy = dyn_cast<VectorType>(DataTy);
2775 if (!VTy || VTy->isScalableTy())
2776 return false;
2777
2778 if (!isLegalMaskedLoadStore(DataTy, Alignment))
2779 return false;
2780
2781 // FIXME: If it is an i8 vector and the element count exceeds 256, we should
2782 // scalarize these types with LMUL >= maximum fixed-length LMUL.
2783 if (VTy->getElementType()->isIntegerTy(8))
2784 if (VTy->getElementCount().getFixedValue() > 256)
2785 return VTy->getPrimitiveSizeInBits() / ST->getRealMinVLen() <
2786 ST->getMaxLMULForFixedLengthVectors();
2787 return true;
2788}
2789
2791 Align Alignment) const {
2792 auto *VTy = dyn_cast<VectorType>(DataTy);
2793 if (!VTy || VTy->isScalableTy())
2794 return false;
2795
2796 if (!isLegalMaskedLoadStore(DataTy, Alignment))
2797 return false;
2798 return true;
2799}
2800
2801/// See if \p I should be considered for address type promotion. We check if \p
2802 /// I is a sext with the right type that is used in memory accesses. If it is
2803 /// used in a "complex" getelementptr, we allow it to be promoted without finding other
2804/// sext instructions that sign extended the same initial value. A getelementptr
2805/// is considered as "complex" if it has more than 2 operands.
2807 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const {
2808 bool Considerable = false;
2809 AllowPromotionWithoutCommonHeader = false;
2810 if (!isa<SExtInst>(&I))
2811 return false;
2812 Type *ConsideredSExtType =
2813 Type::getInt64Ty(I.getParent()->getParent()->getContext());
2814 if (I.getType() != ConsideredSExtType)
2815 return false;
2816 // See if the sext is the one with the right type and used in at least one
2817 // GetElementPtrInst.
2818 for (const User *U : I.users()) {
2819 if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
2820 Considerable = true;
2821 // A getelementptr is considered as "complex" if it has more than 2
2822 // operands. We will promote a SExt used in such complex GEP as we
2823 // expect some computation to be merged if they are done on 64 bits.
2824 if (GEPInst->getNumOperands() > 2) {
2825 AllowPromotionWithoutCommonHeader = true;
2826 break;
2827 }
2828 }
2829 }
2830 return Considerable;
2831}
2832
2833bool RISCVTTIImpl::canSplatOperand(unsigned Opcode, int Operand) const {
2834 switch (Opcode) {
2835 case Instruction::Add:
2836 case Instruction::Sub:
2837 case Instruction::Mul:
2838 case Instruction::And:
2839 case Instruction::Or:
2840 case Instruction::Xor:
2841 case Instruction::FAdd:
2842 case Instruction::FSub:
2843 case Instruction::FMul:
2844 case Instruction::FDiv:
2845 case Instruction::ICmp:
2846 case Instruction::FCmp:
2847 return true;
2848 case Instruction::Shl:
2849 case Instruction::LShr:
2850 case Instruction::AShr:
2851 case Instruction::UDiv:
2852 case Instruction::SDiv:
2853 case Instruction::URem:
2854 case Instruction::SRem:
2855 case Instruction::Select:
2856 return Operand == 1;
2857 default:
2858 return false;
2859 }
2860}
2861
2863 if (!I->getType()->isVectorTy() || !ST->hasVInstructions())
2864 return false;
2865
2866 if (canSplatOperand(I->getOpcode(), Operand))
2867 return true;
2868
2869 auto *II = dyn_cast<IntrinsicInst>(I);
2870 if (!II)
2871 return false;
2872
2873 switch (II->getIntrinsicID()) {
2874 case Intrinsic::fma:
2875 case Intrinsic::vp_fma:
2876 case Intrinsic::fmuladd:
2877 case Intrinsic::vp_fmuladd:
2878 return Operand == 0 || Operand == 1;
2879 case Intrinsic::vp_shl:
2880 case Intrinsic::vp_lshr:
2881 case Intrinsic::vp_ashr:
2882 case Intrinsic::vp_udiv:
2883 case Intrinsic::vp_sdiv:
2884 case Intrinsic::vp_urem:
2885 case Intrinsic::vp_srem:
2886 case Intrinsic::ssub_sat:
2887 case Intrinsic::vp_ssub_sat:
2888 case Intrinsic::usub_sat:
2889 case Intrinsic::vp_usub_sat:
2890 case Intrinsic::vp_select:
2891 return Operand == 1;
2892 // These intrinsics are commutative.
2893 case Intrinsic::vp_add:
2894 case Intrinsic::vp_mul:
2895 case Intrinsic::vp_and:
2896 case Intrinsic::vp_or:
2897 case Intrinsic::vp_xor:
2898 case Intrinsic::vp_fadd:
2899 case Intrinsic::vp_fmul:
2900 case Intrinsic::vp_icmp:
2901 case Intrinsic::vp_fcmp:
2902 case Intrinsic::smin:
2903 case Intrinsic::vp_smin:
2904 case Intrinsic::umin:
2905 case Intrinsic::vp_umin:
2906 case Intrinsic::smax:
2907 case Intrinsic::vp_smax:
2908 case Intrinsic::umax:
2909 case Intrinsic::vp_umax:
2910 case Intrinsic::sadd_sat:
2911 case Intrinsic::vp_sadd_sat:
2912 case Intrinsic::uadd_sat:
2913 case Intrinsic::vp_uadd_sat:
2914 // These intrinsics have 'vr' versions.
2915 case Intrinsic::vp_sub:
2916 case Intrinsic::vp_fsub:
2917 case Intrinsic::vp_fdiv:
2918 return Operand == 0 || Operand == 1;
2919 default:
2920 return false;
2921 }
2922}
2923
2924/// Check if sinking \p I's operands to I's basic block is profitable, because
2925/// the operands can be folded into a target instruction, e.g.
2926/// splats of scalars can fold into vector instructions.
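/// For example (illustrative), sinking the insertelement/shufflevector pair
/// that forms a splat next to its user lets the backend fold the scalar into
/// a .vx/.vf form of the user instruction.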
2929 using namespace llvm::PatternMatch;
2930
2931 if (I->isBitwiseLogicOp()) {
2932 if (!I->getType()->isVectorTy()) {
2933 if (ST->hasStdExtZbb() || ST->hasStdExtZbkb()) {
2934 for (auto &Op : I->operands()) {
2935 // (and/or/xor X, (not Y)) -> (andn/orn/xnor X, Y)
2936 if (match(Op.get(), m_Not(m_Value()))) {
2937 Ops.push_back(&Op);
2938 return true;
2939 }
2940 }
2941 }
2942 } else if (I->getOpcode() == Instruction::And && ST->hasStdExtZvkb()) {
2943 for (auto &Op : I->operands()) {
2944 // (and X, (not Y)) -> (vandn.vv X, Y)
2945 if (match(Op.get(), m_Not(m_Value()))) {
2946 Ops.push_back(&Op);
2947 return true;
2948 }
2949 // (and X, (splat (not Y))) -> (vandn.vx X, Y)
2951 m_ZeroInt()),
2952 m_Value(), m_ZeroMask()))) {
2953 Use &InsertElt = cast<Instruction>(Op)->getOperandUse(0);
2954 Use &Not = cast<Instruction>(InsertElt)->getOperandUse(1);
2955 Ops.push_back(&Not);
2956 Ops.push_back(&InsertElt);
2957 Ops.push_back(&Op);
2958 return true;
2959 }
2960 }
2961 }
2962 }
2963
2964 if (!I->getType()->isVectorTy() || !ST->hasVInstructions())
2965 return false;
2966
2967 // Don't sink splat operands if the target prefers not to. Some targets
2968 // require S2V transfer buffers, and we can run out of them copying the same
2969 // value repeatedly.
2970 // FIXME: It could still be worth doing if it would improve vector register
2971 // pressure and prevent a vector spill.
2972 if (!ST->sinkSplatOperands())
2973 return false;
2974
2975 for (auto OpIdx : enumerate(I->operands())) {
2976 if (!canSplatOperand(I, OpIdx.index()))
2977 continue;
2978
2979 Instruction *Op = dyn_cast<Instruction>(OpIdx.value().get());
2980 // Make sure we are not already sinking this operand
2981 if (!Op || any_of(Ops, [&](Use *U) { return U->get() == Op; }))
2982 continue;
2983
2984 // We are looking for a splat/vp.splat that can be sunk.
2986 m_Value(), m_Value(), m_Value()));
2987 if (!IsVPSplat &&
2989 m_Undef(), m_ZeroMask())))
2990 continue;
2991
2992 // Don't sink i1 splats.
2993 if (cast<VectorType>(Op->getType())->getElementType()->isIntegerTy(1))
2994 continue;
2995
2996 // All uses of the shuffle should be sunk to avoid duplicating it across GPR
2997 // and vector registers.
2998 for (Use &U : Op->uses()) {
2999 Instruction *Insn = cast<Instruction>(U.getUser());
3000 if (!canSplatOperand(Insn, U.getOperandNo()))
3001 return false;
3002 }
3003
3004 // Sink any fpexts since they might be used in a widening fp pattern.
3005 if (IsVPSplat) {
3006 if (isa<FPExtInst>(Op->getOperand(0)))
3007 Ops.push_back(&Op->getOperandUse(0));
3008 } else {
3009 Use *InsertEltUse = &Op->getOperandUse(0);
3010 auto *InsertElt = cast<InsertElementInst>(InsertEltUse);
3011 if (isa<FPExtInst>(InsertElt->getOperand(1)))
3012 Ops.push_back(&InsertElt->getOperandUse(1));
3013 Ops.push_back(InsertEltUse);
3014 }
3015 Ops.push_back(&OpIdx.value());
3016 }
3017 return true;
3018}
3019
3021RISCVTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
3023 // TODO: Enable expansion when unaligned access is not supported after we fix
3024 // issues in ExpandMemcmp.
3025 if (!ST->enableUnalignedScalarMem())
3026 return Options;
3027
3028 if (!ST->hasStdExtZbb() && !ST->hasStdExtZbkb() && !IsZeroCmp)
3029 return Options;
3030
3031 Options.AllowOverlappingLoads = true;
3032 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
3033 Options.NumLoadsPerBlock = Options.MaxNumLoads;
3034 if (ST->is64Bit()) {
3035 Options.LoadSizes = {8, 4, 2, 1};
3036 Options.AllowedTailExpansions = {3, 5, 6};
3037 } else {
3038 Options.LoadSizes = {4, 2, 1};
3039 Options.AllowedTailExpansions = {3};
3040 }
3041
3042 if (IsZeroCmp && ST->hasVInstructions()) {
3043 unsigned VLenB = ST->getRealMinVLen() / 8;
3044 // The minimum size should be `XLen / 8 + 1`, and the maximum size should be
3045 // `VLenB * MaxLMUL` so that it fits in a single register group.
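// E.g. (illustrative, assuming VLEN == 128 so VLenB == 16 and a maximum
// fixed-length LMUL of 8): on RV64 vector expansion is offered for total
// sizes of 9 through 128 bytes.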
3046 unsigned MinSize = ST->getXLen() / 8 + 1;
3047 unsigned MaxSize = VLenB * ST->getMaxLMULForFixedLengthVectors();
3048 for (unsigned Size = MinSize; Size <= MaxSize; Size++)
3049 Options.LoadSizes.insert(Options.LoadSizes.begin(), Size);
3050 }
3051 return Options;
3052}
Definition InstrTypes.h:694
@ FCMP_UGE
1 0 1 1 True if unordered, greater than, or equal
Definition InstrTypes.h:691
@ FCMP_FALSE
0 0 0 0 Always false (always folded)
Definition InstrTypes.h:680
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:688
static bool isFPPredicate(Predicate P)
Definition InstrTypes.h:772
static bool isIntPredicate(Predicate P)
Definition InstrTypes.h:778
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:22
bool noNaNs() const
Definition FMF.h:65
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static FixedVectorType * getDoubleElementsVectorType(FixedVectorType *VTy)
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:803
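A minimal sketch of the FixedVectorType helpers listed above, assuming an LLVMContext &Ctx is available (the function name makeFixedVectors is illustrative):
#include <cassert>
#include "llvm/IR/DerivedTypes.h"
using namespace llvm;
void makeFixedVectors(LLVMContext &Ctx) {
  // <4 x i32>
  auto *V4I32 = FixedVectorType::get(Type::getInt32Ty(Ctx), 4);
  // Doubling the element count gives <8 x i32> with the same element type.
  auto *V8I32 = FixedVectorType::getDoubleElementsVectorType(V4I32);
  assert(V4I32->getNumElements() == 4 && V8I32->getNumElements() == 8);
  (void)V4I32;
  (void)V8I32;
}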
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
static InstructionCost getInvalid(CostType Val=0)
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full range of operator support required for arithmetic and comparisons.
LLVM_ABI bool isCommutative() const LLVM_READONLY
Return true if the instruction is commutative:
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:319
const SmallVectorImpl< Type * > & getArgTypes() const
const SmallVectorImpl< const Value * > & getArgs() const
A wrapper class for inspecting calls to intrinsic functions.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
Machine Value Type.
static MVT getFloatingPointVT(unsigned BitWidth)
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
uint64_t getScalarSizeInBits() const
MVT changeVectorElementType(MVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type that is chosen by the caller.
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
MVT changeTypeToInteger()
Return the type converted to an equivalently sized integer or vector with integer element type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
bool bitsGT(MVT VT) const
Return true if this has more bits than VT.
bool isFixedLengthVector() const
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
MVT getVectorElementType() const
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
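A minimal sketch exercising the MVT queries above on a fixed <4 x i32> machine value type (the header path is assumed for a current tree; inspectMVT is an illustrative name):
#include <cassert>
#include "llvm/CodeGenTypes/MachineValueType.h"
using namespace llvm;
void inspectMVT() {
  MVT VT = MVT::v4i32;
  assert(VT.isVector() && VT.isFixedLengthVector());
  assert(VT.getVectorNumElements() == 4);
  assert(VT.getScalarSizeInBits() == 32);
  // Same lane count, narrower lanes: <4 x i16> has no more bits than <4 x i32>.
  MVT Narrow = VT.changeVectorElementType(MVT::i16);
  assert(Narrow.bitsLE(VT));
  (void)Narrow;
}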
unsigned getOpcode() const
Return the opcode for this Instruction or ConstantExpr.
Definition Operator.h:43
The optimization diagnostic interface.
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
bool isLegalMaskedExpandLoad(Type *DataType, Align Alignment) const override
bool isLegalMaskedLoadStore(Type *DataType, Align Alignment) const
InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind) const override
unsigned getMinTripCountTailFoldingThreshold() const override
TTI::AddressingModeKind getPreferredAddressingMode(const Loop *L, ScalarEvolution *SE) const override
InstructionCost getAddressComputationCost(Type *PTy, ScalarEvolution *SE, const SCEV *Ptr, TTI::TargetCostKind CostKind) const override
InstructionCost getStoreImmCost(Type *VecTy, TTI::OperandValueInfo OpInfo, TTI::TargetCostKind CostKind) const
Return the cost of materializing an immediate for a value operand of a store instruction.
InstructionCost getCostOfKeepingLiveOverCall(ArrayRef< Type * > Tys) const override
bool hasActiveVectorLength() const override
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index) const override
InstructionCost getExpandCompressMemoryOpCost(unsigned Opcode, Type *Src, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind) const override
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) const override
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
Try to calculate op costs for min/max reduction operations.
bool canSplatOperand(Instruction *I, int Operand) const
Return true if the (vector) instruction I will be lowered to an instruction with a scalar splat operand.
bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2) const override
bool isLegalStridedLoadStore(Type *DataType, Align Alignment) const override
unsigned getRegUsageForType(Type *Ty) const override
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}) const override
Estimate the overhead of scalarizing an instruction.
bool isLegalMaskedScatter(Type *DataType, Align Alignment) const override
bool isLegalMaskedCompressStore(Type *DataTy, Align Alignment) const override
bool preferAlternateOpcodeVectorization() const override
bool isProfitableToSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const override
Check if sinking I's operands to I's basic block is profitable, because the operands can be folded into it.
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1) const override
std::optional< unsigned > getMaxVScale() const override
bool shouldExpandReduction(const IntrinsicInst *II) const override
std::optional< unsigned > getVScaleForTuning() const override
bool isLegalMaskedGather(Type *DataType, Align Alignment) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const override
InstructionCost getPointersChainCost(ArrayRef< const Value * > Ptrs, const Value *Base, const TTI::PointersChainInfo &Info, Type *AccessTy, TTI::TargetCostKind CostKind) const override
TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const override
InstructionCost getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType, ElementCount VF, TTI::PartialReductionExtendKind OpAExtend, TTI::PartialReductionExtendKind OpBExtend, std::optional< unsigned > BinOp, TTI::TargetCostKind CostKind) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpdInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
Get intrinsic cost based on arguments.
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
bool shouldConsiderAddressTypePromotion(const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const override
See if I should be considered for address type promotion.
InstructionCost getIntImmCost(const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind) const override
InstructionCost getStridedMemoryOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) const override
TargetTransformInfo::PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override
static MVT getM1VT(MVT VT)
Given a vector (either fixed or scalable), return the scalable vector corresponding to a vector register, i.e. an LMUL=1 (M1) type.
InstructionCost getVRGatherVVCost(MVT VT) const
Return the cost of a vrgather.vv instruction for the type VT.
InstructionCost getVRGatherVICost(MVT VT) const
Return the cost of a vrgather.vi (or vx) instruction for the type VT.
static unsigned computeVLMAX(unsigned VectorBits, unsigned EltSize, unsigned MinSize)
InstructionCost getLMULCost(MVT VT) const
Return the cost of LMUL for linear operations.
InstructionCost getVSlideVICost(MVT VT) const
Return the cost of a vslidedown.vi or vslideup.vi instruction for the type VT.
InstructionCost getVSlideVXCost(MVT VT) const
Return the cost of a vslidedown.vx or vslideup.vx instruction for the type VT.
static RISCVVType::VLMUL getLMUL(MVT VT)
This class represents an analyzed expression in the program.
The main scalar evolution driver.
static LLVM_ABI bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossings.
static LLVM_ABI bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition DenseSet.h:281
This class consists of common code factored out of the SmallVector class to reduce code duplication based on the SmallVector 'N' template parameter.
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
virtual const DataLayout & getDataLayout() const
virtual TTI::AddressingModeKind getPreferredAddressingMode(const Loop *L, ScalarEvolution *SE) const
virtual bool isLoweredToCall(const Function *F) const
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TTI::TargetCostKind CostKind) const override
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
@ TCK_Latency
The latency of the instruction.
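A minimal sketch of how a client selects one of the cost kinds above, assuming a TargetTransformInfo &TTI and an Instruction &I are in scope (queryCosts is an illustrative name):
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;
// Throughput and code-size costs for the same instruction frequently differ;
// callers pick the kind that matches the transformation they are driving.
void queryCosts(const TargetTransformInfo &TTI, const Instruction &I) {
  InstructionCost RThroughput =
      TTI.getInstructionCost(&I, TargetTransformInfo::TCK_RecipThroughput);
  InstructionCost CodeSize =
      TTI.getInstructionCost(&I, TargetTransformInfo::TCK_CodeSize);
  (void)RThroughput;
  (void)CodeSize;
}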
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of FastMathFlags FMF.
PopcntSupportKind
Flags indicating the kind of support for population count.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Transpose
Transpose two vectors.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
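A minimal sketch relating a few of the shuffle kinds above to concrete masks, checked with the ShuffleVectorInst static mask classifier that appears later in this index (sampleMasks is an illustrative name):
#include <cassert>
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;
void sampleMasks() {
  SmallVector<int, 4> Identity = {0, 1, 2, 3};  // trivial single-source permute
  SmallVector<int, 4> Broadcast = {0, 0, 0, 0}; // SK_Broadcast shape
  SmallVector<int, 4> Reverse = {3, 2, 1, 0};   // SK_Reverse shape
  assert(ShuffleVectorInst::isIdentityMask(Identity, /*NumSrcElts=*/4));
  assert(!ShuffleVectorInst::isIdentityMask(Reverse, /*NumSrcElts=*/4));
  (void)Broadcast;
}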
CastContextHint
Represents a hint about the context in which a cast is used.
@ None
The cast is not used with a load/store of any kind.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition TypeSize.h:346
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:298
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:273
LLVM_ABI bool isScalableTy(SmallPtrSetImpl< const Type * > &Visited) const
Return true if this is a type whose size is a known multiple of vscale.
Definition Type.cpp:62
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition Type.h:145
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
LLVM_ABI Type * getWithNewBitWidth(unsigned NewBitWidth) const
Given an integer or vector type, change the lane bitwidth to NewBitwidth, whilst keeping the old number of lanes.
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:142
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:128
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:231
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:294
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:301
static LLVM_ABI Type * getFloatTy(LLVMContext &C)
Definition Type.cpp:285
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
Value * getOperand(unsigned i) const
Definition User.h:232
LLVM Value Representation.
Definition Value.h:75
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector.
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:194
constexpr bool isKnownMultipleOf(ScalarTy RHS) const
This function tells the caller whether the element count is known at compile time to be a multiple of the scalar value RHS.
Definition TypeSize.h:181
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
static constexpr bool isKnownLE(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:230
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:216
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:166
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the same way as for normal integer types.
Definition TypeSize.h:252
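A minimal sketch of the fixed/scalable size arithmetic above; a scalable size of N means N x vscale, so only the "known" comparisons are meaningful (sizeQueries is an illustrative name):
#include <cassert>
#include "llvm/Support/TypeSize.h"
using namespace llvm;
void sizeQueries() {
  TypeSize Fixed128 = TypeSize::getFixed(128);
  TypeSize Scalable64 = TypeSize::getScalable(64);
  assert(Fixed128.isKnownMultipleOf(64));
  // 32 x vscale <= 64 x vscale holds for every vscale.
  assert(TypeSize::isKnownLE(TypeSize::getScalable(32), Scalable64));
  // Halving the coefficient of a fixed 128-bit size gives a fixed 64-bit size.
  TypeSize Half = Fixed128.divideCoefficientBy(2);
  assert(Half.getFixedValue() == 64);
}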
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
ISD namespace - This namespace contains an enum which represents all of the SelectionDAG node types and value types.
Definition ISDOpcodes.h:24
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:259
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter) to floating point.
Definition ISDOpcodes.h:862
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:410
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:826
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of type iN containing the high bits of the result.
Definition ISDOpcodes.h:695
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:756
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:832
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:908
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:730
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the destination VT.
Definition ISDOpcodes.h:941
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:838
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
bool match(Val *V, const Pattern &P)
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
TwoOps_match< V1_t, V2_t, Instruction::ShuffleVector > m_Shuffle(const V1_t &v1, const V2_t &v2)
Matches ShuffleVectorInst independently of mask value.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
auto m_Undef()
Match an arbitrary undef constant.
ThreeOps_match< Val_t, Elt_t, Idx_t, Instruction::InsertElement > m_InsertElt(const Val_t &Val, const Elt_t &Elt, const Idx_t &Idx)
Matches InsertElementInst.
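A minimal sketch combining the matchers above to recognize the canonical splat idiom (insertelement into undef/poison followed by a broadcasting shufflevector); a complete check would also verify that the shuffle mask is all zeros. V is an assumed Value * and matchSplatSource is an illustrative name:
#include "llvm/IR/PatternMatch.h"
using namespace llvm;
using namespace llvm::PatternMatch;
static Value *matchSplatSource(Value *V) {
  Value *Src = nullptr;
  if (match(V, m_Shuffle(m_InsertElt(m_Undef(), m_Value(Src), m_ZeroInt()),
                         m_Undef())))
    return Src; // the scalar being broadcast
  return nullptr;
}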
int getIntMatCost(const APInt &Val, unsigned Size, const MCSubtargetInfo &STI, bool CompressionCost, bool FreeZeroes)
static constexpr unsigned RVVBitsPerBlock
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
unsigned Log2_32_Ceil(uint32_t Value)
Return the ceil log base 2 of the specified value, 32 if the value is zero.
Definition MathExtras.h:355
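A minimal sketch of the ceiling log helper above (and its floor counterpart Log2_32, listed further down); the ceiling form is what tree-reduction depth estimates round up with (logExamples is an illustrative name):
#include <cassert>
#include "llvm/Support/MathExtras.h"
using namespace llvm;
void logExamples() {
  assert(Log2_32(16) == 4 && Log2_32_Ceil(16) == 4); // exact power of two
  assert(Log2_32(17) == 4 && Log2_32_Ceil(17) == 5); // non-power rounds up
}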
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1707
const CostTblEntryT< CostType > * CostTableLookup(ArrayRef< CostTblEntryT< CostType > > Tbl, int ISD, MVT Ty)
Find in cost table.
Definition CostTable.h:35
LLVM_ABI bool getBooleanLoopAttribute(const Loop *TheLoop, StringRef Name)
Returns true if Name is applied to TheLoop and enabled.
InstructionCost Cost
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:174
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B, C, ...), such that A is the 0-based index of the item and B, C, ... are the values from the original input ranges.
Definition STLExtras.h:2454
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:649
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition bit.h:157
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit version).
Definition MathExtras.h:282
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1714
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:342
LLVM_ABI llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:288
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:198
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type arguments.
Definition Casting.h:548
constexpr int PoisonMaskElem
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:405
TargetTransformInfo TTI
LLVM_ABI bool isMaskedSlidePair(ArrayRef< int > Mask, int NumElts, std::array< std::pair< int, int >, 2 > &SrcInfo)
Does this shuffle mask represent either one slide shuffle or a pair of two slide shuffles,...
LLVM_ABI llvm::SmallVector< int, 16 > createInterleaveMask(unsigned VF, unsigned NumVecs)
Create an interleave shuffle mask.
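A minimal sketch of the two mask builders above (createInterleaveMask here and createStrideMask listed earlier) for a factor-2 interleaving of 4-element vectors (interleaveMasks is an illustrative name):
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/VectorUtils.h"
using namespace llvm;
void interleaveMasks() {
  // Interleave two 4-element vectors lane by lane: {0, 4, 1, 5, 2, 6, 3, 7}.
  SmallVector<int, 16> Interleave =
      createInterleaveMask(/*VF=*/4, /*NumVecs=*/2);
  // De-interleave by taking every 2nd lane starting at 0: {0, 2, 4, 6}.
  SmallVector<int, 16> Stride =
      createStrideMask(/*Start=*/0, /*Stride=*/2, /*VF=*/4);
  (void)Interleave;
  (void)Stride;
}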
DWARFExpression::Operation Op
CostTblEntryT< unsigned > CostTblEntry
Definition CostTable.h:30
OutputIt copy(R &&Range, OutputIt Out)
Definition STLExtras.h:1817
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:565
LLVM_ABI void processShuffleMasks(ArrayRef< int > Mask, unsigned NumOfSrcRegs, unsigned NumOfDestRegs, unsigned NumOfUsedRegs, function_ref< void()> NoInputAction, function_ref< void(ArrayRef< int >, unsigned, unsigned)> SingleInputAction, function_ref< void(ArrayRef< int >, unsigned, unsigned, bool)> ManyInputsAction)
Splits and processes shuffle mask depending on the number of input and output registers.
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
Definition STLExtras.h:2070
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition bit.h:280
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:853
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Extended Value Type.
Definition ValueTypes.h:35
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
unsigned Insns
TODO: Some of these could be merged.
Returns options for expansion of memcmp. IsZeroCmp is true if this is the expansion of memcmp(p1, p2, s) == 0.
Describe known properties for a set of pointers.
Parameters that control the generic loop unrolling transformation.
bool UpperBound
Allow using trip count upper bound to unroll loops.
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold, but used for partial/runtime unrolling (set to UINT_MAX to disable).
bool UnrollAndJam
Allow unroll and jam. Used to enable unroll and jam for the target.
bool UnrollRemainder
Allow unrolling of all the iterations of the runtime loop remainder.
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number of iterations is not known at compile time).
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminate compare and branch overhead, and not allowing runtime unrolling).
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).