VectorCombine.cpp
//===------- VectorCombine.cpp - Optimize partial vector operations -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass optimizes scalar/vector interactions using target cost models. The
// transforms implemented here may not fit in traditional loop-based or SLP
// vectorization passes.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/VectorCombine.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/InstSimplifyFolder.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include <numeric>
#include <optional>
#include <queue>
#include <set>

#define DEBUG_TYPE "vector-combine"
#include "llvm/Transforms/Utils/InstructionWorklist.h"

using namespace llvm;
using namespace llvm::PatternMatch;

STATISTIC(NumVecLoad, "Number of vector loads formed");
STATISTIC(NumVecCmp, "Number of vector compares formed");
STATISTIC(NumVecBO, "Number of vector binops formed");
STATISTIC(NumVecCmpBO, "Number of vector compare + binop formed");
STATISTIC(NumShufOfBitcast, "Number of shuffles moved after bitcast");
STATISTIC(NumScalarOps, "Number of scalar unary + binary ops formed");
STATISTIC(NumScalarCmp, "Number of scalar compares formed");
STATISTIC(NumScalarIntrinsic, "Number of scalar intrinsic calls formed");

static cl::opt<bool> DisableVectorCombine(
    "disable-vector-combine", cl::init(false), cl::Hidden,
    cl::desc("Disable all vector combine transforms"));

static cl::opt<bool> DisableBinopExtractShuffle(
    "disable-binop-extract-shuffle", cl::init(false), cl::Hidden,
    cl::desc("Disable binop extract to shuffle transforms"));

static cl::opt<unsigned> MaxInstrsToScan(
    "vector-combine-max-scan-instrs", cl::init(30), cl::Hidden,
    cl::desc("Max number of instructions to scan for vector combining."));

static const unsigned InvalidIndex = std::numeric_limits<unsigned>::max();

namespace {
class VectorCombine {
public:
  VectorCombine(Function &F, const TargetTransformInfo &TTI,
                const DominatorTree &DT, AAResults &AA, AssumptionCache &AC,
                const DataLayout *DL, TTI::TargetCostKind CostKind,
                bool TryEarlyFoldsOnly)
      : F(F), Builder(F.getContext(), InstSimplifyFolder(*DL)), TTI(TTI),
        DT(DT), AA(AA), AC(AC), DL(DL), CostKind(CostKind), SQ(*DL),
        TryEarlyFoldsOnly(TryEarlyFoldsOnly) {}

  bool run();

private:
  Function &F;
  IRBuilder<InstSimplifyFolder> Builder;
  const TargetTransformInfo &TTI;
89 const DominatorTree &DT;
90 AAResults &AA;
91 AssumptionCache &AC;
92 const DataLayout *DL;
93 TTI::TargetCostKind CostKind;
94 const SimplifyQuery SQ;
95
96 /// If true, only perform beneficial early IR transforms. Do not introduce new
97 /// vector operations.
98 bool TryEarlyFoldsOnly;
99
100 InstructionWorklist Worklist;
101
102 /// Next instruction to iterate. It will be updated when it is erased by
103 /// RecursivelyDeleteTriviallyDeadInstructions.
104 Instruction *NextInst;
105
106 // TODO: Direct calls from the top-level "run" loop use a plain "Instruction"
107 // parameter. That should be updated to specific sub-classes because the
108 // run loop was changed to dispatch on opcode.
109 bool vectorizeLoadInsert(Instruction &I);
110 bool widenSubvectorLoad(Instruction &I);
111 ExtractElementInst *getShuffleExtract(ExtractElementInst *Ext0,
112 ExtractElementInst *Ext1,
113 unsigned PreferredExtractIndex) const;
114 bool isExtractExtractCheap(ExtractElementInst *Ext0, ExtractElementInst *Ext1,
115 const Instruction &I,
116 ExtractElementInst *&ConvertToShuffle,
117 unsigned PreferredExtractIndex);
118 Value *foldExtExtCmp(Value *V0, Value *V1, Value *ExtIndex, Instruction &I);
119 Value *foldExtExtBinop(Value *V0, Value *V1, Value *ExtIndex, Instruction &I);
120 bool foldExtractExtract(Instruction &I);
121 bool foldInsExtFNeg(Instruction &I);
122 bool foldInsExtBinop(Instruction &I);
123 bool foldInsExtVectorToShuffle(Instruction &I);
124 bool foldBitOpOfCastops(Instruction &I);
125 bool foldBitOpOfCastConstant(Instruction &I);
126 bool foldBitcastShuffle(Instruction &I);
127 bool scalarizeOpOrCmp(Instruction &I);
128 bool scalarizeVPIntrinsic(Instruction &I);
129 bool foldExtractedCmps(Instruction &I);
130 bool foldBinopOfReductions(Instruction &I);
131 bool foldSingleElementStore(Instruction &I);
132 bool scalarizeLoadExtract(Instruction &I);
133 bool scalarizeExtExtract(Instruction &I);
134 bool foldConcatOfBoolMasks(Instruction &I);
135 bool foldPermuteOfBinops(Instruction &I);
136 bool foldShuffleOfBinops(Instruction &I);
137 bool foldShuffleOfSelects(Instruction &I);
138 bool foldShuffleOfCastops(Instruction &I);
139 bool foldShuffleOfShuffles(Instruction &I);
140 bool foldShuffleOfIntrinsics(Instruction &I);
141 bool foldShuffleToIdentity(Instruction &I);
142 bool foldShuffleFromReductions(Instruction &I);
143 bool foldShuffleChainsToReduce(Instruction &I);
144 bool foldCastFromReductions(Instruction &I);
145 bool foldSelectShuffle(Instruction &I, bool FromReduction = false);
146 bool foldInterleaveIntrinsics(Instruction &I);
147 bool shrinkType(Instruction &I);
148 bool shrinkLoadForShuffles(Instruction &I);
149 bool shrinkPhiOfShuffles(Instruction &I);
150
151 void replaceValue(Instruction &Old, Value &New, bool Erase = true) {
152 LLVM_DEBUG(dbgs() << "VC: Replacing: " << Old << '\n');
153 LLVM_DEBUG(dbgs() << " With: " << New << '\n');
154 Old.replaceAllUsesWith(&New);
155 if (auto *NewI = dyn_cast<Instruction>(&New)) {
156 New.takeName(&Old);
157 Worklist.pushUsersToWorkList(*NewI);
158 Worklist.pushValue(NewI);
159 }
160 if (Erase && isInstructionTriviallyDead(&Old)) {
161 eraseInstruction(Old);
162 } else {
163 Worklist.push(&Old);
164 }
165 }
166
167 void eraseInstruction(Instruction &I) {
168 LLVM_DEBUG(dbgs() << "VC: Erasing: " << I << '\n');
169 SmallVector<Value *> Ops(I.operands());
170 Worklist.remove(&I);
171 I.eraseFromParent();
172
173 // Push remaining users of the operands and then the operand itself - allows
174 // further folds that were hindered by OneUse limits.
175 SmallPtrSet<Value *, 4> Visited;
176 for (Value *Op : Ops) {
177 if (!Visited.contains(Op)) {
178 if (auto *OpI = dyn_cast<Instruction>(Op)) {
          if (RecursivelyDeleteTriviallyDeadInstructions(
                  OpI, nullptr, nullptr, [&](Value *V) {
181 if (auto *I = dyn_cast<Instruction>(V)) {
182 LLVM_DEBUG(dbgs() << "VC: Erased: " << *I << '\n');
183 Worklist.remove(I);
184 if (I == NextInst)
185 NextInst = NextInst->getNextNode();
186 Visited.insert(I);
187 }
188 }))
189 continue;
190 Worklist.pushUsersToWorkList(*OpI);
191 Worklist.pushValue(OpI);
192 }
193 }
194 }
195 }
196};
197} // namespace
198
199/// Return the source operand of a potentially bitcasted value. If there is no
200/// bitcast, return the input value itself.
static Value *peekThroughBitcasts(Value *V) {
  while (auto *BitCast = dyn_cast<BitCastInst>(V))
203 V = BitCast->getOperand(0);
204 return V;
205}
206
207static bool canWidenLoad(LoadInst *Load, const TargetTransformInfo &TTI) {
208 // Do not widen load if atomic/volatile or under asan/hwasan/memtag/tsan.
209 // The widened load may load data from dirty regions or create data races
210 // non-existent in the source.
211 if (!Load || !Load->isSimple() || !Load->hasOneUse() ||
212 Load->getFunction()->hasFnAttribute(Attribute::SanitizeMemTag) ||
      mustSuppressSpeculation(*Load))
    return false;
215
216 // We are potentially transforming byte-sized (8-bit) memory accesses, so make
217 // sure we have all of our type-based constraints in place for this target.
218 Type *ScalarTy = Load->getType()->getScalarType();
219 uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits();
220 unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth();
221 if (!ScalarSize || !MinVectorSize || MinVectorSize % ScalarSize != 0 ||
222 ScalarSize % 8 != 0)
223 return false;
224
225 return true;
226}
227
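/// Try to replace an insert of a loaded scalar into (the low lane of) a vector
/// with a single wider vector load. A rough, illustrative IR example (names
/// and types are invented; the fold only fires when the wider load is known to
/// be dereferenceable and the cost model does not object):
///   %s = load float, ptr %p, align 4
///   %v = insertelement <4 x float> poison, float %s, i64 0
/// -->
///   %w = load <4 x float>, ptr %p, align 4
///   %v = shufflevector <4 x float> %w, <4 x float> poison,
///                      <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>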
228bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
229 // Match insert into fixed vector of scalar value.
230 // TODO: Handle non-zero insert index.
231 Value *Scalar;
232 if (!match(&I,
             m_InsertElt(m_Undef(), m_Value(Scalar), m_ZeroInt())))
    return false;
235
236 // Optionally match an extract from another vector.
237 Value *X;
238 bool HasExtract = match(Scalar, m_ExtractElt(m_Value(X), m_ZeroInt()));
239 if (!HasExtract)
240 X = Scalar;
241
242 auto *Load = dyn_cast<LoadInst>(X);
243 if (!canWidenLoad(Load, TTI))
244 return false;
245
246 Type *ScalarTy = Scalar->getType();
247 uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits();
248 unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth();
249
250 // Check safety of replacing the scalar load with a larger vector load.
251 // We use minimal alignment (maximum flexibility) because we only care about
252 // the dereferenceable region. When calculating cost and creating a new op,
253 // we may use a larger value based on alignment attributes.
254 Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts();
255 assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type");
256
257 unsigned MinVecNumElts = MinVectorSize / ScalarSize;
258 auto *MinVecTy = VectorType::get(ScalarTy, MinVecNumElts, false);
259 unsigned OffsetEltIndex = 0;
260 Align Alignment = Load->getAlign();
261 if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), *DL, Load, &AC,
262 &DT)) {
263 // It is not safe to load directly from the pointer, but we can still peek
    // through gep offsets and check if it is safe to load from a base address
    // with
265 // updated alignment. If it is, we can shuffle the element(s) into place
266 // after loading.
267 unsigned OffsetBitWidth = DL->getIndexTypeSizeInBits(SrcPtr->getType());
268 APInt Offset(OffsetBitWidth, 0);
    SrcPtr = SrcPtr->stripAndAccumulateInBoundsConstantOffsets(*DL, Offset);

271 // We want to shuffle the result down from a high element of a vector, so
272 // the offset must be positive.
273 if (Offset.isNegative())
274 return false;
275
276 // The offset must be a multiple of the scalar element to shuffle cleanly
277 // in the element's size.
278 uint64_t ScalarSizeInBytes = ScalarSize / 8;
279 if (Offset.urem(ScalarSizeInBytes) != 0)
280 return false;
281
282 // If we load MinVecNumElts, will our target element still be loaded?
283 OffsetEltIndex = Offset.udiv(ScalarSizeInBytes).getZExtValue();
284 if (OffsetEltIndex >= MinVecNumElts)
285 return false;
286
287 if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), *DL, Load, &AC,
288 &DT))
289 return false;
290
291 // Update alignment with offset value. Note that the offset could be negated
292 // to more accurately represent "(new) SrcPtr - Offset = (old) SrcPtr", but
293 // negation does not change the result of the alignment calculation.
294 Alignment = commonAlignment(Alignment, Offset.getZExtValue());
295 }
296
297 // Original pattern: insertelt undef, load [free casts of] PtrOp, 0
298 // Use the greater of the alignment on the load or its source pointer.
299 Alignment = std::max(SrcPtr->getPointerAlignment(*DL), Alignment);
300 Type *LoadTy = Load->getType();
301 unsigned AS = Load->getPointerAddressSpace();
302 InstructionCost OldCost =
303 TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS, CostKind);
304 APInt DemandedElts = APInt::getOneBitSet(MinVecNumElts, 0);
305 OldCost +=
306 TTI.getScalarizationOverhead(MinVecTy, DemandedElts,
307 /* Insert */ true, HasExtract, CostKind);
308
309 // New pattern: load VecPtr
310 InstructionCost NewCost =
311 TTI.getMemoryOpCost(Instruction::Load, MinVecTy, Alignment, AS, CostKind);
312 // Optionally, we are shuffling the loaded vector element(s) into place.
313 // For the mask set everything but element 0 to undef to prevent poison from
314 // propagating from the extra loaded memory. This will also optionally
315 // shrink/grow the vector from the loaded size to the output size.
316 // We assume this operation has no cost in codegen if there was no offset.
317 // Note that we could use freeze to avoid poison problems, but then we might
318 // still need a shuffle to change the vector size.
319 auto *Ty = cast<FixedVectorType>(I.getType());
320 unsigned OutputNumElts = Ty->getNumElements();
321 SmallVector<int, 16> Mask(OutputNumElts, PoisonMaskElem);
322 assert(OffsetEltIndex < MinVecNumElts && "Address offset too big");
323 Mask[0] = OffsetEltIndex;
324 if (OffsetEltIndex)
325 NewCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, Ty, MinVecTy, Mask,
326 CostKind);
327
328 // We can aggressively convert to the vector form because the backend can
329 // invert this transform if it does not result in a performance win.
330 if (OldCost < NewCost || !NewCost.isValid())
331 return false;
332
333 // It is safe and potentially profitable to load a vector directly:
334 // inselt undef, load Scalar, 0 --> load VecPtr
335 IRBuilder<> Builder(Load);
336 Value *CastedPtr =
337 Builder.CreatePointerBitCastOrAddrSpaceCast(SrcPtr, Builder.getPtrTy(AS));
338 Value *VecLd = Builder.CreateAlignedLoad(MinVecTy, CastedPtr, Alignment);
339 VecLd = Builder.CreateShuffleVector(VecLd, Mask);
340
341 replaceValue(I, *VecLd);
342 ++NumVecLoad;
343 return true;
344}
345
346/// If we are loading a vector and then inserting it into a larger vector with
347/// undefined elements, try to load the larger vector and eliminate the insert.
348/// This removes a shuffle in IR and may allow combining of other loaded values.
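/// Illustrative example (made-up IR; subject to the same dereferenceability
/// and cost checks as the scalar case above):
///   %x = load <2 x float>, ptr %p, align 8
///   %v = shufflevector <2 x float> %x, <2 x float> poison,
///                      <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
/// -->
///   %v = load <4 x float>, ptr %p, align 8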
349bool VectorCombine::widenSubvectorLoad(Instruction &I) {
350 // Match subvector insert of fixed vector.
351 auto *Shuf = cast<ShuffleVectorInst>(&I);
352 if (!Shuf->isIdentityWithPadding())
353 return false;
354
355 // Allow a non-canonical shuffle mask that is choosing elements from op1.
356 unsigned NumOpElts =
357 cast<FixedVectorType>(Shuf->getOperand(0)->getType())->getNumElements();
358 unsigned OpIndex = any_of(Shuf->getShuffleMask(), [&NumOpElts](int M) {
359 return M >= (int)(NumOpElts);
360 });
361
362 auto *Load = dyn_cast<LoadInst>(Shuf->getOperand(OpIndex));
363 if (!canWidenLoad(Load, TTI))
364 return false;
365
366 // We use minimal alignment (maximum flexibility) because we only care about
367 // the dereferenceable region. When calculating cost and creating a new op,
368 // we may use a larger value based on alignment attributes.
369 auto *Ty = cast<FixedVectorType>(I.getType());
370 Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts();
371 assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type");
372 Align Alignment = Load->getAlign();
373 if (!isSafeToLoadUnconditionally(SrcPtr, Ty, Align(1), *DL, Load, &AC, &DT))
374 return false;
375
376 Alignment = std::max(SrcPtr->getPointerAlignment(*DL), Alignment);
377 Type *LoadTy = Load->getType();
378 unsigned AS = Load->getPointerAddressSpace();
379
380 // Original pattern: insert_subvector (load PtrOp)
381 // This conservatively assumes that the cost of a subvector insert into an
382 // undef value is 0. We could add that cost if the cost model accurately
383 // reflects the real cost of that operation.
384 InstructionCost OldCost =
385 TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS, CostKind);
386
387 // New pattern: load PtrOp
388 InstructionCost NewCost =
389 TTI.getMemoryOpCost(Instruction::Load, Ty, Alignment, AS, CostKind);
390
391 // We can aggressively convert to the vector form because the backend can
392 // invert this transform if it does not result in a performance win.
393 if (OldCost < NewCost || !NewCost.isValid())
394 return false;
395
396 IRBuilder<> Builder(Load);
397 Value *CastedPtr =
398 Builder.CreatePointerBitCastOrAddrSpaceCast(SrcPtr, Builder.getPtrTy(AS));
399 Value *VecLd = Builder.CreateAlignedLoad(Ty, CastedPtr, Alignment);
400 replaceValue(I, *VecLd);
401 ++NumVecLoad;
402 return true;
403}
404
405/// Determine which, if any, of the inputs should be replaced by a shuffle
406/// followed by extract from a different index.
407ExtractElementInst *VectorCombine::getShuffleExtract(
408 ExtractElementInst *Ext0, ExtractElementInst *Ext1,
409 unsigned PreferredExtractIndex = InvalidIndex) const {
410 auto *Index0C = dyn_cast<ConstantInt>(Ext0->getIndexOperand());
411 auto *Index1C = dyn_cast<ConstantInt>(Ext1->getIndexOperand());
412 assert(Index0C && Index1C && "Expected constant extract indexes");
413
414 unsigned Index0 = Index0C->getZExtValue();
415 unsigned Index1 = Index1C->getZExtValue();
416
417 // If the extract indexes are identical, no shuffle is needed.
418 if (Index0 == Index1)
419 return nullptr;
420
421 Type *VecTy = Ext0->getVectorOperand()->getType();
422 assert(VecTy == Ext1->getVectorOperand()->getType() && "Need matching types");
423 InstructionCost Cost0 =
424 TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Index0);
425 InstructionCost Cost1 =
426 TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Index1);
427
428 // If both costs are invalid no shuffle is needed
429 if (!Cost0.isValid() && !Cost1.isValid())
430 return nullptr;
431
432 // We are extracting from 2 different indexes, so one operand must be shuffled
433 // before performing a vector operation and/or extract. The more expensive
434 // extract will be replaced by a shuffle.
435 if (Cost0 > Cost1)
436 return Ext0;
437 if (Cost1 > Cost0)
438 return Ext1;
439
440 // If the costs are equal and there is a preferred extract index, shuffle the
441 // opposite operand.
442 if (PreferredExtractIndex == Index0)
443 return Ext1;
444 if (PreferredExtractIndex == Index1)
445 return Ext0;
446
447 // Otherwise, replace the extract with the higher index.
448 return Index0 > Index1 ? Ext0 : Ext1;
449}
450
451/// Compare the relative costs of 2 extracts followed by scalar operation vs.
452/// vector operation(s) followed by extract. Return true if the existing
453/// instructions are cheaper than a vector alternative. Otherwise, return false
454/// and if one of the extracts should be transformed to a shufflevector, set
455/// \p ConvertToShuffle to that extract instruction.
456bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0,
457 ExtractElementInst *Ext1,
458 const Instruction &I,
459 ExtractElementInst *&ConvertToShuffle,
460 unsigned PreferredExtractIndex) {
461 auto *Ext0IndexC = dyn_cast<ConstantInt>(Ext0->getIndexOperand());
462 auto *Ext1IndexC = dyn_cast<ConstantInt>(Ext1->getIndexOperand());
463 assert(Ext0IndexC && Ext1IndexC && "Expected constant extract indexes");
464
465 unsigned Opcode = I.getOpcode();
466 Value *Ext0Src = Ext0->getVectorOperand();
467 Value *Ext1Src = Ext1->getVectorOperand();
468 Type *ScalarTy = Ext0->getType();
469 auto *VecTy = cast<VectorType>(Ext0Src->getType());
470 InstructionCost ScalarOpCost, VectorOpCost;
471
472 // Get cost estimates for scalar and vector versions of the operation.
473 bool IsBinOp = Instruction::isBinaryOp(Opcode);
474 if (IsBinOp) {
475 ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy, CostKind);
476 VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy, CostKind);
477 } else {
478 assert((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
479 "Expected a compare");
480 CmpInst::Predicate Pred = cast<CmpInst>(I).getPredicate();
481 ScalarOpCost = TTI.getCmpSelInstrCost(
482 Opcode, ScalarTy, CmpInst::makeCmpResultType(ScalarTy), Pred, CostKind);
483 VectorOpCost = TTI.getCmpSelInstrCost(
484 Opcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred, CostKind);
485 }
486
487 // Get cost estimates for the extract elements. These costs will factor into
488 // both sequences.
489 unsigned Ext0Index = Ext0IndexC->getZExtValue();
490 unsigned Ext1Index = Ext1IndexC->getZExtValue();
491
492 InstructionCost Extract0Cost =
493 TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Ext0Index);
494 InstructionCost Extract1Cost =
495 TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Ext1Index);
496
497 // A more expensive extract will always be replaced by a splat shuffle.
498 // For example, if Ext0 is more expensive:
499 // opcode (extelt V0, Ext0), (ext V1, Ext1) -->
500 // extelt (opcode (splat V0, Ext0), V1), Ext1
501 // TODO: Evaluate whether that always results in lowest cost. Alternatively,
502 // check the cost of creating a broadcast shuffle and shuffling both
503 // operands to element 0.
504 unsigned BestExtIndex = Extract0Cost > Extract1Cost ? Ext0Index : Ext1Index;
505 unsigned BestInsIndex = Extract0Cost > Extract1Cost ? Ext1Index : Ext0Index;
506 InstructionCost CheapExtractCost = std::min(Extract0Cost, Extract1Cost);
507
508 // Extra uses of the extracts mean that we include those costs in the
509 // vector total because those instructions will not be eliminated.
510 InstructionCost OldCost, NewCost;
511 if (Ext0Src == Ext1Src && Ext0Index == Ext1Index) {
512 // Handle a special case. If the 2 extracts are identical, adjust the
513 // formulas to account for that. The extra use charge allows for either the
514 // CSE'd pattern or an unoptimized form with identical values:
515 // opcode (extelt V, C), (extelt V, C) --> extelt (opcode V, V), C
516 bool HasUseTax = Ext0 == Ext1 ? !Ext0->hasNUses(2)
517 : !Ext0->hasOneUse() || !Ext1->hasOneUse();
518 OldCost = CheapExtractCost + ScalarOpCost;
519 NewCost = VectorOpCost + CheapExtractCost + HasUseTax * CheapExtractCost;
520 } else {
521 // Handle the general case. Each extract is actually a different value:
522 // opcode (extelt V0, C0), (extelt V1, C1) --> extelt (opcode V0, V1), C
523 OldCost = Extract0Cost + Extract1Cost + ScalarOpCost;
524 NewCost = VectorOpCost + CheapExtractCost +
525 !Ext0->hasOneUse() * Extract0Cost +
526 !Ext1->hasOneUse() * Extract1Cost;
527 }
528
529 ConvertToShuffle = getShuffleExtract(Ext0, Ext1, PreferredExtractIndex);
530 if (ConvertToShuffle) {
531 if (IsBinOp && DisableBinopExtractShuffle)
532 return true;
533
534 // If we are extracting from 2 different indexes, then one operand must be
535 // shuffled before performing the vector operation. The shuffle mask is
536 // poison except for 1 lane that is being translated to the remaining
537 // extraction lane. Therefore, it is a splat shuffle. Ex:
538 // ShufMask = { poison, poison, 0, poison }
539 // TODO: The cost model has an option for a "broadcast" shuffle
540 // (splat-from-element-0), but no option for a more general splat.
541 if (auto *FixedVecTy = dyn_cast<FixedVectorType>(VecTy)) {
      SmallVector<int> ShuffleMask(FixedVecTy->getNumElements(),
                                   PoisonMaskElem);
      ShuffleMask[BestInsIndex] = BestExtIndex;
      NewCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc,
                                    VecTy, VecTy, ShuffleMask, CostKind, 0,
                                    nullptr, {ConvertToShuffle});
    } else {
      NewCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc,
                                    VecTy, VecTy, {}, CostKind, 0, nullptr,
                                    {ConvertToShuffle});
552 }
553 }
554
555 // Aggressively form a vector op if the cost is equal because the transform
556 // may enable further optimization.
557 // Codegen can reverse this transform (scalarize) if it was not profitable.
558 return OldCost < NewCost;
559}
560
561/// Create a shuffle that translates (shifts) 1 element from the input vector
562/// to a new element location.
563static Value *createShiftShuffle(Value *Vec, unsigned OldIndex,
564 unsigned NewIndex, IRBuilderBase &Builder) {
565 // The shuffle mask is poison except for 1 lane that is being translated
566 // to the new element index. Example for OldIndex == 2 and NewIndex == 0:
567 // ShufMask = { 2, poison, poison, poison }
568 auto *VecTy = cast<FixedVectorType>(Vec->getType());
569 SmallVector<int, 32> ShufMask(VecTy->getNumElements(), PoisonMaskElem);
570 ShufMask[NewIndex] = OldIndex;
571 return Builder.CreateShuffleVector(Vec, ShufMask, "shift");
572}
573
574/// Given an extract element instruction with constant index operand, shuffle
575/// the source vector (shift the scalar element) to a NewIndex for extraction.
576/// Return null if the input can be constant folded, so that we are not creating
577/// unnecessary instructions.
578static Value *translateExtract(ExtractElementInst *ExtElt, unsigned NewIndex,
579 IRBuilderBase &Builder) {
580 // Shufflevectors can only be created for fixed-width vectors.
581 Value *X = ExtElt->getVectorOperand();
582 if (!isa<FixedVectorType>(X->getType()))
583 return nullptr;
584
585 // If the extract can be constant-folded, this code is unsimplified. Defer
586 // to other passes to handle that.
587 Value *C = ExtElt->getIndexOperand();
588 assert(isa<ConstantInt>(C) && "Expected a constant index operand");
589 if (isa<Constant>(X))
590 return nullptr;
591
592 Value *Shuf = createShiftShuffle(X, cast<ConstantInt>(C)->getZExtValue(),
593 NewIndex, Builder);
594 return Shuf;
595}
596
597/// Try to reduce extract element costs by converting scalar compares to vector
598/// compares followed by extract.
599/// cmp (ext0 V0, ExtIndex), (ext1 V1, ExtIndex)
600Value *VectorCombine::foldExtExtCmp(Value *V0, Value *V1, Value *ExtIndex,
601 Instruction &I) {
602 assert(isa<CmpInst>(&I) && "Expected a compare");
603
604 // cmp Pred (extelt V0, ExtIndex), (extelt V1, ExtIndex)
605 // --> extelt (cmp Pred V0, V1), ExtIndex
606 ++NumVecCmp;
607 CmpInst::Predicate Pred = cast<CmpInst>(&I)->getPredicate();
608 Value *VecCmp = Builder.CreateCmp(Pred, V0, V1);
609 return Builder.CreateExtractElement(VecCmp, ExtIndex, "foldExtExtCmp");
610}
611
612/// Try to reduce extract element costs by converting scalar binops to vector
613/// binops followed by extract.
614/// bo (ext0 V0, ExtIndex), (ext1 V1, ExtIndex)
615Value *VectorCombine::foldExtExtBinop(Value *V0, Value *V1, Value *ExtIndex,
616 Instruction &I) {
617 assert(isa<BinaryOperator>(&I) && "Expected a binary operator");
618
619 // bo (extelt V0, ExtIndex), (extelt V1, ExtIndex)
620 // --> extelt (bo V0, V1), ExtIndex
621 ++NumVecBO;
622 Value *VecBO = Builder.CreateBinOp(cast<BinaryOperator>(&I)->getOpcode(), V0,
623 V1, "foldExtExtBinop");
624
625 // All IR flags are safe to back-propagate because any potential poison
626 // created in unused vector elements is discarded by the extract.
627 if (auto *VecBOInst = dyn_cast<Instruction>(VecBO))
628 VecBOInst->copyIRFlags(&I);
629
630 return Builder.CreateExtractElement(VecBO, ExtIndex, "foldExtExtBinop");
631}
632
633/// Match an instruction with extracted vector operands.
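/// Illustrative example (made-up IR) of the binop case:
///   %e0 = extractelement <4 x i32> %x, i32 1
///   %e1 = extractelement <4 x i32> %y, i32 1
///   %r  = add i32 %e0, %e1
/// -->
///   %v = add <4 x i32> %x, %y
///   %r = extractelement <4 x i32> %v, i32 1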
634bool VectorCombine::foldExtractExtract(Instruction &I) {
635 // It is not safe to transform things like div, urem, etc. because we may
636 // create undefined behavior when executing those on unknown vector elements.
  if (!isSafeToSpeculativelyExecute(&I))
    return false;
639
640 Instruction *I0, *I1;
641 CmpPredicate Pred = CmpInst::BAD_ICMP_PREDICATE;
642 if (!match(&I, m_Cmp(Pred, m_Instruction(I0), m_Instruction(I1))) &&
      !match(&I, m_BinOp(m_Instruction(I0), m_Instruction(I1))))
    return false;
645
646 Value *V0, *V1;
647 uint64_t C0, C1;
648 if (!match(I0, m_ExtractElt(m_Value(V0), m_ConstantInt(C0))) ||
649 !match(I1, m_ExtractElt(m_Value(V1), m_ConstantInt(C1))) ||
650 V0->getType() != V1->getType())
651 return false;
652
653 // If the scalar value 'I' is going to be re-inserted into a vector, then try
654 // to create an extract to that same element. The extract/insert can be
655 // reduced to a "select shuffle".
656 // TODO: If we add a larger pattern match that starts from an insert, this
657 // probably becomes unnecessary.
658 auto *Ext0 = cast<ExtractElementInst>(I0);
659 auto *Ext1 = cast<ExtractElementInst>(I1);
660 uint64_t InsertIndex = InvalidIndex;
661 if (I.hasOneUse())
662 match(I.user_back(),
663 m_InsertElt(m_Value(), m_Value(), m_ConstantInt(InsertIndex)));
664
665 ExtractElementInst *ExtractToChange;
666 if (isExtractExtractCheap(Ext0, Ext1, I, ExtractToChange, InsertIndex))
667 return false;
668
669 Value *ExtOp0 = Ext0->getVectorOperand();
670 Value *ExtOp1 = Ext1->getVectorOperand();
671
672 if (ExtractToChange) {
673 unsigned CheapExtractIdx = ExtractToChange == Ext0 ? C1 : C0;
674 Value *NewExtOp =
675 translateExtract(ExtractToChange, CheapExtractIdx, Builder);
676 if (!NewExtOp)
677 return false;
678 if (ExtractToChange == Ext0)
679 ExtOp0 = NewExtOp;
680 else
681 ExtOp1 = NewExtOp;
682 }
683
684 Value *ExtIndex = ExtractToChange == Ext0 ? Ext1->getIndexOperand()
685 : Ext0->getIndexOperand();
686 Value *NewExt = Pred != CmpInst::BAD_ICMP_PREDICATE
687 ? foldExtExtCmp(ExtOp0, ExtOp1, ExtIndex, I)
688 : foldExtExtBinop(ExtOp0, ExtOp1, ExtIndex, I);
689 Worklist.push(Ext0);
690 Worklist.push(Ext1);
691 replaceValue(I, *NewExt);
692 return true;
693}
694
695/// Try to replace an extract + scalar fneg + insert with a vector fneg +
696/// shuffle.
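/// Illustrative example (made-up IR, matching source/destination lengths):
///   %e = extractelement <4 x float> %src, i64 1
///   %n = fneg float %e
///   %r = insertelement <4 x float> %dst, float %n, i64 1
/// -->
///   %vn = fneg <4 x float> %src
///   %r  = shufflevector <4 x float> %dst, <4 x float> %vn,
///                       <4 x i32> <i32 0, i32 5, i32 2, i32 3>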
697bool VectorCombine::foldInsExtFNeg(Instruction &I) {
698 // Match an insert (op (extract)) pattern.
699 Value *DestVec;
700 uint64_t Index;
701 Instruction *FNeg;
702 if (!match(&I, m_InsertElt(m_Value(DestVec), m_OneUse(m_Instruction(FNeg)),
703 m_ConstantInt(Index))))
704 return false;
705
706 // Note: This handles the canonical fneg instruction and "fsub -0.0, X".
707 Value *SrcVec;
708 Instruction *Extract;
709 if (!match(FNeg, m_FNeg(m_CombineAnd(
710 m_Instruction(Extract),
711 m_ExtractElt(m_Value(SrcVec), m_SpecificInt(Index))))))
712 return false;
713
714 auto *VecTy = cast<FixedVectorType>(I.getType());
715 auto *ScalarTy = VecTy->getScalarType();
716 auto *SrcVecTy = dyn_cast<FixedVectorType>(SrcVec->getType());
717 if (!SrcVecTy || ScalarTy != SrcVecTy->getScalarType())
718 return false;
719
720 // Ignore bogus insert/extract index.
721 unsigned NumElts = VecTy->getNumElements();
722 if (Index >= NumElts)
723 return false;
724
725 // We are inserting the negated element into the same lane that we extracted
726 // from. This is equivalent to a select-shuffle that chooses all but the
727 // negated element from the destination vector.
728 SmallVector<int> Mask(NumElts);
729 std::iota(Mask.begin(), Mask.end(), 0);
730 Mask[Index] = Index + NumElts;
731 InstructionCost OldCost =
732 TTI.getArithmeticInstrCost(Instruction::FNeg, ScalarTy, CostKind) +
733 TTI.getVectorInstrCost(I, VecTy, CostKind, Index);
734
735 // If the extract has one use, it will be eliminated, so count it in the
736 // original cost. If it has more than one use, ignore the cost because it will
737 // be the same before/after.
738 if (Extract->hasOneUse())
739 OldCost += TTI.getVectorInstrCost(*Extract, VecTy, CostKind, Index);
740
741 InstructionCost NewCost =
742 TTI.getArithmeticInstrCost(Instruction::FNeg, VecTy, CostKind) +
      TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, VecTy, VecTy,
                         Mask, CostKind);
745
746 bool NeedLenChg = SrcVecTy->getNumElements() != NumElts;
747 // If the lengths of the two vectors are not equal,
748 // we need to add a length-change vector. Add this cost.
749 SmallVector<int> SrcMask;
750 if (NeedLenChg) {
751 SrcMask.assign(NumElts, PoisonMaskElem);
752 SrcMask[Index] = Index;
    NewCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc,
                                  VecTy, SrcVecTy, SrcMask, CostKind);
755 }
756
757 if (NewCost > OldCost)
758 return false;
759
760 Value *NewShuf;
761 // insertelt DestVec, (fneg (extractelt SrcVec, Index)), Index
762 Value *VecFNeg = Builder.CreateFNegFMF(SrcVec, FNeg);
763 if (NeedLenChg) {
764 // shuffle DestVec, (shuffle (fneg SrcVec), poison, SrcMask), Mask
765 Value *LenChgShuf = Builder.CreateShuffleVector(VecFNeg, SrcMask);
766 NewShuf = Builder.CreateShuffleVector(DestVec, LenChgShuf, Mask);
767 } else {
768 // shuffle DestVec, (fneg SrcVec), Mask
769 NewShuf = Builder.CreateShuffleVector(DestVec, VecFNeg, Mask);
770 }
771
772 replaceValue(I, *NewShuf);
773 return true;
774}
775
776/// Try to fold insert(binop(x,y),binop(a,b),idx)
777/// --> binop(insert(x,a,idx),insert(y,b,idx))
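/// Illustrative example (made-up IR; both binops must be single-use):
///   %v = add <4 x i32> %x, %y
///   %s = add i32 %a, %b
///   %r = insertelement <4 x i32> %v, i32 %s, i64 2
/// -->
///   %x1 = insertelement <4 x i32> %x, i32 %a, i64 2
///   %y1 = insertelement <4 x i32> %y, i32 %b, i64 2
///   %r  = add <4 x i32> %x1, %y1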
778bool VectorCombine::foldInsExtBinop(Instruction &I) {
779 BinaryOperator *VecBinOp, *SclBinOp;
780 uint64_t Index;
781 if (!match(&I,
782 m_InsertElt(m_OneUse(m_BinOp(VecBinOp)),
783 m_OneUse(m_BinOp(SclBinOp)), m_ConstantInt(Index))))
784 return false;
785
786 // TODO: Add support for addlike etc.
787 Instruction::BinaryOps BinOpcode = VecBinOp->getOpcode();
788 if (BinOpcode != SclBinOp->getOpcode())
789 return false;
790
791 auto *ResultTy = dyn_cast<FixedVectorType>(I.getType());
792 if (!ResultTy)
793 return false;
794
795 // TODO: Attempt to detect m_ExtractElt for scalar operands and convert to
796 // shuffle?
797
  InstructionCost OldCost = TTI.getInstructionCost(&I, CostKind) +
                            TTI.getInstructionCost(VecBinOp, CostKind) +
                            TTI.getInstructionCost(SclBinOp, CostKind);
  InstructionCost NewCost =
802 TTI.getArithmeticInstrCost(BinOpcode, ResultTy, CostKind) +
803 TTI.getVectorInstrCost(Instruction::InsertElement, ResultTy, CostKind,
804 Index, VecBinOp->getOperand(0),
805 SclBinOp->getOperand(0)) +
806 TTI.getVectorInstrCost(Instruction::InsertElement, ResultTy, CostKind,
807 Index, VecBinOp->getOperand(1),
808 SclBinOp->getOperand(1));
809
810 LLVM_DEBUG(dbgs() << "Found an insertion of two binops: " << I
811 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
812 << "\n");
813 if (NewCost > OldCost)
814 return false;
815
816 Value *NewIns0 = Builder.CreateInsertElement(VecBinOp->getOperand(0),
817 SclBinOp->getOperand(0), Index);
818 Value *NewIns1 = Builder.CreateInsertElement(VecBinOp->getOperand(1),
819 SclBinOp->getOperand(1), Index);
820 Value *NewBO = Builder.CreateBinOp(BinOpcode, NewIns0, NewIns1);
821
822 // Intersect flags from the old binops.
823 if (auto *NewInst = dyn_cast<Instruction>(NewBO)) {
824 NewInst->copyIRFlags(VecBinOp);
825 NewInst->andIRFlags(SclBinOp);
826 }
827
828 Worklist.pushValue(NewIns0);
829 Worklist.pushValue(NewIns1);
830 replaceValue(I, *NewBO);
831 return true;
832}
833
834/// Match: bitop(castop(x), castop(y)) -> castop(bitop(x, y))
835/// Supports: bitcast, trunc, sext, zext
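/// Illustrative example (made-up IR):
///   %xe = zext <4 x i16> %x to <4 x i32>
///   %ye = zext <4 x i16> %y to <4 x i32>
///   %r  = and <4 x i32> %xe, %ye
/// -->
///   %a = and <4 x i16> %x, %y
///   %r = zext <4 x i16> %a to <4 x i32>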
836bool VectorCombine::foldBitOpOfCastops(Instruction &I) {
837 // Check if this is a bitwise logic operation
838 auto *BinOp = dyn_cast<BinaryOperator>(&I);
839 if (!BinOp || !BinOp->isBitwiseLogicOp())
840 return false;
841
842 // Get the cast instructions
843 auto *LHSCast = dyn_cast<CastInst>(BinOp->getOperand(0));
844 auto *RHSCast = dyn_cast<CastInst>(BinOp->getOperand(1));
845 if (!LHSCast || !RHSCast) {
846 LLVM_DEBUG(dbgs() << " One or both operands are not cast instructions\n");
847 return false;
848 }
849
850 // Both casts must be the same type
851 Instruction::CastOps CastOpcode = LHSCast->getOpcode();
852 if (CastOpcode != RHSCast->getOpcode())
853 return false;
854
855 // Only handle supported cast operations
856 switch (CastOpcode) {
857 case Instruction::BitCast:
858 case Instruction::Trunc:
859 case Instruction::SExt:
860 case Instruction::ZExt:
861 break;
862 default:
863 return false;
864 }
865
866 Value *LHSSrc = LHSCast->getOperand(0);
867 Value *RHSSrc = RHSCast->getOperand(0);
868
869 // Source types must match
870 if (LHSSrc->getType() != RHSSrc->getType())
871 return false;
872
873 auto *SrcTy = LHSSrc->getType();
874 auto *DstTy = I.getType();
875 // Bitcasts can handle scalar/vector mixes, such as i16 -> <16 x i1>.
876 // Other casts only handle vector types with integer elements.
877 if (CastOpcode != Instruction::BitCast &&
878 (!isa<FixedVectorType>(SrcTy) || !isa<FixedVectorType>(DstTy)))
879 return false;
880
881 // Only integer scalar/vector values are legal for bitwise logic operations.
882 if (!SrcTy->getScalarType()->isIntegerTy() ||
883 !DstTy->getScalarType()->isIntegerTy())
884 return false;
885
886 // Cost Check :
887 // OldCost = bitlogic + 2*casts
888 // NewCost = bitlogic + cast
889
890 // Calculate specific costs for each cast with instruction context
  InstructionCost LHSCastCost = TTI.getCastInstrCost(
      CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind, LHSCast);
  InstructionCost RHSCastCost = TTI.getCastInstrCost(
      CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind, RHSCast);
895
896 InstructionCost OldCost =
897 TTI.getArithmeticInstrCost(BinOp->getOpcode(), DstTy, CostKind) +
898 LHSCastCost + RHSCastCost;
899
900 // For new cost, we can't provide an instruction (it doesn't exist yet)
901 InstructionCost GenericCastCost = TTI.getCastInstrCost(
902 CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind);
903
904 InstructionCost NewCost =
905 TTI.getArithmeticInstrCost(BinOp->getOpcode(), SrcTy, CostKind) +
906 GenericCastCost;
907
908 // Account for multi-use casts using specific costs
909 if (!LHSCast->hasOneUse())
910 NewCost += LHSCastCost;
911 if (!RHSCast->hasOneUse())
912 NewCost += RHSCastCost;
913
914 LLVM_DEBUG(dbgs() << "foldBitOpOfCastops: OldCost=" << OldCost
915 << " NewCost=" << NewCost << "\n");
916
917 if (NewCost > OldCost)
918 return false;
919
920 // Create the operation on the source type
921 Value *NewOp = Builder.CreateBinOp(BinOp->getOpcode(), LHSSrc, RHSSrc,
922 BinOp->getName() + ".inner");
923 if (auto *NewBinOp = dyn_cast<BinaryOperator>(NewOp))
924 NewBinOp->copyIRFlags(BinOp);
925
926 Worklist.pushValue(NewOp);
927
928 // Create the cast operation directly to ensure we get a new instruction
929 Instruction *NewCast = CastInst::Create(CastOpcode, NewOp, I.getType());
930
931 // Preserve cast instruction flags
932 NewCast->copyIRFlags(LHSCast);
933 NewCast->andIRFlags(RHSCast);
934
935 // Insert the new instruction
936 Value *Result = Builder.Insert(NewCast);
937
938 replaceValue(I, *Result);
939 return true;
940}
941
942/// Match:
943// bitop(castop(x), C) ->
944// bitop(castop(x), castop(InvC)) ->
945// castop(bitop(x, InvC))
946// Supports: bitcast
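// Illustrative example (made-up IR, bitcast case):
//   %b = bitcast <4 x i16> %x to <2 x i32>
//   %r = xor <2 x i32> %b, <i32 -1, i32 -1>
// -->
//   %a = xor <4 x i16> %x, <i16 -1, i16 -1, i16 -1, i16 -1>
//   %r = bitcast <4 x i16> %a to <2 x i32>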
947bool VectorCombine::foldBitOpOfCastConstant(Instruction &I) {
  Value *LHS;
  Constant *C;
950
951 // Check if this is a bitwise logic operation
  if (!match(&I, m_c_BitwiseLogic(m_Value(LHS), m_Constant(C))))
    return false;
954
955 // Get the cast instructions
956 auto *LHSCast = dyn_cast<CastInst>(LHS);
957 if (!LHSCast)
958 return false;
959
960 Instruction::CastOps CastOpcode = LHSCast->getOpcode();
961
962 // Only handle supported cast operations
963 switch (CastOpcode) {
964 case Instruction::BitCast:
965 case Instruction::ZExt:
966 case Instruction::SExt:
967 case Instruction::Trunc:
968 break;
969 default:
970 return false;
971 }
972
973 Value *LHSSrc = LHSCast->getOperand(0);
974
975 auto *SrcTy = LHSSrc->getType();
976 auto *DstTy = I.getType();
977 // Bitcasts can handle scalar/vector mixes, such as i16 -> <16 x i1>.
978 // Other casts only handle vector types with integer elements.
979 if (CastOpcode != Instruction::BitCast &&
980 (!isa<FixedVectorType>(SrcTy) || !isa<FixedVectorType>(DstTy)))
981 return false;
982
983 // Only integer scalar/vector values are legal for bitwise logic operations.
984 if (!SrcTy->getScalarType()->isIntegerTy() ||
985 !DstTy->getScalarType()->isIntegerTy())
986 return false;
987
988 // Find the constant InvC, such that castop(InvC) equals to C.
989 PreservedCastFlags RHSFlags;
990 Constant *InvC = getLosslessInvCast(C, SrcTy, CastOpcode, *DL, &RHSFlags);
991 if (!InvC)
992 return false;
993
994 // Cost Check :
995 // OldCost = bitlogic + cast
996 // NewCost = bitlogic + cast
997
998 // Calculate specific costs for each cast with instruction context
  InstructionCost LHSCastCost = TTI.getCastInstrCost(
      CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind, LHSCast);
1001
1002 InstructionCost OldCost =
1003 TTI.getArithmeticInstrCost(I.getOpcode(), DstTy, CostKind) + LHSCastCost;
1004
1005 // For new cost, we can't provide an instruction (it doesn't exist yet)
1006 InstructionCost GenericCastCost = TTI.getCastInstrCost(
1007 CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind);
1008
1009 InstructionCost NewCost =
1010 TTI.getArithmeticInstrCost(I.getOpcode(), SrcTy, CostKind) +
1011 GenericCastCost;
1012
1013 // Account for multi-use casts using specific costs
1014 if (!LHSCast->hasOneUse())
1015 NewCost += LHSCastCost;
1016
1017 LLVM_DEBUG(dbgs() << "foldBitOpOfCastConstant: OldCost=" << OldCost
1018 << " NewCost=" << NewCost << "\n");
1019
1020 if (NewCost > OldCost)
1021 return false;
1022
1023 // Create the operation on the source type
1024 Value *NewOp = Builder.CreateBinOp((Instruction::BinaryOps)I.getOpcode(),
1025 LHSSrc, InvC, I.getName() + ".inner");
1026 if (auto *NewBinOp = dyn_cast<BinaryOperator>(NewOp))
1027 NewBinOp->copyIRFlags(&I);
1028
1029 Worklist.pushValue(NewOp);
1030
1031 // Create the cast operation directly to ensure we get a new instruction
1032 Instruction *NewCast = CastInst::Create(CastOpcode, NewOp, I.getType());
1033
1034 // Insert the new instruction
1035 Value *Result = Builder.Insert(NewCast);
1036
1037 replaceValue(I, *Result);
1038 return true;
1039}
1040
1041/// If this is a bitcast of a shuffle, try to bitcast the source vector to the
1042/// destination type followed by shuffle. This can enable further transforms by
1043/// moving bitcasts or shuffles together.
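/// Illustrative example (made-up IR, wide-to-narrow bitcast of a unary
/// shuffle; the mask is re-expressed in terms of the narrower elements):
///   %s = shufflevector <4 x i32> %x, <4 x i32> poison,
///                      <4 x i32> <i32 2, i32 3, i32 0, i32 1>
///   %r = bitcast <4 x i32> %s to <8 x i16>
/// -->
///   %b = bitcast <4 x i32> %x to <8 x i16>
///   %r = shufflevector <8 x i16> %b, <8 x i16> poison,
///        <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>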
1044bool VectorCombine::foldBitcastShuffle(Instruction &I) {
1045 Value *V0, *V1;
1046 ArrayRef<int> Mask;
1047 if (!match(&I, m_BitCast(m_OneUse(
1048 m_Shuffle(m_Value(V0), m_Value(V1), m_Mask(Mask))))))
1049 return false;
1050
1051 // 1) Do not fold bitcast shuffle for scalable type. First, shuffle cost for
1052 // scalable type is unknown; Second, we cannot reason if the narrowed shuffle
1053 // mask for scalable type is a splat or not.
1054 // 2) Disallow non-vector casts.
1055 // TODO: We could allow any shuffle.
1056 auto *DestTy = dyn_cast<FixedVectorType>(I.getType());
1057 auto *SrcTy = dyn_cast<FixedVectorType>(V0->getType());
1058 if (!DestTy || !SrcTy)
1059 return false;
1060
1061 unsigned DestEltSize = DestTy->getScalarSizeInBits();
1062 unsigned SrcEltSize = SrcTy->getScalarSizeInBits();
1063 if (SrcTy->getPrimitiveSizeInBits() % DestEltSize != 0)
1064 return false;
1065
1066 bool IsUnary = isa<UndefValue>(V1);
1067
1068 // For binary shuffles, only fold bitcast(shuffle(X,Y))
1069 // if it won't increase the number of bitcasts.
  if (!IsUnary) {
    auto *BCTy0 = dyn_cast<FixedVectorType>(peekThroughBitcasts(V0)->getType());
    auto *BCTy1 = dyn_cast<FixedVectorType>(peekThroughBitcasts(V1)->getType());
    if (!(BCTy0 && BCTy0->getElementType() == DestTy->getElementType()) &&
1074 !(BCTy1 && BCTy1->getElementType() == DestTy->getElementType()))
1075 return false;
1076 }
1077
1078 SmallVector<int, 16> NewMask;
1079 if (DestEltSize <= SrcEltSize) {
1080 // The bitcast is from wide to narrow/equal elements. The shuffle mask can
1081 // always be expanded to the equivalent form choosing narrower elements.
1082 assert(SrcEltSize % DestEltSize == 0 && "Unexpected shuffle mask");
1083 unsigned ScaleFactor = SrcEltSize / DestEltSize;
1084 narrowShuffleMaskElts(ScaleFactor, Mask, NewMask);
1085 } else {
1086 // The bitcast is from narrow elements to wide elements. The shuffle mask
1087 // must choose consecutive elements to allow casting first.
1088 assert(DestEltSize % SrcEltSize == 0 && "Unexpected shuffle mask");
1089 unsigned ScaleFactor = DestEltSize / SrcEltSize;
1090 if (!widenShuffleMaskElts(ScaleFactor, Mask, NewMask))
1091 return false;
1092 }
1093
1094 // Bitcast the shuffle src - keep its original width but using the destination
1095 // scalar type.
1096 unsigned NumSrcElts = SrcTy->getPrimitiveSizeInBits() / DestEltSize;
1097 auto *NewShuffleTy =
1098 FixedVectorType::get(DestTy->getScalarType(), NumSrcElts);
1099 auto *OldShuffleTy =
1100 FixedVectorType::get(SrcTy->getScalarType(), Mask.size());
1101 unsigned NumOps = IsUnary ? 1 : 2;
1102
  // The new shuffle must not cost more than the old shuffle.
  TargetTransformInfo::ShuffleKind SK =
      IsUnary ? TargetTransformInfo::SK_PermuteSingleSrc
              : TargetTransformInfo::SK_PermuteTwoSrc;

1108 InstructionCost NewCost =
1109 TTI.getShuffleCost(SK, DestTy, NewShuffleTy, NewMask, CostKind) +
1110 (NumOps * TTI.getCastInstrCost(Instruction::BitCast, NewShuffleTy, SrcTy,
1111 TargetTransformInfo::CastContextHint::None,
1112 CostKind));
1113 InstructionCost OldCost =
1114 TTI.getShuffleCost(SK, OldShuffleTy, SrcTy, Mask, CostKind) +
1115 TTI.getCastInstrCost(Instruction::BitCast, DestTy, OldShuffleTy,
1116 TargetTransformInfo::CastContextHint::None,
1117 CostKind);
1118
1119 LLVM_DEBUG(dbgs() << "Found a bitcasted shuffle: " << I << "\n OldCost: "
1120 << OldCost << " vs NewCost: " << NewCost << "\n");
1121
1122 if (NewCost > OldCost || !NewCost.isValid())
1123 return false;
1124
1125 // bitcast (shuf V0, V1, MaskC) --> shuf (bitcast V0), (bitcast V1), MaskC'
1126 ++NumShufOfBitcast;
1127 Value *CastV0 = Builder.CreateBitCast(peekThroughBitcasts(V0), NewShuffleTy);
1128 Value *CastV1 = Builder.CreateBitCast(peekThroughBitcasts(V1), NewShuffleTy);
1129 Value *Shuf = Builder.CreateShuffleVector(CastV0, CastV1, NewMask);
1130 replaceValue(I, *Shuf);
1131 return true;
1132}
1133
1134/// VP Intrinsics whose vector operands are both splat values may be simplified
1135/// into the scalar version of the operation and the result splatted. This
1136/// can lead to scalarization down the line.
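/// Illustrative example (made-up IR): if %va and %vb are splats of %a and %b
/// and the mask is all-true, then
///   %r = call <4 x i32> @llvm.vp.add.v4i32(<4 x i32> %va, <4 x i32> %vb,
///                  <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 %evl)
/// can be rewritten as "%s = add i32 %a, %b" followed by a splat of %s,
/// provided the EVL is known non-zero or the scalar op cannot trap.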
1137bool VectorCombine::scalarizeVPIntrinsic(Instruction &I) {
1138 if (!isa<VPIntrinsic>(I))
1139 return false;
1140 VPIntrinsic &VPI = cast<VPIntrinsic>(I);
1141 Value *Op0 = VPI.getArgOperand(0);
1142 Value *Op1 = VPI.getArgOperand(1);
1143
1144 if (!isSplatValue(Op0) || !isSplatValue(Op1))
1145 return false;
1146
1147 // Check getSplatValue early in this function, to avoid doing unnecessary
1148 // work.
1149 Value *ScalarOp0 = getSplatValue(Op0);
1150 Value *ScalarOp1 = getSplatValue(Op1);
1151 if (!ScalarOp0 || !ScalarOp1)
1152 return false;
1153
1154 // For the binary VP intrinsics supported here, the result on disabled lanes
1155 // is a poison value. For now, only do this simplification if all lanes
1156 // are active.
1157 // TODO: Relax the condition that all lanes are active by using insertelement
1158 // on inactive lanes.
1159 auto IsAllTrueMask = [](Value *MaskVal) {
1160 if (Value *SplattedVal = getSplatValue(MaskVal))
1161 if (auto *ConstValue = dyn_cast<Constant>(SplattedVal))
1162 return ConstValue->isAllOnesValue();
1163 return false;
1164 };
1165 if (!IsAllTrueMask(VPI.getArgOperand(2)))
1166 return false;
1167
1168 // Check to make sure we support scalarization of the intrinsic
1169 Intrinsic::ID IntrID = VPI.getIntrinsicID();
1170 if (!VPBinOpIntrinsic::isVPBinOp(IntrID))
1171 return false;
1172
1173 // Calculate cost of splatting both operands into vectors and the vector
1174 // intrinsic
1175 VectorType *VecTy = cast<VectorType>(VPI.getType());
1176 SmallVector<int> Mask;
1177 if (auto *FVTy = dyn_cast<FixedVectorType>(VecTy))
1178 Mask.resize(FVTy->getNumElements(), 0);
1179 InstructionCost SplatCost =
1180 TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0) +
      TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, VecTy, Mask,
                         CostKind);
1183
1184 // Calculate the cost of the VP Intrinsic
  SmallVector<Type *> Args;
  for (Value *V : VPI.args())
1187 Args.push_back(V->getType());
1188 IntrinsicCostAttributes Attrs(IntrID, VecTy, Args);
1189 InstructionCost VectorOpCost = TTI.getIntrinsicInstrCost(Attrs, CostKind);
1190 InstructionCost OldCost = 2 * SplatCost + VectorOpCost;
1191
1192 // Determine scalar opcode
1193 std::optional<unsigned> FunctionalOpcode =
1194 VPI.getFunctionalOpcode();
1195 std::optional<Intrinsic::ID> ScalarIntrID = std::nullopt;
1196 if (!FunctionalOpcode) {
1197 ScalarIntrID = VPI.getFunctionalIntrinsicID();
1198 if (!ScalarIntrID)
1199 return false;
1200 }
1201
1202 // Calculate cost of scalarizing
1203 InstructionCost ScalarOpCost = 0;
1204 if (ScalarIntrID) {
1205 IntrinsicCostAttributes Attrs(*ScalarIntrID, VecTy->getScalarType(), Args);
1206 ScalarOpCost = TTI.getIntrinsicInstrCost(Attrs, CostKind);
1207 } else {
1208 ScalarOpCost = TTI.getArithmeticInstrCost(*FunctionalOpcode,
1209 VecTy->getScalarType(), CostKind);
1210 }
1211
1212 // The existing splats may be kept around if other instructions use them.
1213 InstructionCost CostToKeepSplats =
1214 (SplatCost * !Op0->hasOneUse()) + (SplatCost * !Op1->hasOneUse());
1215 InstructionCost NewCost = ScalarOpCost + SplatCost + CostToKeepSplats;
1216
1217 LLVM_DEBUG(dbgs() << "Found a VP Intrinsic to scalarize: " << VPI
1218 << "\n");
1219 LLVM_DEBUG(dbgs() << "Cost of Intrinsic: " << OldCost
1220 << ", Cost of scalarizing:" << NewCost << "\n");
1221
1222 // We want to scalarize unless the vector variant actually has lower cost.
1223 if (OldCost < NewCost || !NewCost.isValid())
1224 return false;
1225
1226 // Scalarize the intrinsic
1227 ElementCount EC = cast<VectorType>(Op0->getType())->getElementCount();
1228 Value *EVL = VPI.getArgOperand(3);
1229
1230 // If the VP op might introduce UB or poison, we can scalarize it provided
1231 // that we know the EVL > 0: If the EVL is zero, then the original VP op
1232 // becomes a no-op and thus won't be UB, so make sure we don't introduce UB by
1233 // scalarizing it.
1234 bool SafeToSpeculate;
1235 if (ScalarIntrID)
1236 SafeToSpeculate = Intrinsic::getFnAttributes(I.getContext(), *ScalarIntrID)
1237 .hasAttribute(Attribute::AttrKind::Speculatable);
1238 else
    SafeToSpeculate = isSafeToSpeculativelyExecuteWithOpcode(
        *FunctionalOpcode, &VPI, nullptr, &AC, &DT);
1241 if (!SafeToSpeculate &&
1242 !isKnownNonZero(EVL, SimplifyQuery(*DL, &DT, &AC, &VPI)))
1243 return false;
1244
1245 Value *ScalarVal =
1246 ScalarIntrID
1247 ? Builder.CreateIntrinsic(VecTy->getScalarType(), *ScalarIntrID,
1248 {ScalarOp0, ScalarOp1})
1249 : Builder.CreateBinOp((Instruction::BinaryOps)(*FunctionalOpcode),
1250 ScalarOp0, ScalarOp1);
1251
1252 replaceValue(VPI, *Builder.CreateVectorSplat(EC, ScalarVal));
1253 return true;
1254}
1255
1256/// Match a vector op/compare/intrinsic with at least one
1257/// inserted scalar operand and convert to scalar op/cmp/intrinsic followed
1258/// by insertelement.
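/// Illustrative example (made-up IR): with one variable lane at index 2,
///   %i = insertelement <4 x i32> <i32 11, i32 22, i32 33, i32 44>, i32 %x, i64 2
///   %r = add <4 x i32> %i, <i32 1, i32 2, i32 3, i32 4>
/// -->
///   %s = add i32 %x, 3
///   %r = insertelement <4 x i32> <i32 12, i32 24, i32 36, i32 48>, i32 %s, i64 2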
1259bool VectorCombine::scalarizeOpOrCmp(Instruction &I) {
1260 auto *UO = dyn_cast<UnaryOperator>(&I);
1261 auto *BO = dyn_cast<BinaryOperator>(&I);
1262 auto *CI = dyn_cast<CmpInst>(&I);
1263 auto *II = dyn_cast<IntrinsicInst>(&I);
1264 if (!UO && !BO && !CI && !II)
1265 return false;
1266
1267 // TODO: Allow intrinsics with different argument types
1268 if (II) {
1269 if (!isTriviallyVectorizable(II->getIntrinsicID()))
1270 return false;
1271 for (auto [Idx, Arg] : enumerate(II->args()))
1272 if (Arg->getType() != II->getType() &&
1273 !isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Idx, &TTI))
1274 return false;
1275 }
1276
1277 // Do not convert the vector condition of a vector select into a scalar
1278 // condition. That may cause problems for codegen because of differences in
1279 // boolean formats and register-file transfers.
1280 // TODO: Can we account for that in the cost model?
1281 if (CI)
1282 for (User *U : I.users())
1283 if (match(U, m_Select(m_Specific(&I), m_Value(), m_Value())))
1284 return false;
1285
1286 // Match constant vectors or scalars being inserted into constant vectors:
1287 // vec_op [VecC0 | (inselt VecC0, V0, Index)], ...
1288 SmallVector<Value *> VecCs, ScalarOps;
1289 std::optional<uint64_t> Index;
1290
1291 auto Ops = II ? II->args() : I.operands();
1292 for (auto [OpNum, Op] : enumerate(Ops)) {
1293 Constant *VecC;
1294 Value *V;
1295 uint64_t InsIdx = 0;
1296 if (match(Op.get(), m_InsertElt(m_Constant(VecC), m_Value(V),
1297 m_ConstantInt(InsIdx)))) {
1298 // Bail if any inserts are out of bounds.
1299 VectorType *OpTy = cast<VectorType>(Op->getType());
1300 if (OpTy->getElementCount().getKnownMinValue() <= InsIdx)
1301 return false;
1302 // All inserts must have the same index.
1303 // TODO: Deal with mismatched index constants and variable indexes?
1304 if (!Index)
1305 Index = InsIdx;
1306 else if (InsIdx != *Index)
1307 return false;
1308 VecCs.push_back(VecC);
1309 ScalarOps.push_back(V);
1310 } else if (II && isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(),
1311 OpNum, &TTI)) {
1312 VecCs.push_back(Op.get());
1313 ScalarOps.push_back(Op.get());
1314 } else if (match(Op.get(), m_Constant(VecC))) {
1315 VecCs.push_back(VecC);
1316 ScalarOps.push_back(nullptr);
1317 } else {
1318 return false;
1319 }
1320 }
1321
1322 // Bail if all operands are constant.
1323 if (!Index.has_value())
1324 return false;
1325
1326 VectorType *VecTy = cast<VectorType>(I.getType());
1327 Type *ScalarTy = VecTy->getScalarType();
1328 assert(VecTy->isVectorTy() &&
1329 (ScalarTy->isIntegerTy() || ScalarTy->isFloatingPointTy() ||
1330 ScalarTy->isPointerTy()) &&
1331 "Unexpected types for insert element into binop or cmp");
1332
1333 unsigned Opcode = I.getOpcode();
1334 InstructionCost ScalarOpCost, VectorOpCost;
1335 if (CI) {
1336 CmpInst::Predicate Pred = CI->getPredicate();
1337 ScalarOpCost = TTI.getCmpSelInstrCost(
1338 Opcode, ScalarTy, CmpInst::makeCmpResultType(ScalarTy), Pred, CostKind);
1339 VectorOpCost = TTI.getCmpSelInstrCost(
1340 Opcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred, CostKind);
1341 } else if (UO || BO) {
1342 ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy, CostKind);
1343 VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy, CostKind);
1344 } else {
1345 IntrinsicCostAttributes ScalarICA(
1346 II->getIntrinsicID(), ScalarTy,
1347 SmallVector<Type *>(II->arg_size(), ScalarTy));
1348 ScalarOpCost = TTI.getIntrinsicInstrCost(ScalarICA, CostKind);
1349 IntrinsicCostAttributes VectorICA(
1350 II->getIntrinsicID(), VecTy,
1351 SmallVector<Type *>(II->arg_size(), VecTy));
1352 VectorOpCost = TTI.getIntrinsicInstrCost(VectorICA, CostKind);
1353 }
1354
1355 // Fold the vector constants in the original vectors into a new base vector to
1356 // get more accurate cost modelling.
1357 Value *NewVecC = nullptr;
1358 if (CI)
1359 NewVecC = simplifyCmpInst(CI->getPredicate(), VecCs[0], VecCs[1], SQ);
1360 else if (UO)
1361 NewVecC =
1362 simplifyUnOp(UO->getOpcode(), VecCs[0], UO->getFastMathFlags(), SQ);
1363 else if (BO)
1364 NewVecC = simplifyBinOp(BO->getOpcode(), VecCs[0], VecCs[1], SQ);
1365 else if (II)
1366 NewVecC = simplifyCall(II, II->getCalledOperand(), VecCs, SQ);
1367
1368 if (!NewVecC)
1369 return false;
1370
1371 // Get cost estimate for the insert element. This cost will factor into
1372 // both sequences.
1373 InstructionCost OldCost = VectorOpCost;
1374 InstructionCost NewCost =
1375 ScalarOpCost + TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
1376 CostKind, *Index, NewVecC);
1377
1378 for (auto [Idx, Op, VecC, Scalar] : enumerate(Ops, VecCs, ScalarOps)) {
1379 if (!Scalar || (II && isVectorIntrinsicWithScalarOpAtArg(
1380 II->getIntrinsicID(), Idx, &TTI)))
1381 continue;
    InstructionCost InsertCost = TTI.getVectorInstrCost(
        Instruction::InsertElement, VecTy, CostKind, *Index, VecC, Scalar);
1384 OldCost += InsertCost;
1385 NewCost += !Op->hasOneUse() * InsertCost;
1386 }
1387
1388 // We want to scalarize unless the vector variant actually has lower cost.
1389 if (OldCost < NewCost || !NewCost.isValid())
1390 return false;
1391
1392 // vec_op (inselt VecC0, V0, Index), (inselt VecC1, V1, Index) -->
1393 // inselt NewVecC, (scalar_op V0, V1), Index
1394 if (CI)
1395 ++NumScalarCmp;
1396 else if (UO || BO)
1397 ++NumScalarOps;
1398 else
1399 ++NumScalarIntrinsic;
1400
1401 // For constant cases, extract the scalar element, this should constant fold.
1402 for (auto [OpIdx, Scalar, VecC] : enumerate(ScalarOps, VecCs))
1403 if (!Scalar)
      ScalarOps[OpIdx] = ConstantExpr::getExtractElement(
          cast<Constant>(VecC), Builder.getInt64(*Index));
1406
1407 Value *Scalar;
1408 if (CI)
1409 Scalar = Builder.CreateCmp(CI->getPredicate(), ScalarOps[0], ScalarOps[1]);
1410 else if (UO || BO)
1411 Scalar = Builder.CreateNAryOp(Opcode, ScalarOps);
1412 else
1413 Scalar = Builder.CreateIntrinsic(ScalarTy, II->getIntrinsicID(), ScalarOps);
1414
1415 Scalar->setName(I.getName() + ".scalar");
1416
1417 // All IR flags are safe to back-propagate. There is no potential for extra
1418 // poison to be created by the scalar instruction.
1419 if (auto *ScalarInst = dyn_cast<Instruction>(Scalar))
1420 ScalarInst->copyIRFlags(&I);
1421
1422 Value *Insert = Builder.CreateInsertElement(NewVecC, Scalar, *Index);
1423 replaceValue(I, *Insert);
1424 return true;
1425}
1426
1427/// Try to combine a scalar binop + 2 scalar compares of extracted elements of
1428/// a vector into vector operations followed by extract. Note: The SLP pass
1429/// may miss this pattern because of implementation problems.
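/// Illustrative example (made-up IR):
///   %e0 = extractelement <4 x i32> %x, i32 0
///   %e1 = extractelement <4 x i32> %x, i32 1
///   %c0 = icmp sgt i32 %e0, 10
///   %c1 = icmp sgt i32 %e1, 20
///   %r  = and i1 %c0, %c1
/// -->
///   %vc = icmp sgt <4 x i32> %x, <i32 10, i32 20, i32 poison, i32 poison>
///   %sh = shufflevector <4 x i1> %vc, <4 x i1> poison,
///                       <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
///   %vr = and <4 x i1> %vc, %sh
///   %r  = extractelement <4 x i1> %vr, i32 0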
1430bool VectorCombine::foldExtractedCmps(Instruction &I) {
1431 auto *BI = dyn_cast<BinaryOperator>(&I);
1432
1433 // We are looking for a scalar binop of booleans.
1434 // binop i1 (cmp Pred I0, C0), (cmp Pred I1, C1)
1435 if (!BI || !I.getType()->isIntegerTy(1))
1436 return false;
1437
1438 // The compare predicates should match, and each compare should have a
1439 // constant operand.
1440 Value *B0 = I.getOperand(0), *B1 = I.getOperand(1);
1441 Instruction *I0, *I1;
1442 Constant *C0, *C1;
1443 CmpPredicate P0, P1;
1444 if (!match(B0, m_Cmp(P0, m_Instruction(I0), m_Constant(C0))) ||
1445 !match(B1, m_Cmp(P1, m_Instruction(I1), m_Constant(C1))))
1446 return false;
1447
1448 auto MatchingPred = CmpPredicate::getMatching(P0, P1);
1449 if (!MatchingPred)
1450 return false;
1451
1452 // The compare operands must be extracts of the same vector with constant
1453 // extract indexes.
1454 Value *X;
1455 uint64_t Index0, Index1;
1456 if (!match(I0, m_ExtractElt(m_Value(X), m_ConstantInt(Index0))) ||
1457 !match(I1, m_ExtractElt(m_Specific(X), m_ConstantInt(Index1))))
1458 return false;
1459
1460 auto *Ext0 = cast<ExtractElementInst>(I0);
1461 auto *Ext1 = cast<ExtractElementInst>(I1);
1462 ExtractElementInst *ConvertToShuf = getShuffleExtract(Ext0, Ext1, CostKind);
1463 if (!ConvertToShuf)
1464 return false;
1465 assert((ConvertToShuf == Ext0 || ConvertToShuf == Ext1) &&
1466 "Unknown ExtractElementInst");
1467
1468 // The original scalar pattern is:
1469 // binop i1 (cmp Pred (ext X, Index0), C0), (cmp Pred (ext X, Index1), C1)
1470 CmpInst::Predicate Pred = *MatchingPred;
1471 unsigned CmpOpcode =
1472 CmpInst::isFPPredicate(Pred) ? Instruction::FCmp : Instruction::ICmp;
1473 auto *VecTy = dyn_cast<FixedVectorType>(X->getType());
1474 if (!VecTy)
1475 return false;
1476
1477 InstructionCost Ext0Cost =
1478 TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Index0);
1479 InstructionCost Ext1Cost =
1480 TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Index1);
  InstructionCost CmpCost = TTI.getCmpSelInstrCost(
      CmpOpcode, I0->getType(), CmpInst::makeCmpResultType(I0->getType()), Pred,
1483 CostKind);
1484
1485 InstructionCost OldCost =
1486 Ext0Cost + Ext1Cost + CmpCost * 2 +
1487 TTI.getArithmeticInstrCost(I.getOpcode(), I.getType(), CostKind);
1488
1489 // The proposed vector pattern is:
1490 // vcmp = cmp Pred X, VecC
1491 // ext (binop vNi1 vcmp, (shuffle vcmp, Index1)), Index0
1492 int CheapIndex = ConvertToShuf == Ext0 ? Index1 : Index0;
1493 int ExpensiveIndex = ConvertToShuf == Ext0 ? Index0 : Index1;
1494 auto *CmpTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(VecTy));
1495 InstructionCost NewCost = TTI.getCmpSelInstrCost(
1496 CmpOpcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred, CostKind);
1497 SmallVector<int, 32> ShufMask(VecTy->getNumElements(), PoisonMaskElem);
1498 ShufMask[CheapIndex] = ExpensiveIndex;
1499 NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, CmpTy,
1500 CmpTy, ShufMask, CostKind);
1501 NewCost += TTI.getArithmeticInstrCost(I.getOpcode(), CmpTy, CostKind);
1502 NewCost += TTI.getVectorInstrCost(*Ext0, CmpTy, CostKind, CheapIndex);
1503 NewCost += Ext0->hasOneUse() ? 0 : Ext0Cost;
1504 NewCost += Ext1->hasOneUse() ? 0 : Ext1Cost;
1505
1506 // Aggressively form vector ops if the cost is equal because the transform
1507 // may enable further optimization.
1508 // Codegen can reverse this transform (scalarize) if it was not profitable.
1509 if (OldCost < NewCost || !NewCost.isValid())
1510 return false;
1511
1512 // Create a vector constant from the 2 scalar constants.
1513 SmallVector<Constant *, 32> CmpC(VecTy->getNumElements(),
1514 PoisonValue::get(VecTy->getElementType()));
1515 CmpC[Index0] = C0;
1516 CmpC[Index1] = C1;
1517 Value *VCmp = Builder.CreateCmp(Pred, X, ConstantVector::get(CmpC));
1518 Value *Shuf = createShiftShuffle(VCmp, ExpensiveIndex, CheapIndex, Builder);
1519 Value *LHS = ConvertToShuf == Ext0 ? Shuf : VCmp;
1520 Value *RHS = ConvertToShuf == Ext0 ? VCmp : Shuf;
1521 Value *VecLogic = Builder.CreateBinOp(BI->getOpcode(), LHS, RHS);
1522 Value *NewExt = Builder.CreateExtractElement(VecLogic, CheapIndex);
1523 replaceValue(I, *NewExt);
1524 ++NumVecCmpBO;
1525 return true;
1526}
1527
1528 static void analyzeCostOfVecReduction(const IntrinsicInst &II,
1529 TTI::TargetCostKind CostKind,
1530 const TargetTransformInfo &TTI,
1531 InstructionCost &CostBeforeReduction,
1532 InstructionCost &CostAfterReduction) {
1533 Instruction *Op0, *Op1;
1534 auto *RedOp = dyn_cast<Instruction>(II.getOperand(0));
1535 auto *VecRedTy = cast<VectorType>(II.getOperand(0)->getType());
1536 unsigned ReductionOpc =
1537 getArithmeticReductionInstruction(II.getIntrinsicID());
1538 if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value()))) {
1539 bool IsUnsigned = isa<ZExtInst>(RedOp);
1540 auto *ExtType = cast<VectorType>(RedOp->getOperand(0)->getType());
1541
1542 CostBeforeReduction =
1543 TTI.getCastInstrCost(RedOp->getOpcode(), VecRedTy, ExtType,
1544 TTI::CastContextHint::None, CostKind);
1545 CostAfterReduction =
1546 TTI.getExtendedReductionCost(ReductionOpc, IsUnsigned, II.getType(),
1547 ExtType, FastMathFlags(), CostKind);
1548 return;
1549 }
1550 if (RedOp && II.getIntrinsicID() == Intrinsic::vector_reduce_add &&
1551 match(RedOp,
1552 m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) &&
1553 match(Op0, m_ZExtOrSExt(m_Value())) &&
1554 Op0->getOpcode() == Op1->getOpcode() &&
1555 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
1556 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
1557 // Matched reduce.add(ext(mul(ext(A), ext(B)))
1558 bool IsUnsigned = isa<ZExtInst>(Op0);
1559 auto *ExtType = cast<VectorType>(Op0->getOperand(0)->getType());
1560 VectorType *MulType = VectorType::get(Op0->getType(), VecRedTy);
1561
1562 InstructionCost ExtCost =
1563 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
1564 TTI::CastContextHint::None, CostKind);
1565 InstructionCost MulCost =
1566 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
1567 InstructionCost Ext2Cost =
1568 TTI.getCastInstrCost(RedOp->getOpcode(), VecRedTy, MulType,
1569 TTI::CastContextHint::None, CostKind);
1570
1571 CostBeforeReduction = ExtCost * 2 + MulCost + Ext2Cost;
1572 CostAfterReduction = TTI.getMulAccReductionCost(
1573 IsUnsigned, ReductionOpc, II.getType(), ExtType, CostKind);
1574 return;
1575 }
1576 CostAfterReduction = TTI.getArithmeticReductionCost(ReductionOpc, VecRedTy,
1577 std::nullopt, CostKind);
1578}
1579
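// Illustrative example (hypothetical types, not from a test): a binop of two
// like reductions can be rewritten as one reduction of a vector binop, e.g.
//   %r0 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a)
//   %r1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %b)
//   %r  = add i32 %r0, %r1
// may become:
//   %v = add <4 x i32> %a, %b
//   %r = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v)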
1580bool VectorCombine::foldBinopOfReductions(Instruction &I) {
1581 Instruction::BinaryOps BinOpOpc = cast<BinaryOperator>(&I)->getOpcode();
1582 Intrinsic::ID ReductionIID = getReductionForBinop(BinOpOpc);
1583 if (BinOpOpc == Instruction::Sub)
1584 ReductionIID = Intrinsic::vector_reduce_add;
1585 if (ReductionIID == Intrinsic::not_intrinsic)
1586 return false;
1587
1588 auto checkIntrinsicAndGetItsArgument = [](Value *V,
1589 Intrinsic::ID IID) -> Value * {
1590 auto *II = dyn_cast<IntrinsicInst>(V);
1591 if (!II)
1592 return nullptr;
1593 if (II->getIntrinsicID() == IID && II->hasOneUse())
1594 return II->getArgOperand(0);
1595 return nullptr;
1596 };
1597
1598 Value *V0 = checkIntrinsicAndGetItsArgument(I.getOperand(0), ReductionIID);
1599 if (!V0)
1600 return false;
1601 Value *V1 = checkIntrinsicAndGetItsArgument(I.getOperand(1), ReductionIID);
1602 if (!V1)
1603 return false;
1604
1605 auto *VTy = cast<VectorType>(V0->getType());
1606 if (V1->getType() != VTy)
1607 return false;
1608 const auto &II0 = *cast<IntrinsicInst>(I.getOperand(0));
1609 const auto &II1 = *cast<IntrinsicInst>(I.getOperand(1));
1610 unsigned ReductionOpc =
1611 getArithmeticReductionInstruction(II0.getIntrinsicID());
1612
1613 InstructionCost OldCost = 0;
1614 InstructionCost NewCost = 0;
1615 InstructionCost CostOfRedOperand0 = 0;
1616 InstructionCost CostOfRed0 = 0;
1617 InstructionCost CostOfRedOperand1 = 0;
1618 InstructionCost CostOfRed1 = 0;
1619 analyzeCostOfVecReduction(II0, CostKind, TTI, CostOfRedOperand0, CostOfRed0);
1620 analyzeCostOfVecReduction(II1, CostKind, TTI, CostOfRedOperand1, CostOfRed1);
1621 OldCost = CostOfRed0 + CostOfRed1 + TTI.getInstructionCost(&I, CostKind);
1622 NewCost =
1623 CostOfRedOperand0 + CostOfRedOperand1 +
1624 TTI.getArithmeticInstrCost(BinOpOpc, VTy, CostKind) +
1625 TTI.getArithmeticReductionCost(ReductionOpc, VTy, std::nullopt, CostKind);
1626 if (NewCost >= OldCost || !NewCost.isValid())
1627 return false;
1628
1629 LLVM_DEBUG(dbgs() << "Found two mergeable reductions: " << I
1630 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
1631 << "\n");
1632 Value *VectorBO;
1633 if (BinOpOpc == Instruction::Or)
1634 VectorBO = Builder.CreateOr(V0, V1, "",
1635 cast<PossiblyDisjointInst>(I).isDisjoint());
1636 else
1637 VectorBO = Builder.CreateBinOp(BinOpOpc, V0, V1);
1638
1639 Instruction *Rdx = Builder.CreateIntrinsic(ReductionIID, {VTy}, {VectorBO});
1640 replaceValue(I, *Rdx);
1641 return true;
1642}
1643
1644// Check if memory loc modified between two instrs in the same BB
1645 static bool isMemModifiedBetween(BasicBlock::iterator Begin,
1646 BasicBlock::iterator End,
1647 const MemoryLocation &Loc, AAResults &AA) {
1648 unsigned NumScanned = 0;
1649 return std::any_of(Begin, End, [&](const Instruction &Instr) {
1650 return isModSet(AA.getModRefInfo(&Instr, Loc)) ||
1651 ++NumScanned > MaxInstrsToScan;
1652 });
1653}
1654
1655namespace {
1656/// Helper class to indicate whether a vector index can be safely scalarized and
1657/// if a freeze needs to be inserted.
1658class ScalarizationResult {
1659 enum class StatusTy { Unsafe, Safe, SafeWithFreeze };
1660
1661 StatusTy Status;
1662 Value *ToFreeze;
1663
1664 ScalarizationResult(StatusTy Status, Value *ToFreeze = nullptr)
1665 : Status(Status), ToFreeze(ToFreeze) {}
1666
1667public:
1668 ScalarizationResult(const ScalarizationResult &Other) = default;
1669 ~ScalarizationResult() {
1670 assert(!ToFreeze && "freeze() not called with ToFreeze being set");
1671 }
1672
1673 static ScalarizationResult unsafe() { return {StatusTy::Unsafe}; }
1674 static ScalarizationResult safe() { return {StatusTy::Safe}; }
1675 static ScalarizationResult safeWithFreeze(Value *ToFreeze) {
1676 return {StatusTy::SafeWithFreeze, ToFreeze};
1677 }
1678
1679 /// Returns true if the index can be scalarized without requiring a freeze.
1680 bool isSafe() const { return Status == StatusTy::Safe; }
1681 /// Returns true if the index cannot be scalarized.
1682 bool isUnsafe() const { return Status == StatusTy::Unsafe; }
1683 /// Returns true if the index can be scalarized, but requires inserting a
1684 /// freeze.
1685 bool isSafeWithFreeze() const { return Status == StatusTy::SafeWithFreeze; }
1686
1687 /// Reset the state to Unsafe and clear ToFreeze if set.
1688 void discard() {
1689 ToFreeze = nullptr;
1690 Status = StatusTy::Unsafe;
1691 }
1692
1693 /// Freeze \p ToFreeze and update the use in \p UserI to use it.
1694 void freeze(IRBuilderBase &Builder, Instruction &UserI) {
1695 assert(isSafeWithFreeze() &&
1696 "should only be used when freezing is required");
1697 assert(is_contained(ToFreeze->users(), &UserI) &&
1698 "UserI must be a user of ToFreeze");
1699 IRBuilder<>::InsertPointGuard Guard(Builder);
1700 Builder.SetInsertPoint(cast<Instruction>(&UserI));
1701 Value *Frozen =
1702 Builder.CreateFreeze(ToFreeze, ToFreeze->getName() + ".frozen");
1703 for (Use &U : make_early_inc_range((UserI.operands())))
1704 if (U.get() == ToFreeze)
1705 U.set(Frozen);
1706
1707 ToFreeze = nullptr;
1708 }
1709};
1710} // namespace
1711
1712/// Check if it is legal to scalarize a memory access to \p VecTy at index \p
1713/// Idx. \p Idx must access a valid vector element.
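// For example (illustrative): indexing a <4 x i32> vector with
//   %idx = and i64 %i, 3
// is always in bounds, but %idx may be poison if %i is, so scalarization is
// only safe after freezing %i (the SafeWithFreeze case below).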
1714static ScalarizationResult canScalarizeAccess(VectorType *VecTy, Value *Idx,
1715 Instruction *CtxI,
1716 AssumptionCache &AC,
1717 const DominatorTree &DT) {
1718 // We do checks for both fixed vector types and scalable vector types.
1719 // This is the number of elements of fixed vector types,
1720 // or the minimum number of elements of scalable vector types.
1721 uint64_t NumElements = VecTy->getElementCount().getKnownMinValue();
1722 unsigned IntWidth = Idx->getType()->getScalarSizeInBits();
1723
1724 if (auto *C = dyn_cast<ConstantInt>(Idx)) {
1725 if (C->getValue().ult(NumElements))
1726 return ScalarizationResult::safe();
1727 return ScalarizationResult::unsafe();
1728 }
1729
1730 // Always unsafe if the index type can't handle all inbound values.
1731 if (!llvm::isUIntN(IntWidth, NumElements))
1732 return ScalarizationResult::unsafe();
1733
1734 APInt Zero(IntWidth, 0);
1735 APInt MaxElts(IntWidth, NumElements);
1736 ConstantRange ValidIndices(Zero, MaxElts);
1737 ConstantRange IdxRange(IntWidth, true);
1738
1739 if (isGuaranteedNotToBePoison(Idx, &AC)) {
1740 if (ValidIndices.contains(computeConstantRange(Idx, /* ForSigned */ false,
1741 true, &AC, CtxI, &DT)))
1742 return ScalarizationResult::safe();
1743 return ScalarizationResult::unsafe();
1744 }
1745
1746 // If the index may be poison, check if we can insert a freeze before the
1747 // range of the index is restricted.
1748 Value *IdxBase;
1749 ConstantInt *CI;
1750 if (match(Idx, m_And(m_Value(IdxBase), m_ConstantInt(CI)))) {
1751 IdxRange = IdxRange.binaryAnd(CI->getValue());
1752 } else if (match(Idx, m_URem(m_Value(IdxBase), m_ConstantInt(CI)))) {
1753 IdxRange = IdxRange.urem(CI->getValue());
1754 }
1755
1756 if (ValidIndices.contains(IdxRange))
1757 return ScalarizationResult::safeWithFreeze(IdxBase);
1758 return ScalarizationResult::unsafe();
1759}
1760
1761/// The memory operation on a vector of \p ScalarType had alignment of
1762/// \p VectorAlignment. Compute the maximal, but conservatively correct,
1763/// alignment that will be valid for the memory operation on a single scalar
1764/// element of the same type with index \p Idx.
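// For example (illustrative): loading element 1 of a <4 x i32> with the vector
// aligned to 16 touches byte offset 4, so commonAlignment(16, 1 * 4) == 4; for
// an unknown index only commonAlignment(16, 4) == 4 may be assumed.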
1765 static Align computeAlignmentAfterScalarization(Align VectorAlignment,
1766 Type *ScalarType, Value *Idx,
1767 const DataLayout &DL) {
1768 if (auto *C = dyn_cast<ConstantInt>(Idx))
1769 return commonAlignment(VectorAlignment,
1770 C->getZExtValue() * DL.getTypeStoreSize(ScalarType));
1771 return commonAlignment(VectorAlignment, DL.getTypeStoreSize(ScalarType));
1772}
1773
1774// Combine patterns like:
1775// %0 = load <4 x i32>, <4 x i32>* %a
1776// %1 = insertelement <4 x i32> %0, i32 %b, i32 1
1777// store <4 x i32> %1, <4 x i32>* %a
1778// to:
1779// %0 = bitcast <4 x i32>* %a to i32*
1780// %1 = getelementptr inbounds i32, i32* %0, i64 0, i64 1
1781// store i32 %b, i32* %1
1782bool VectorCombine::foldSingleElementStore(Instruction &I) {
1784 return false;
1785 auto *SI = cast<StoreInst>(&I);
1786 if (!SI->isSimple() || !isa<VectorType>(SI->getValueOperand()->getType()))
1787 return false;
1788
1789 // TODO: Combine more complicated patterns (multiple insert) by referencing
1790 // TargetTransformInfo.
1791 Instruction *Source;
1792 Value *NewElement;
1793 Value *Idx;
1794 if (!match(SI->getValueOperand(),
1795 m_InsertElt(m_Instruction(Source), m_Value(NewElement),
1796 m_Value(Idx))))
1797 return false;
1798
1799 if (auto *Load = dyn_cast<LoadInst>(Source)) {
1800 auto VecTy = cast<VectorType>(SI->getValueOperand()->getType());
1801 Value *SrcAddr = Load->getPointerOperand()->stripPointerCasts();
1802 // Don't optimize for atomic/volatile load or store. Ensure memory is not
1803 // modified in between, the vector type matches the store size, and the index is in bounds.
1804 if (!Load->isSimple() || Load->getParent() != SI->getParent() ||
1805 !DL->typeSizeEqualsStoreSize(Load->getType()->getScalarType()) ||
1806 SrcAddr != SI->getPointerOperand()->stripPointerCasts())
1807 return false;
1808
1809 auto ScalarizableIdx = canScalarizeAccess(VecTy, Idx, Load, AC, DT);
1810 if (ScalarizableIdx.isUnsafe() ||
1811 isMemModifiedBetween(Load->getIterator(), SI->getIterator(),
1812 MemoryLocation::get(SI), AA))
1813 return false;
1814
1815 // Ensure we add the load back to the worklist BEFORE its users so they can
1816 // be erased in the correct order.
1817 Worklist.push(Load);
1818
1819 if (ScalarizableIdx.isSafeWithFreeze())
1820 ScalarizableIdx.freeze(Builder, *cast<Instruction>(Idx));
1821 Value *GEP = Builder.CreateInBoundsGEP(
1822 SI->getValueOperand()->getType(), SI->getPointerOperand(),
1823 {ConstantInt::get(Idx->getType(), 0), Idx});
1824 StoreInst *NSI = Builder.CreateStore(NewElement, GEP);
1825 NSI->copyMetadata(*SI);
1826 Align ScalarOpAlignment = computeAlignmentAfterScalarization(
1827 std::max(SI->getAlign(), Load->getAlign()), NewElement->getType(), Idx,
1828 *DL);
1829 NSI->setAlignment(ScalarOpAlignment);
1830 replaceValue(I, *NSI);
1831 eraseInstruction(I);
1832 return true;
1833 }
1834
1835 return false;
1836}
1837
1838/// Try to scalarize vector loads feeding extractelement instructions.
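// Illustrative example (hypothetical types, not from a test):
//   %v = load <4 x i32>, ptr %p, align 16
//   %e = extractelement <4 x i32> %v, i64 1
// may become (if %v has no other users and the cost model agrees):
//   %gep = getelementptr inbounds <4 x i32>, ptr %p, i32 0, i64 1
//   %e   = load i32, ptr %gep, align 4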
1839bool VectorCombine::scalarizeLoadExtract(Instruction &I) {
1841 return false;
1842
1843 Value *Ptr;
1844 if (!match(&I, m_Load(m_Value(Ptr))))
1845 return false;
1846
1847 auto *LI = cast<LoadInst>(&I);
1848 auto *VecTy = cast<VectorType>(LI->getType());
1849 if (LI->isVolatile() || !DL->typeSizeEqualsStoreSize(VecTy->getScalarType()))
1850 return false;
1851
1852 InstructionCost OriginalCost =
1853 TTI.getMemoryOpCost(Instruction::Load, VecTy, LI->getAlign(),
1854 LI->getPointerAddressSpace(), CostKind);
1855 InstructionCost ScalarizedCost = 0;
1856
1857 Instruction *LastCheckedInst = LI;
1858 unsigned NumInstChecked = 0;
1859 DenseMap<ExtractElementInst *, ScalarizationResult> NeedFreeze;
1860 auto FailureGuard = make_scope_exit([&]() {
1861 // If the transform is aborted, discard the ScalarizationResults.
1862 for (auto &Pair : NeedFreeze)
1863 Pair.second.discard();
1864 });
1865
1866 // Check if all users of the load are extracts with no memory modifications
1867 // between the load and the extract. Compute the cost of both the original
1868 // code and the scalarized version.
1869 for (User *U : LI->users()) {
1870 auto *UI = dyn_cast<ExtractElementInst>(U);
1871 if (!UI || UI->getParent() != LI->getParent())
1872 return false;
1873
1874 // If any extract is waiting to be erased, then bail out as this will
1875 // distort the cost calculation and possibly lead to infinite loops.
1876 if (UI->use_empty())
1877 return false;
1878
1879 // Check if any instruction between the load and the extract may modify
1880 // memory.
1881 if (LastCheckedInst->comesBefore(UI)) {
1882 for (Instruction &I :
1883 make_range(std::next(LI->getIterator()), UI->getIterator())) {
1884 // Bail out if we reached the check limit or the instruction may write
1885 // to memory.
1886 if (NumInstChecked == MaxInstrsToScan || I.mayWriteToMemory())
1887 return false;
1888 NumInstChecked++;
1889 }
1890 LastCheckedInst = UI;
1891 }
1892
1893 auto ScalarIdx =
1894 canScalarizeAccess(VecTy, UI->getIndexOperand(), LI, AC, DT);
1895 if (ScalarIdx.isUnsafe())
1896 return false;
1897 if (ScalarIdx.isSafeWithFreeze()) {
1898 NeedFreeze.try_emplace(UI, ScalarIdx);
1899 ScalarIdx.discard();
1900 }
1901
1902 auto *Index = dyn_cast<ConstantInt>(UI->getIndexOperand());
1903 OriginalCost +=
1904 TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind,
1905 Index ? Index->getZExtValue() : -1);
1906 ScalarizedCost +=
1907 TTI.getMemoryOpCost(Instruction::Load, VecTy->getElementType(),
1908 Align(1), LI->getPointerAddressSpace(), CostKind);
1909 ScalarizedCost += TTI.getAddressComputationCost(LI->getPointerOperandType(),
1910 nullptr, nullptr, CostKind);
1911 }
1912
1913 LLVM_DEBUG(dbgs() << "Found all extractions of a vector load: " << I
1914 << "\n LoadExtractCost: " << OriginalCost
1915 << " vs ScalarizedCost: " << ScalarizedCost << "\n");
1916
1917 if (ScalarizedCost >= OriginalCost)
1918 return false;
1919
1920 // Ensure we add the load back to the worklist BEFORE its users so they can
1921 // be erased in the correct order.
1922 Worklist.push(LI);
1923
1924 Type *ElemType = VecTy->getElementType();
1925
1926 // Replace extracts with narrow scalar loads.
1927 for (User *U : LI->users()) {
1928 auto *EI = cast<ExtractElementInst>(U);
1929 Value *Idx = EI->getIndexOperand();
1930
1931 // Insert 'freeze' for poison indexes.
1932 auto It = NeedFreeze.find(EI);
1933 if (It != NeedFreeze.end())
1934 It->second.freeze(Builder, *cast<Instruction>(Idx));
1935
1936 Builder.SetInsertPoint(EI);
1937 Value *GEP =
1938 Builder.CreateInBoundsGEP(VecTy, Ptr, {Builder.getInt32(0), Idx});
1939 auto *NewLoad = cast<LoadInst>(
1940 Builder.CreateLoad(ElemType, GEP, EI->getName() + ".scalar"));
1941
1942 Align ScalarOpAlignment =
1943 computeAlignmentAfterScalarization(LI->getAlign(), ElemType, Idx, *DL);
1944 NewLoad->setAlignment(ScalarOpAlignment);
1945
1946 if (auto *ConstIdx = dyn_cast<ConstantInt>(Idx)) {
1947 size_t Offset = ConstIdx->getZExtValue() * DL->getTypeStoreSize(ElemType);
1948 AAMDNodes OldAAMD = LI->getAAMetadata();
1949 NewLoad->setAAMetadata(OldAAMD.adjustForAccess(Offset, ElemType, *DL));
1950 }
1951
1952 replaceValue(*EI, *NewLoad, false);
1953 }
1954
1955 FailureGuard.release();
1956 return true;
1957}
1958
1959bool VectorCombine::scalarizeExtExtract(Instruction &I) {
1961 return false;
1962 auto *Ext = dyn_cast<ZExtInst>(&I);
1963 if (!Ext)
1964 return false;
1965
1966 // Try to convert a vector zext feeding only extracts to a set of scalar
1967 // (Src >> (ExtIdx * EltBitWidth)) & ((1 << EltBitWidth) - 1)
1968 // operations, if profitable.
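// Illustrative example (hypothetical types, not from a test):
//   %z  = zext <4 x i8> %src to <4 x i32>   ; only used by extractelements
//   %e1 = extractelement <4 x i32> %z, i64 1
// may become:
//   %s  = bitcast <4 x i8> %src to i32      ; %src frozen first if it may be poison
//   %sh = lshr i32 %s, 8
//   %e1 = and i32 %sh, 255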
1969 auto *SrcTy = dyn_cast<FixedVectorType>(Ext->getOperand(0)->getType());
1970 if (!SrcTy)
1971 return false;
1972 auto *DstTy = cast<FixedVectorType>(Ext->getType());
1973
1974 Type *ScalarDstTy = DstTy->getElementType();
1975 if (DL->getTypeSizeInBits(SrcTy) != DL->getTypeSizeInBits(ScalarDstTy))
1976 return false;
1977
1978 InstructionCost VectorCost =
1979 TTI.getCastInstrCost(Instruction::ZExt, DstTy, SrcTy,
1980 TTI::CastContextHint::None, CostKind);
1981 unsigned ExtCnt = 0;
1982 bool ExtLane0 = false;
1983 for (User *U : Ext->users()) {
1984 uint64_t Idx;
1985 if (!match(U, m_ExtractElt(m_Value(), m_ConstantInt(Idx))))
1986 return false;
1987 if (cast<Instruction>(U)->use_empty())
1988 continue;
1989 ExtCnt += 1;
1990 ExtLane0 |= !Idx;
1991 VectorCost += TTI.getVectorInstrCost(Instruction::ExtractElement, DstTy,
1992 CostKind, Idx, U);
1993 }
1994
1995 InstructionCost ScalarCost =
1996 ExtCnt * TTI.getArithmeticInstrCost(
1997 Instruction::And, ScalarDstTy, CostKind,
2000 (ExtCnt - ExtLane0) *
2002 Instruction::LShr, ScalarDstTy, CostKind,
2005 if (ScalarCost > VectorCost)
2006 return false;
2007
2008 Value *ScalarV = Ext->getOperand(0);
2009 if (!isGuaranteedNotToBePoison(ScalarV, &AC, dyn_cast<Instruction>(ScalarV),
2010 &DT))
2011 ScalarV = Builder.CreateFreeze(ScalarV);
2012 ScalarV = Builder.CreateBitCast(
2013 ScalarV,
2014 IntegerType::get(SrcTy->getContext(), DL->getTypeSizeInBits(SrcTy)));
2015 uint64_t SrcEltSizeInBits = DL->getTypeSizeInBits(SrcTy->getElementType());
2016 uint64_t EltBitMask = (1ull << SrcEltSizeInBits) - 1;
2017 for (User *U : Ext->users()) {
2018 auto *Extract = cast<ExtractElementInst>(U);
2019 uint64_t Idx =
2020 cast<ConstantInt>(Extract->getIndexOperand())->getZExtValue();
2021 Value *LShr = Builder.CreateLShr(ScalarV, Idx * SrcEltSizeInBits);
2022 Value *And = Builder.CreateAnd(LShr, EltBitMask);
2023 U->replaceAllUsesWith(And);
2024 }
2025 return true;
2026}
2027
2028/// Try to fold "(or (zext (bitcast X)), (shl (zext (bitcast Y)), C))"
2029/// to "(bitcast (concat X, Y))"
2030/// where X/Y are bitcasted from i1 mask vectors.
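// Illustrative example (hypothetical types, not from a test):
//   %xi = bitcast <4 x i1> %x to i4
//   %yi = bitcast <4 x i1> %y to i4
//   %xz = zext i4 %xi to i8
//   %yz = zext i4 %yi to i8
//   %ys = shl i8 %yz, 4
//   %r  = or disjoint i8 %xz, %ys
// may become:
//   %c = shufflevector <4 x i1> %x, <4 x i1> %y,
//                      <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
//   %r = bitcast <8 x i1> %c to i8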
2031bool VectorCombine::foldConcatOfBoolMasks(Instruction &I) {
2032 Type *Ty = I.getType();
2033 if (!Ty->isIntegerTy())
2034 return false;
2035
2036 // TODO: Add big endian test coverage
2037 if (DL->isBigEndian())
2038 return false;
2039
2040 // Restrict to disjoint cases so the mask vectors aren't overlapping.
2041 Instruction *X, *Y;
2043 return false;
2044
2045 // Allow both sources to contain shl, to handle more generic pattern:
2046 // "(or (shl (zext (bitcast X)), C1), (shl (zext (bitcast Y)), C2))"
2047 Value *SrcX;
2048 uint64_t ShAmtX = 0;
2049 if (!match(X, m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcX)))))) &&
2050 !match(X, m_OneUse(
2051 m_Shl(m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcX))))),
2052 m_ConstantInt(ShAmtX)))))
2053 return false;
2054
2055 Value *SrcY;
2056 uint64_t ShAmtY = 0;
2057 if (!match(Y, m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcY)))))) &&
2058 !match(Y, m_OneUse(
2059 m_Shl(m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcY))))),
2060 m_ConstantInt(ShAmtY)))))
2061 return false;
2062
2063 // Canonicalize larger shift to the RHS.
2064 if (ShAmtX > ShAmtY) {
2065 std::swap(X, Y);
2066 std::swap(SrcX, SrcY);
2067 std::swap(ShAmtX, ShAmtY);
2068 }
2069
2070 // Ensure both sources are matching vXi1 bool mask types, and that the shift
2071 // difference is the mask width so they can be easily concatenated together.
2072 uint64_t ShAmtDiff = ShAmtY - ShAmtX;
2073 unsigned NumSHL = (ShAmtX > 0) + (ShAmtY > 0);
2074 unsigned BitWidth = Ty->getPrimitiveSizeInBits();
2075 auto *MaskTy = dyn_cast<FixedVectorType>(SrcX->getType());
2076 if (!MaskTy || SrcX->getType() != SrcY->getType() ||
2077 !MaskTy->getElementType()->isIntegerTy(1) ||
2078 MaskTy->getNumElements() != ShAmtDiff ||
2079 MaskTy->getNumElements() > (BitWidth / 2))
2080 return false;
2081
2082 auto *ConcatTy = FixedVectorType::getDoubleElementsVectorType(MaskTy);
2083 auto *ConcatIntTy =
2084 Type::getIntNTy(Ty->getContext(), ConcatTy->getNumElements());
2085 auto *MaskIntTy = Type::getIntNTy(Ty->getContext(), ShAmtDiff);
2086
2087 SmallVector<int, 32> ConcatMask(ConcatTy->getNumElements());
2088 std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
2089
2090 // TODO: Is it worth supporting multi use cases?
2091 InstructionCost OldCost = 0;
2092 OldCost += TTI.getArithmeticInstrCost(Instruction::Or, Ty, CostKind);
2093 OldCost +=
2094 NumSHL * TTI.getArithmeticInstrCost(Instruction::Shl, Ty, CostKind);
2095 OldCost += 2 * TTI.getCastInstrCost(Instruction::ZExt, Ty, MaskIntTy,
2096 TTI::CastContextHint::None, CostKind);
2097 OldCost += 2 * TTI.getCastInstrCost(Instruction::BitCast, MaskIntTy, MaskTy,
2098 TTI::CastContextHint::None, CostKind);
2099
2100 InstructionCost NewCost = 0;
2101 NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ConcatTy,
2102 MaskTy, ConcatMask, CostKind);
2103 NewCost += TTI.getCastInstrCost(Instruction::BitCast, ConcatIntTy, ConcatTy,
2104 TTI::CastContextHint::None, CostKind);
2105 if (Ty != ConcatIntTy)
2106 NewCost += TTI.getCastInstrCost(Instruction::ZExt, Ty, ConcatIntTy,
2107 TTI::CastContextHint::None, CostKind);
2108 if (ShAmtX > 0)
2109 NewCost += TTI.getArithmeticInstrCost(Instruction::Shl, Ty, CostKind);
2110
2111 LLVM_DEBUG(dbgs() << "Found a concatenation of bitcasted bool masks: " << I
2112 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2113 << "\n");
2114
2115 if (NewCost > OldCost)
2116 return false;
2117
2118 // Build bool mask concatenation, bitcast back to scalar integer, and perform
2119 // any residual zero-extension or shifting.
2120 Value *Concat = Builder.CreateShuffleVector(SrcX, SrcY, ConcatMask);
2121 Worklist.pushValue(Concat);
2122
2123 Value *Result = Builder.CreateBitCast(Concat, ConcatIntTy);
2124
2125 if (Ty != ConcatIntTy) {
2126 Worklist.pushValue(Result);
2127 Result = Builder.CreateZExt(Result, Ty);
2128 }
2129
2130 if (ShAmtX > 0) {
2131 Worklist.pushValue(Result);
2132 Result = Builder.CreateShl(Result, ShAmtX);
2133 }
2134
2135 replaceValue(I, *Result);
2136 return true;
2137}
2138
2139/// Try to convert "shuffle (binop (shuffle, shuffle)), undef"
2140/// --> "binop (shuffle), (shuffle)".
2141bool VectorCombine::foldPermuteOfBinops(Instruction &I) {
2142 BinaryOperator *BinOp;
2143 ArrayRef<int> OuterMask;
2144 if (!match(&I,
2145 m_Shuffle(m_OneUse(m_BinOp(BinOp)), m_Undef(), m_Mask(OuterMask))))
2146 return false;
2147
2148 // Don't introduce poison into div/rem.
2149 if (BinOp->isIntDivRem() && llvm::is_contained(OuterMask, PoisonMaskElem))
2150 return false;
2151
2152 Value *Op00, *Op01, *Op10, *Op11;
2153 ArrayRef<int> Mask0, Mask1;
2154 bool Match0 =
2155 match(BinOp->getOperand(0),
2156 m_OneUse(m_Shuffle(m_Value(Op00), m_Value(Op01), m_Mask(Mask0))));
2157 bool Match1 =
2158 match(BinOp->getOperand(1),
2159 m_OneUse(m_Shuffle(m_Value(Op10), m_Value(Op11), m_Mask(Mask1))));
2160 if (!Match0 && !Match1)
2161 return false;
2162
2163 Op00 = Match0 ? Op00 : BinOp->getOperand(0);
2164 Op01 = Match0 ? Op01 : BinOp->getOperand(0);
2165 Op10 = Match1 ? Op10 : BinOp->getOperand(1);
2166 Op11 = Match1 ? Op11 : BinOp->getOperand(1);
2167
2168 Instruction::BinaryOps Opcode = BinOp->getOpcode();
2169 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2170 auto *BinOpTy = dyn_cast<FixedVectorType>(BinOp->getType());
2171 auto *Op0Ty = dyn_cast<FixedVectorType>(Op00->getType());
2172 auto *Op1Ty = dyn_cast<FixedVectorType>(Op10->getType());
2173 if (!ShuffleDstTy || !BinOpTy || !Op0Ty || !Op1Ty)
2174 return false;
2175
2176 unsigned NumSrcElts = BinOpTy->getNumElements();
2177
2178 // Don't accept shuffles that reference the second operand in
2179 // div/rem or if it's an undef arg.
2180 if ((BinOp->isIntDivRem() || !isa<PoisonValue>(I.getOperand(1))) &&
2181 any_of(OuterMask, [NumSrcElts](int M) { return M >= (int)NumSrcElts; }))
2182 return false;
2183
2184 // Merge outer / inner (or identity if no match) shuffles.
2185 SmallVector<int> NewMask0, NewMask1;
2186 for (int M : OuterMask) {
2187 if (M < 0 || M >= (int)NumSrcElts) {
2188 NewMask0.push_back(PoisonMaskElem);
2189 NewMask1.push_back(PoisonMaskElem);
2190 } else {
2191 NewMask0.push_back(Match0 ? Mask0[M] : M);
2192 NewMask1.push_back(Match1 ? Mask1[M] : M);
2193 }
2194 }
2195
2196 unsigned NumOpElts = Op0Ty->getNumElements();
2197 bool IsIdentity0 = ShuffleDstTy == Op0Ty &&
2198 all_of(NewMask0, [NumOpElts](int M) { return M < (int)NumOpElts; }) &&
2199 ShuffleVectorInst::isIdentityMask(NewMask0, NumOpElts);
2200 bool IsIdentity1 = ShuffleDstTy == Op1Ty &&
2201 all_of(NewMask1, [NumOpElts](int M) { return M < (int)NumOpElts; }) &&
2202 ShuffleVectorInst::isIdentityMask(NewMask1, NumOpElts);
2203
2204 // Try to merge shuffles across the binop if the new shuffles are not costly.
2205 InstructionCost OldCost =
2206 TTI.getArithmeticInstrCost(Opcode, BinOpTy, CostKind) +
2208 BinOpTy, OuterMask, CostKind, 0, nullptr, {BinOp}, &I);
2209 if (Match0)
2210 OldCost += TTI.getShuffleCost(
2211 TargetTransformInfo::SK_PermuteTwoSrc, BinOpTy, Op0Ty, Mask0, CostKind,
2212 0, nullptr, {Op00, Op01}, cast<Instruction>(BinOp->getOperand(0)));
2213 if (Match1)
2214 OldCost += TTI.getShuffleCost(
2215 TargetTransformInfo::SK_PermuteTwoSrc, BinOpTy, Op1Ty, Mask1, CostKind,
2216 0, nullptr, {Op10, Op11}, cast<Instruction>(BinOp->getOperand(1)));
2217
2218 InstructionCost NewCost =
2219 TTI.getArithmeticInstrCost(Opcode, ShuffleDstTy, CostKind);
2220
2221 if (!IsIdentity0)
2222 NewCost +=
2224 Op0Ty, NewMask0, CostKind, 0, nullptr, {Op00, Op01});
2225 if (!IsIdentity1)
2226 NewCost +=
2228 Op1Ty, NewMask1, CostKind, 0, nullptr, {Op10, Op11});
2229
2230 LLVM_DEBUG(dbgs() << "Found a shuffle feeding a shuffled binop: " << I
2231 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2232 << "\n");
2233
2234 // If costs are equal, still fold as we reduce instruction count.
2235 if (NewCost > OldCost)
2236 return false;
2237
2238 Value *LHS =
2239 IsIdentity0 ? Op00 : Builder.CreateShuffleVector(Op00, Op01, NewMask0);
2240 Value *RHS =
2241 IsIdentity1 ? Op10 : Builder.CreateShuffleVector(Op10, Op11, NewMask1);
2242 Value *NewBO = Builder.CreateBinOp(Opcode, LHS, RHS);
2243
2244 // Intersect flags from the old binops.
2245 if (auto *NewInst = dyn_cast<Instruction>(NewBO))
2246 NewInst->copyIRFlags(BinOp);
2247
2248 Worklist.pushValue(LHS);
2249 Worklist.pushValue(RHS);
2250 replaceValue(I, *NewBO);
2251 return true;
2252}
2253
2254/// Try to convert "shuffle (binop), (binop)" into "binop (shuffle), (shuffle)".
2255/// Try to convert "shuffle (cmpop), (cmpop)" into "cmpop (shuffle), (shuffle)".
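// Illustrative example (hypothetical types, not from a test):
//   %a = add <4 x i32> %x, %y
//   %b = add <4 x i32> %z, %w
//   %r = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
// may become:
//   %s0 = shufflevector <4 x i32> %x, <4 x i32> %z, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
//   %s1 = shufflevector <4 x i32> %y, <4 x i32> %w, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
//   %r  = add <4 x i32> %s0, %s1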
2256bool VectorCombine::foldShuffleOfBinops(Instruction &I) {
2257 ArrayRef<int> OldMask;
2258 Instruction *LHS, *RHS;
2259 if (!match(&I, m_Shuffle(m_OneUse(m_Instruction(LHS)),
2260 m_OneUse(m_Instruction(RHS)), m_Mask(OldMask))))
2261 return false;
2262
2263 // TODO: Add support for addlike etc.
2264 if (LHS->getOpcode() != RHS->getOpcode())
2265 return false;
2266
2267 Value *X, *Y, *Z, *W;
2268 bool IsCommutative = false;
2269 CmpPredicate PredLHS = CmpInst::BAD_ICMP_PREDICATE;
2270 CmpPredicate PredRHS = CmpInst::BAD_ICMP_PREDICATE;
2271 if (match(LHS, m_BinOp(m_Value(X), m_Value(Y))) &&
2272 match(RHS, m_BinOp(m_Value(Z), m_Value(W)))) {
2273 auto *BO = cast<BinaryOperator>(LHS);
2274 // Don't introduce poison into div/rem.
2275 if (llvm::is_contained(OldMask, PoisonMaskElem) && BO->isIntDivRem())
2276 return false;
2277 IsCommutative = BinaryOperator::isCommutative(BO->getOpcode());
2278 } else if (match(LHS, m_Cmp(PredLHS, m_Value(X), m_Value(Y))) &&
2279 match(RHS, m_Cmp(PredRHS, m_Value(Z), m_Value(W))) &&
2280 (CmpInst::Predicate)PredLHS == (CmpInst::Predicate)PredRHS) {
2281 IsCommutative = cast<CmpInst>(LHS)->isCommutative();
2282 } else
2283 return false;
2284
2285 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2286 auto *BinResTy = dyn_cast<FixedVectorType>(LHS->getType());
2287 auto *BinOpTy = dyn_cast<FixedVectorType>(X->getType());
2288 if (!ShuffleDstTy || !BinResTy || !BinOpTy || X->getType() != Z->getType())
2289 return false;
2290
2291 unsigned NumSrcElts = BinOpTy->getNumElements();
2292
2293 // If we have something like "add X, Y" and "add Z, X", swap ops to match.
2294 if (IsCommutative && X != Z && Y != W && (X == W || Y == Z))
2295 std::swap(X, Y);
2296
2297 auto ConvertToUnary = [NumSrcElts](int &M) {
2298 if (M >= (int)NumSrcElts)
2299 M -= NumSrcElts;
2300 };
2301
2302 SmallVector<int> NewMask0(OldMask);
2303 TargetTransformInfo::ShuffleKind SK0 = TargetTransformInfo::SK_PermuteTwoSrc;
2304 if (X == Z) {
2305 llvm::for_each(NewMask0, ConvertToUnary);
2306 SK0 = TargetTransformInfo::SK_PermuteSingleSrc;
2307 Z = PoisonValue::get(BinOpTy);
2308 }
2309
2310 SmallVector<int> NewMask1(OldMask);
2311 TargetTransformInfo::ShuffleKind SK1 = TargetTransformInfo::SK_PermuteTwoSrc;
2312 if (Y == W) {
2313 llvm::for_each(NewMask1, ConvertToUnary);
2314 SK1 = TargetTransformInfo::SK_PermuteSingleSrc;
2315 W = PoisonValue::get(BinOpTy);
2316 }
2317
2318 // Try to replace a binop with a shuffle if the shuffle is not costly.
2319 InstructionCost OldCost =
2323 BinResTy, OldMask, CostKind, 0, nullptr, {LHS, RHS},
2324 &I);
2325
2326 // Handle shuffle(binop(shuffle(x),y),binop(z,shuffle(w))) style patterns
2327 // where one use shuffles have gotten split across the binop/cmp. These
2328 // often allow a major reduction in total cost that wouldn't happen as
2329 // individual folds.
2330 auto MergeInner = [&](Value *&Op, int Offset, MutableArrayRef<int> Mask,
2331 TTI::TargetCostKind CostKind) -> bool {
2332 Value *InnerOp;
2333 ArrayRef<int> InnerMask;
2334 if (match(Op, m_OneUse(m_Shuffle(m_Value(InnerOp), m_Undef(),
2335 m_Mask(InnerMask)))) &&
2336 InnerOp->getType() == Op->getType() &&
2337 all_of(InnerMask,
2338 [NumSrcElts](int M) { return M < (int)NumSrcElts; })) {
2339 for (int &M : Mask)
2340 if (Offset <= M && M < (int)(Offset + NumSrcElts)) {
2341 M = InnerMask[M - Offset];
2342 M = 0 <= M ? M + Offset : M;
2343 }
2345 Op = InnerOp;
2346 return true;
2347 }
2348 return false;
2349 };
2350 bool ReducedInstCount = false;
2351 ReducedInstCount |= MergeInner(X, 0, NewMask0, CostKind);
2352 ReducedInstCount |= MergeInner(Y, 0, NewMask1, CostKind);
2353 ReducedInstCount |= MergeInner(Z, NumSrcElts, NewMask0, CostKind);
2354 ReducedInstCount |= MergeInner(W, NumSrcElts, NewMask1, CostKind);
2355
2356 auto *ShuffleCmpTy =
2357 FixedVectorType::get(BinOpTy->getElementType(), ShuffleDstTy);
2358 InstructionCost NewCost =
2359 TTI.getShuffleCost(SK0, ShuffleCmpTy, BinOpTy, NewMask0, CostKind, 0,
2360 nullptr, {X, Z}) +
2361 TTI.getShuffleCost(SK1, ShuffleCmpTy, BinOpTy, NewMask1, CostKind, 0,
2362 nullptr, {Y, W});
2363
2364 if (PredLHS == CmpInst::BAD_ICMP_PREDICATE) {
2365 NewCost +=
2366 TTI.getArithmeticInstrCost(LHS->getOpcode(), ShuffleDstTy, CostKind);
2367 } else {
2368 NewCost += TTI.getCmpSelInstrCost(LHS->getOpcode(), ShuffleCmpTy,
2369 ShuffleDstTy, PredLHS, CostKind);
2370 }
2371
2372 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two binops: " << I
2373 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2374 << "\n");
2375
2376 // If either shuffle will constant fold away, then fold for the same cost as
2377 // we will reduce the instruction count.
2378 ReducedInstCount |= (isa<Constant>(X) && isa<Constant>(Z)) ||
2379 (isa<Constant>(Y) && isa<Constant>(W));
2380 if (ReducedInstCount ? (NewCost > OldCost) : (NewCost >= OldCost))
2381 return false;
2382
2383 Value *Shuf0 = Builder.CreateShuffleVector(X, Z, NewMask0);
2384 Value *Shuf1 = Builder.CreateShuffleVector(Y, W, NewMask1);
2385 Value *NewBO = PredLHS == CmpInst::BAD_ICMP_PREDICATE
2386 ? Builder.CreateBinOp(
2387 cast<BinaryOperator>(LHS)->getOpcode(), Shuf0, Shuf1)
2388 : Builder.CreateCmp(PredLHS, Shuf0, Shuf1);
2389
2390 // Intersect flags from the old binops.
2391 if (auto *NewInst = dyn_cast<Instruction>(NewBO)) {
2392 NewInst->copyIRFlags(LHS);
2393 NewInst->andIRFlags(RHS);
2394 }
2395
2396 Worklist.pushValue(Shuf0);
2397 Worklist.pushValue(Shuf1);
2398 replaceValue(I, *NewBO);
2399 return true;
2400}
2401
2402/// Try to convert,
2403/// (shuffle(select(c1,t1,f1)), (select(c2,t2,f2)), m) into
2404/// (select (shuffle c1,c2,m), (shuffle t1,t2,m), (shuffle f1,f2,m))
2405bool VectorCombine::foldShuffleOfSelects(Instruction &I) {
2406 ArrayRef<int> Mask;
2407 Value *C1, *T1, *F1, *C2, *T2, *F2;
2408 if (!match(&I, m_Shuffle(
2409 m_OneUse(m_Select(m_Value(C1), m_Value(T1), m_Value(F1))),
2410 m_OneUse(m_Select(m_Value(C2), m_Value(T2), m_Value(F2))),
2411 m_Mask(Mask))))
2412 return false;
2413
2414 auto *C1VecTy = dyn_cast<FixedVectorType>(C1->getType());
2415 auto *C2VecTy = dyn_cast<FixedVectorType>(C2->getType());
2416 if (!C1VecTy || !C2VecTy || C1VecTy != C2VecTy)
2417 return false;
2418
2419 auto *SI0FOp = dyn_cast<FPMathOperator>(I.getOperand(0));
2420 auto *SI1FOp = dyn_cast<FPMathOperator>(I.getOperand(1));
2421 // SelectInsts must have the same FMF.
2422 if (((SI0FOp == nullptr) != (SI1FOp == nullptr)) ||
2423 ((SI0FOp != nullptr) &&
2424 (SI0FOp->getFastMathFlags() != SI1FOp->getFastMathFlags())))
2425 return false;
2426
2427 auto *SrcVecTy = cast<FixedVectorType>(T1->getType());
2428 auto *DstVecTy = cast<FixedVectorType>(I.getType());
2429 auto SK = TargetTransformInfo::SK_PermuteTwoSrc;
2430 auto SelOp = Instruction::Select;
2431 InstructionCost OldCost = TTI.getCmpSelInstrCost(
2432 SelOp, SrcVecTy, C1VecTy, CmpInst::BAD_ICMP_PREDICATE, CostKind);
2433 OldCost += TTI.getCmpSelInstrCost(SelOp, SrcVecTy, C2VecTy,
2434 CmpInst::BAD_ICMP_PREDICATE, CostKind);
2435 OldCost +=
2436 TTI.getShuffleCost(SK, DstVecTy, SrcVecTy, Mask, CostKind, 0, nullptr,
2437 {I.getOperand(0), I.getOperand(1)}, &I);
2438
2438
2439 InstructionCost NewCost = TTI.getShuffleCost(
2440 SK, FixedVectorType::get(C1VecTy->getScalarType(), Mask.size()), C1VecTy,
2441 Mask, CostKind, 0, nullptr, {C1, C2});
2442 NewCost += TTI.getShuffleCost(SK, DstVecTy, SrcVecTy, Mask, CostKind, 0,
2443 nullptr, {T1, T2});
2444 NewCost += TTI.getShuffleCost(SK, DstVecTy, SrcVecTy, Mask, CostKind, 0,
2445 nullptr, {F1, F2});
2446 auto *C1C2ShuffledVecTy = cast<FixedVectorType>(
2447 toVectorTy(Type::getInt1Ty(I.getContext()), DstVecTy->getNumElements()));
2448 NewCost += TTI.getCmpSelInstrCost(SelOp, DstVecTy, C1C2ShuffledVecTy,
2449 CmpInst::BAD_ICMP_PREDICATE, CostKind);
2450
2451 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two selects: " << I
2452 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2453 << "\n");
2454 if (NewCost > OldCost)
2455 return false;
2456
2457 Value *ShuffleCmp = Builder.CreateShuffleVector(C1, C2, Mask);
2458 Value *ShuffleTrue = Builder.CreateShuffleVector(T1, T2, Mask);
2459 Value *ShuffleFalse = Builder.CreateShuffleVector(F1, F2, Mask);
2460 Value *NewSel;
2461 // We presuppose that the SelectInsts have the same FMF.
2462 if (SI0FOp)
2463 NewSel = Builder.CreateSelectFMF(ShuffleCmp, ShuffleTrue, ShuffleFalse,
2464 SI0FOp->getFastMathFlags());
2465 else
2466 NewSel = Builder.CreateSelect(ShuffleCmp, ShuffleTrue, ShuffleFalse);
2467
2468 Worklist.pushValue(ShuffleCmp);
2469 Worklist.pushValue(ShuffleTrue);
2470 Worklist.pushValue(ShuffleFalse);
2471 replaceValue(I, *NewSel);
2472 return true;
2473}
2474
2475/// Try to convert "shuffle (castop), (castop)" with a shared castop operand
2476/// into "castop (shuffle)".
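// Illustrative example (hypothetical types, not from a test):
//   %c0 = sext <2 x i16> %x to <2 x i32>
//   %c1 = sext <2 x i16> %y to <2 x i32>
//   %r  = shufflevector <2 x i32> %c0, <2 x i32> %c1, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
// may become:
//   %s = shufflevector <2 x i16> %x, <2 x i16> %y, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
//   %r = sext <4 x i16> %s to <4 x i32>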
2477bool VectorCombine::foldShuffleOfCastops(Instruction &I) {
2478 Value *V0, *V1;
2479 ArrayRef<int> OldMask;
2480 if (!match(&I, m_Shuffle(m_Value(V0), m_Value(V1), m_Mask(OldMask))))
2481 return false;
2482
2483 auto *C0 = dyn_cast<CastInst>(V0);
2484 auto *C1 = dyn_cast<CastInst>(V1);
2485 if (!C0 || !C1)
2486 return false;
2487
2488 Instruction::CastOps Opcode = C0->getOpcode();
2489 if (C0->getSrcTy() != C1->getSrcTy())
2490 return false;
2491
2492 // Handle shuffle(zext_nneg(x), sext(y)) -> sext(shuffle(x,y)) folds.
2493 if (Opcode != C1->getOpcode()) {
2494 if (match(C0, m_SExtLike(m_Value())) && match(C1, m_SExtLike(m_Value())))
2495 Opcode = Instruction::SExt;
2496 else
2497 return false;
2498 }
2499
2500 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2501 auto *CastDstTy = dyn_cast<FixedVectorType>(C0->getDestTy());
2502 auto *CastSrcTy = dyn_cast<FixedVectorType>(C0->getSrcTy());
2503 if (!ShuffleDstTy || !CastDstTy || !CastSrcTy)
2504 return false;
2505
2506 unsigned NumSrcElts = CastSrcTy->getNumElements();
2507 unsigned NumDstElts = CastDstTy->getNumElements();
2508 assert((NumDstElts == NumSrcElts || Opcode == Instruction::BitCast) &&
2509 "Only bitcasts expected to alter src/dst element counts");
2510
2512 // Check for bitcasts where the element counts do not scale by a whole number,
2513 // e.g. <32 x i40> -> <40 x i32>
2513 if (NumDstElts != NumSrcElts && (NumSrcElts % NumDstElts) != 0 &&
2514 (NumDstElts % NumSrcElts) != 0)
2515 return false;
2516
2517 SmallVector<int, 16> NewMask;
2518 if (NumSrcElts >= NumDstElts) {
2519 // The bitcast is from wide to narrow/equal elements. The shuffle mask can
2520 // always be expanded to the equivalent form choosing narrower elements.
2521 assert(NumSrcElts % NumDstElts == 0 && "Unexpected shuffle mask");
2522 unsigned ScaleFactor = NumSrcElts / NumDstElts;
2523 narrowShuffleMaskElts(ScaleFactor, OldMask, NewMask);
2524 } else {
2525 // The bitcast is from narrow elements to wide elements. The shuffle mask
2526 // must choose consecutive elements to allow casting first.
2527 assert(NumDstElts % NumSrcElts == 0 && "Unexpected shuffle mask");
2528 unsigned ScaleFactor = NumDstElts / NumSrcElts;
2529 if (!widenShuffleMaskElts(ScaleFactor, OldMask, NewMask))
2530 return false;
2531 }
2532
2533 auto *NewShuffleDstTy =
2534 FixedVectorType::get(CastSrcTy->getScalarType(), NewMask.size());
2535
2536 // Try to replace a castop with a shuffle if the shuffle is not costly.
2537 InstructionCost CostC0 =
2538 TTI.getCastInstrCost(C0->getOpcode(), CastDstTy, CastSrcTy,
2539 TTI::CastContextHint::None, CostKind);
2540 InstructionCost CostC1 =
2541 TTI.getCastInstrCost(C1->getOpcode(), CastDstTy, CastSrcTy,
2542 TTI::CastContextHint::None, CostKind);
2543 InstructionCost OldCost = CostC0 + CostC1;
2544 OldCost +=
2546 CastDstTy, OldMask, CostKind, 0, nullptr, {}, &I);
2547
2548 InstructionCost NewCost =
2550 CastSrcTy, NewMask, CostKind);
2551 NewCost += TTI.getCastInstrCost(Opcode, ShuffleDstTy, NewShuffleDstTy,
2552 TTI::CastContextHint::None, CostKind);
2553 if (!C0->hasOneUse())
2554 NewCost += CostC0;
2555 if (!C1->hasOneUse())
2556 NewCost += CostC1;
2557
2558 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two casts: " << I
2559 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2560 << "\n");
2561 if (NewCost > OldCost)
2562 return false;
2563
2564 Value *Shuf = Builder.CreateShuffleVector(C0->getOperand(0),
2565 C1->getOperand(0), NewMask);
2566 Value *Cast = Builder.CreateCast(Opcode, Shuf, ShuffleDstTy);
2567
2568 // Intersect flags from the old casts.
2569 if (auto *NewInst = dyn_cast<Instruction>(Cast)) {
2570 NewInst->copyIRFlags(C0);
2571 NewInst->andIRFlags(C1);
2572 }
2573
2574 Worklist.pushValue(Shuf);
2575 replaceValue(I, *Cast);
2576 return true;
2577}
2578
2579/// Try to convert any of:
2580/// "shuffle (shuffle x, y), (shuffle y, x)"
2581/// "shuffle (shuffle x, undef), (shuffle y, undef)"
2582/// "shuffle (shuffle x, undef), y"
2583/// "shuffle x, (shuffle y, undef)"
2584/// into "shuffle x, y".
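// Illustrative example (hypothetical types, not from a test):
//   %s0 = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
//   %s1 = shufflevector <4 x i32> %y, <4 x i32> %x, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
//   %r  = shufflevector <4 x i32> %s0, <4 x i32> %s1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
// may become:
//   %r = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 0, i32 1, i32 6, i32 7>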
2585bool VectorCombine::foldShuffleOfShuffles(Instruction &I) {
2586 ArrayRef<int> OuterMask;
2587 Value *OuterV0, *OuterV1;
2588 if (!match(&I,
2589 m_Shuffle(m_Value(OuterV0), m_Value(OuterV1), m_Mask(OuterMask))))
2590 return false;
2591
2592 ArrayRef<int> InnerMask0, InnerMask1;
2593 Value *X0, *X1, *Y0, *Y1;
2594 bool Match0 =
2595 match(OuterV0, m_Shuffle(m_Value(X0), m_Value(Y0), m_Mask(InnerMask0)));
2596 bool Match1 =
2597 match(OuterV1, m_Shuffle(m_Value(X1), m_Value(Y1), m_Mask(InnerMask1)));
2598 if (!Match0 && !Match1)
2599 return false;
2600
2601 // If the outer shuffle is a permute, then create a fake inner all-poison
2602 // shuffle. This is easier than accounting for length-changing shuffles below.
2603 SmallVector<int, 16> PoisonMask1;
2604 if (!Match1 && isa<PoisonValue>(OuterV1)) {
2605 X1 = X0;
2606 Y1 = Y0;
2607 PoisonMask1.append(InnerMask0.size(), PoisonMaskElem);
2608 InnerMask1 = PoisonMask1;
2609 Match1 = true; // fake match
2610 }
2611
2612 X0 = Match0 ? X0 : OuterV0;
2613 Y0 = Match0 ? Y0 : OuterV0;
2614 X1 = Match1 ? X1 : OuterV1;
2615 Y1 = Match1 ? Y1 : OuterV1;
2616 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2617 auto *ShuffleSrcTy = dyn_cast<FixedVectorType>(X0->getType());
2618 auto *ShuffleImmTy = dyn_cast<FixedVectorType>(OuterV0->getType());
2619 if (!ShuffleDstTy || !ShuffleSrcTy || !ShuffleImmTy ||
2620 X0->getType() != X1->getType())
2621 return false;
2622
2623 unsigned NumSrcElts = ShuffleSrcTy->getNumElements();
2624 unsigned NumImmElts = ShuffleImmTy->getNumElements();
2625
2626 // Attempt to merge shuffles, matching up to 2 source operands.
2627 // Replace index to a poison arg with PoisonMaskElem.
2628 // Bail if either inner masks reference an undef arg.
2629 SmallVector<int, 16> NewMask(OuterMask);
2630 Value *NewX = nullptr, *NewY = nullptr;
2631 for (int &M : NewMask) {
2632 Value *Src = nullptr;
2633 if (0 <= M && M < (int)NumImmElts) {
2634 Src = OuterV0;
2635 if (Match0) {
2636 M = InnerMask0[M];
2637 Src = M >= (int)NumSrcElts ? Y0 : X0;
2638 M = M >= (int)NumSrcElts ? (M - NumSrcElts) : M;
2639 }
2640 } else if (M >= (int)NumImmElts) {
2641 Src = OuterV1;
2642 M -= NumImmElts;
2643 if (Match1) {
2644 M = InnerMask1[M];
2645 Src = M >= (int)NumSrcElts ? Y1 : X1;
2646 M = M >= (int)NumSrcElts ? (M - NumSrcElts) : M;
2647 }
2648 }
2649 if (Src && M != PoisonMaskElem) {
2650 assert(0 <= M && M < (int)NumSrcElts && "Unexpected shuffle mask index");
2651 if (isa<UndefValue>(Src)) {
2652 // We've referenced an undef element - if it's poison, update the shuffle
2653 // mask, else bail.
2654 if (!isa<PoisonValue>(Src))
2655 return false;
2656 M = PoisonMaskElem;
2657 continue;
2658 }
2659 if (!NewX || NewX == Src) {
2660 NewX = Src;
2661 continue;
2662 }
2663 if (!NewY || NewY == Src) {
2664 M += NumSrcElts;
2665 NewY = Src;
2666 continue;
2667 }
2668 return false;
2669 }
2670 }
2671
2672 if (!NewX) {
2673 replaceValue(I, *PoisonValue::get(ShuffleDstTy));
return true;
}
2674 if (!NewY)
2675 NewY = PoisonValue::get(ShuffleSrcTy);
2676
2677 // Have we folded to an Identity shuffle?
2678 if (ShuffleVectorInst::isIdentityMask(NewMask, NumSrcElts)) {
2679 replaceValue(I, *NewX);
2680 return true;
2681 }
2682
2683 // Try to merge the shuffles if the new shuffle is not costly.
2684 InstructionCost InnerCost0 = 0;
2685 if (Match0)
2686 InnerCost0 = TTI.getInstructionCost(cast<User>(OuterV0), CostKind);
2687
2688 InstructionCost InnerCost1 = 0;
2689 if (Match1)
2690 InnerCost1 = TTI.getInstructionCost(cast<User>(OuterV1), CostKind);
2691
2693
2694 InstructionCost OldCost = InnerCost0 + InnerCost1 + OuterCost;
2695
2696 bool IsUnary = all_of(NewMask, [&](int M) { return M < (int)NumSrcElts; });
2697 TargetTransformInfo::ShuffleKind SK =
2698 IsUnary ? TargetTransformInfo::SK_PermuteSingleSrc
2699 : TargetTransformInfo::SK_PermuteTwoSrc;
2700 InstructionCost NewCost =
2701 TTI.getShuffleCost(SK, ShuffleDstTy, ShuffleSrcTy, NewMask, CostKind, 0,
2702 nullptr, {NewX, NewY});
2703 if (!OuterV0->hasOneUse())
2704 NewCost += InnerCost0;
2705 if (!OuterV1->hasOneUse())
2706 NewCost += InnerCost1;
2707
2708 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two shuffles: " << I
2709 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2710 << "\n");
2711 if (NewCost > OldCost)
2712 return false;
2713
2714 Value *Shuf = Builder.CreateShuffleVector(NewX, NewY, NewMask);
2715 replaceValue(I, *Shuf);
2716 return true;
2717}
2718
2719/// Try to convert
2720/// "shuffle (intrinsic), (intrinsic)" into "intrinsic (shuffle), (shuffle)".
2721bool VectorCombine::foldShuffleOfIntrinsics(Instruction &I) {
2722 Value *V0, *V1;
2723 ArrayRef<int> OldMask;
2724 if (!match(&I, m_Shuffle(m_OneUse(m_Value(V0)), m_OneUse(m_Value(V1)),
2725 m_Mask(OldMask))))
2726 return false;
2727
2728 auto *II0 = dyn_cast<IntrinsicInst>(V0);
2729 auto *II1 = dyn_cast<IntrinsicInst>(V1);
2730 if (!II0 || !II1)
2731 return false;
2732
2733 Intrinsic::ID IID = II0->getIntrinsicID();
2734 if (IID != II1->getIntrinsicID())
2735 return false;
2736
2737 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2738 auto *II0Ty = dyn_cast<FixedVectorType>(II0->getType());
2739 if (!ShuffleDstTy || !II0Ty)
2740 return false;
2741
2742 if (!isTriviallyVectorizable(IID))
2743 return false;
2744
2745 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I)
2746 if (isVectorIntrinsicWithScalarOpAtArg(IID, I, &TTI) &&
2747 II0->getArgOperand(I) != II1->getArgOperand(I))
2748 return false;
2749
2750 InstructionCost OldCost =
2751 TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II0), CostKind) +
2752 TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II1), CostKind) +
2754 II0Ty, OldMask, CostKind, 0, nullptr, {II0, II1}, &I);
2755
2756 SmallVector<Type *> NewArgsTy;
2757 InstructionCost NewCost = 0;
2758 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I) {
2759 if (isVectorIntrinsicWithScalarOpAtArg(IID, I, &TTI)) {
2760 NewArgsTy.push_back(II0->getArgOperand(I)->getType());
2761 } else {
2762 auto *VecTy = cast<FixedVectorType>(II0->getArgOperand(I)->getType());
2763 auto *ArgTy = FixedVectorType::get(VecTy->getElementType(),
2764 ShuffleDstTy->getNumElements());
2765 NewArgsTy.push_back(ArgTy);
2766 NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc,
2767 ArgTy, VecTy, OldMask, CostKind);
2768 }
2769 }
2770 IntrinsicCostAttributes NewAttr(IID, ShuffleDstTy, NewArgsTy);
2771 NewCost += TTI.getIntrinsicInstrCost(NewAttr, CostKind);
2772
2773 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two intrinsics: " << I
2774 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2775 << "\n");
2776
2777 if (NewCost > OldCost)
2778 return false;
2779
2780 SmallVector<Value *> NewArgs;
2781 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I)
2782 if (isVectorIntrinsicWithScalarOpAtArg(IID, I, &TTI)) {
2783 NewArgs.push_back(II0->getArgOperand(I));
2784 } else {
2785 Value *Shuf = Builder.CreateShuffleVector(II0->getArgOperand(I),
2786 II1->getArgOperand(I), OldMask);
2787 NewArgs.push_back(Shuf);
2788 Worklist.pushValue(Shuf);
2789 }
2790 Value *NewIntrinsic = Builder.CreateIntrinsic(ShuffleDstTy, IID, NewArgs);
2791
2792 // Intersect flags from the old intrinsics.
2793 if (auto *NewInst = dyn_cast<Instruction>(NewIntrinsic)) {
2794 NewInst->copyIRFlags(II0);
2795 NewInst->andIRFlags(II1);
2796 }
2797
2798 replaceValue(I, *NewIntrinsic);
2799 return true;
2800}
2801
2802using InstLane = std::pair<Use *, int>;
2803
2804static InstLane lookThroughShuffles(Use *U, int Lane) {
2805 while (auto *SV = dyn_cast<ShuffleVectorInst>(U->get())) {
2806 unsigned NumElts =
2807 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
2808 int M = SV->getMaskValue(Lane);
2809 if (M < 0)
2810 return {nullptr, PoisonMaskElem};
2811 if (static_cast<unsigned>(M) < NumElts) {
2812 U = &SV->getOperandUse(0);
2813 Lane = M;
2814 } else {
2815 U = &SV->getOperandUse(1);
2816 Lane = M - NumElts;
2817 }
2818 }
2819 return InstLane{U, Lane};
2820}
2821
2822 static SmallVector<InstLane>
2823 generateInstLaneVectorFromOperand(ArrayRef<InstLane> Item, int Op) {
2824 SmallVector<InstLane> NItem;
2825 for (InstLane IL : Item) {
2826 auto [U, Lane] = IL;
2827 InstLane OpLane =
2828 U ? lookThroughShuffles(&cast<Instruction>(U->get())->getOperandUse(Op),
2829 Lane)
2830 : InstLane{nullptr, PoisonMaskElem};
2831 NItem.emplace_back(OpLane);
2832 }
2833 return NItem;
2834}
2835
2836/// Detect concat of multiple values into a vector
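// For example (illustrative): an 8-lane Item whose first four lanes are lanes
// 0..3 of one <4 x i32> value and whose last four lanes are lanes 0..3 of a
// second <4 x i32> value is a concat of those two values.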
2837 static bool isFreeConcat(ArrayRef<InstLane> Item, TTI::TargetCostKind CostKind,
2838 const TargetTransformInfo &TTI) {
2839 auto *Ty = cast<FixedVectorType>(Item.front().first->get()->getType());
2840 unsigned NumElts = Ty->getNumElements();
2841 if (Item.size() == NumElts || NumElts == 1 || Item.size() % NumElts != 0)
2842 return false;
2843
2844 // Check that the concat is free, usually meaning that the type will be split
2845 // during legalization.
2846 SmallVector<int, 16> ConcatMask(NumElts * 2);
2847 std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
2848 if (TTI.getShuffleCost(TTI::SK_PermuteTwoSrc,
2849 FixedVectorType::get(Ty->getScalarType(), NumElts * 2),
2850 Ty, ConcatMask, CostKind) != 0)
2851 return false;
2852
2853 unsigned NumSlices = Item.size() / NumElts;
2854 // Currently we generate a tree of shuffles for the concats, which limits us
2855 // to a power2.
2856 if (!isPowerOf2_32(NumSlices))
2857 return false;
2858 for (unsigned Slice = 0; Slice < NumSlices; ++Slice) {
2859 Use *SliceV = Item[Slice * NumElts].first;
2860 if (!SliceV || SliceV->get()->getType() != Ty)
2861 return false;
2862 for (unsigned Elt = 0; Elt < NumElts; ++Elt) {
2863 auto [V, Lane] = Item[Slice * NumElts + Elt];
2864 if (Lane != static_cast<int>(Elt) || SliceV->get() != V->get())
2865 return false;
2866 }
2867 }
2868 return true;
2869}
2870
2871 static Value *generateNewInstTree(ArrayRef<InstLane> Item, FixedVectorType *Ty,
2872 const SmallPtrSet<Use *, 4> &IdentityLeafs,
2873 const SmallPtrSet<Use *, 4> &SplatLeafs,
2874 const SmallPtrSet<Use *, 4> &ConcatLeafs,
2875 IRBuilderBase &Builder,
2876 const TargetTransformInfo *TTI) {
2877 auto [FrontU, FrontLane] = Item.front();
2878
2879 if (IdentityLeafs.contains(FrontU)) {
2880 return FrontU->get();
2881 }
2882 if (SplatLeafs.contains(FrontU)) {
2883 SmallVector<int, 16> Mask(Ty->getNumElements(), FrontLane);
2884 return Builder.CreateShuffleVector(FrontU->get(), Mask);
2885 }
2886 if (ConcatLeafs.contains(FrontU)) {
2887 unsigned NumElts =
2888 cast<FixedVectorType>(FrontU->get()->getType())->getNumElements();
2889 SmallVector<Value *> Values(Item.size() / NumElts, nullptr);
2890 for (unsigned S = 0; S < Values.size(); ++S)
2891 Values[S] = Item[S * NumElts].first->get();
2892
2893 while (Values.size() > 1) {
2894 NumElts *= 2;
2895 SmallVector<int, 16> Mask(NumElts, 0);
2896 std::iota(Mask.begin(), Mask.end(), 0);
2897 SmallVector<Value *> NewValues(Values.size() / 2, nullptr);
2898 for (unsigned S = 0; S < NewValues.size(); ++S)
2899 NewValues[S] =
2900 Builder.CreateShuffleVector(Values[S * 2], Values[S * 2 + 1], Mask);
2901 Values = NewValues;
2902 }
2903 return Values[0];
2904 }
2905
2906 auto *I = cast<Instruction>(FrontU->get());
2907 auto *II = dyn_cast<IntrinsicInst>(I);
2908 unsigned NumOps = I->getNumOperands() - (II ? 1 : 0);
2909 SmallVector<Value *> Ops(NumOps);
2910 for (unsigned Idx = 0; Idx < NumOps; Idx++) {
2911 if (II &&
2912 isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Idx, TTI)) {
2913 Ops[Idx] = II->getOperand(Idx);
2914 continue;
2915 }
2916 Ops[Idx] = generateNewInstTree(generateInstLaneVectorFromOperand(Item, Idx),
2917 Ty, IdentityLeafs, SplatLeafs, ConcatLeafs,
2918 Builder, TTI);
2919 }
2920
2921 SmallVector<Value *, 8> ValueList;
2922 for (const auto &Lane : Item)
2923 if (Lane.first)
2924 ValueList.push_back(Lane.first->get());
2925
2926 Type *DstTy =
2927 FixedVectorType::get(I->getType()->getScalarType(), Ty->getNumElements());
2928 if (auto *BI = dyn_cast<BinaryOperator>(I)) {
2929 auto *Value = Builder.CreateBinOp((Instruction::BinaryOps)BI->getOpcode(),
2930 Ops[0], Ops[1]);
2931 propagateIRFlags(Value, ValueList);
2932 return Value;
2933 }
2934 if (auto *CI = dyn_cast<CmpInst>(I)) {
2935 auto *Value = Builder.CreateCmp(CI->getPredicate(), Ops[0], Ops[1]);
2936 propagateIRFlags(Value, ValueList);
2937 return Value;
2938 }
2939 if (auto *SI = dyn_cast<SelectInst>(I)) {
2940 auto *Value = Builder.CreateSelect(Ops[0], Ops[1], Ops[2], "", SI);
2941 propagateIRFlags(Value, ValueList);
2942 return Value;
2943 }
2944 if (auto *CI = dyn_cast<CastInst>(I)) {
2945 auto *Value = Builder.CreateCast(CI->getOpcode(), Ops[0], DstTy);
2946 propagateIRFlags(Value, ValueList);
2947 return Value;
2948 }
2949 if (II) {
2950 auto *Value = Builder.CreateIntrinsic(DstTy, II->getIntrinsicID(), Ops);
2951 propagateIRFlags(Value, ValueList);
2952 return Value;
2953 }
2954 assert(isa<UnaryInstruction>(I) && "Unexpected instruction type in Generate");
2955 auto *Value =
2956 Builder.CreateUnOp((Instruction::UnaryOps)I->getOpcode(), Ops[0]);
2957 propagateIRFlags(Value, ValueList);
2958 return Value;
2959}
2960
2961// Starting from a shuffle, look up through operands tracking the shuffled index
2962// of each lane. If we can simplify away the shuffles to identities then
2963// do so.
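// Illustrative example (hypothetical types, not from a test): every lane maps
// back to its own source lane, so all shuffles can be removed:
//   %rx = shufflevector <4 x i8> %x, <4 x i8> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
//   %e  = zext <4 x i8> %rx to <4 x i16>
//   %r  = shufflevector <4 x i16> %e, <4 x i16> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
// may become:
//   %r = zext <4 x i8> %x to <4 x i16>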
2964bool VectorCombine::foldShuffleToIdentity(Instruction &I) {
2965 auto *Ty = dyn_cast<FixedVectorType>(I.getType());
2966 if (!Ty || I.use_empty())
2967 return false;
2968
2969 SmallVector<InstLane> Start(Ty->getNumElements());
2970 for (unsigned M = 0, E = Ty->getNumElements(); M < E; ++M)
2971 Start[M] = lookThroughShuffles(&*I.use_begin(), M);
2972
2973 SmallVector<SmallVector<InstLane>> Worklist;
2974 Worklist.push_back(Start);
2975 SmallPtrSet<Use *, 4> IdentityLeafs, SplatLeafs, ConcatLeafs;
2976 unsigned NumVisited = 0;
2977
2978 while (!Worklist.empty()) {
2979 if (++NumVisited > MaxInstrsToScan)
2980 return false;
2981
2982 SmallVector<InstLane> Item = Worklist.pop_back_val();
2983 auto [FrontU, FrontLane] = Item.front();
2984
2985 // If we found an undef first lane then bail out to keep things simple.
2986 if (!FrontU)
2987 return false;
2988
2989 // Helper to peek through bitcasts to the same value.
2990 auto IsEquiv = [&](Value *X, Value *Y) {
2991 return X->getType() == Y->getType() &&
2993 };
2994
2995 // Look for an identity value.
2996 if (FrontLane == 0 &&
2997 cast<FixedVectorType>(FrontU->get()->getType())->getNumElements() ==
2998 Ty->getNumElements() &&
2999 all_of(drop_begin(enumerate(Item)), [IsEquiv, Item](const auto &E) {
3000 Value *FrontV = Item.front().first->get();
3001 return !E.value().first || (IsEquiv(E.value().first->get(), FrontV) &&
3002 E.value().second == (int)E.index());
3003 })) {
3004 IdentityLeafs.insert(FrontU);
3005 continue;
3006 }
3007 // Look for constants, for the moment only supporting constant splats.
3008 if (auto *C = dyn_cast<Constant>(FrontU);
3009 C && C->getSplatValue() &&
3010 all_of(drop_begin(Item), [Item](InstLane &IL) {
3011 Value *FrontV = Item.front().first->get();
3012 Use *U = IL.first;
3013 return !U || (isa<Constant>(U->get()) &&
3014 cast<Constant>(U->get())->getSplatValue() ==
3015 cast<Constant>(FrontV)->getSplatValue());
3016 })) {
3017 SplatLeafs.insert(FrontU);
3018 continue;
3019 }
3020 // Look for a splat value.
3021 if (all_of(drop_begin(Item), [Item](InstLane &IL) {
3022 auto [FrontU, FrontLane] = Item.front();
3023 auto [U, Lane] = IL;
3024 return !U || (U->get() == FrontU->get() && Lane == FrontLane);
3025 })) {
3026 SplatLeafs.insert(FrontU);
3027 continue;
3028 }
3029
3030 // We need each element to be the same type of value, and check that each
3031 // element has a single use.
3032 auto CheckLaneIsEquivalentToFirst = [Item](InstLane IL) {
3033 Value *FrontV = Item.front().first->get();
3034 if (!IL.first)
3035 return true;
3036 Value *V = IL.first->get();
3037 if (auto *I = dyn_cast<Instruction>(V); I && !I->hasOneUser())
3038 return false;
3039 if (V->getValueID() != FrontV->getValueID())
3040 return false;
3041 if (auto *CI = dyn_cast<CmpInst>(V))
3042 if (CI->getPredicate() != cast<CmpInst>(FrontV)->getPredicate())
3043 return false;
3044 if (auto *CI = dyn_cast<CastInst>(V))
3045 if (CI->getSrcTy()->getScalarType() !=
3046 cast<CastInst>(FrontV)->getSrcTy()->getScalarType())
3047 return false;
3048 if (auto *SI = dyn_cast<SelectInst>(V))
3049 if (!isa<VectorType>(SI->getOperand(0)->getType()) ||
3050 SI->getOperand(0)->getType() !=
3051 cast<SelectInst>(FrontV)->getOperand(0)->getType())
3052 return false;
3053 if (isa<CallInst>(V) && !isa<IntrinsicInst>(V))
3054 return false;
3055 auto *II = dyn_cast<IntrinsicInst>(V);
3056 return !II || (isa<IntrinsicInst>(FrontV) &&
3057 II->getIntrinsicID() ==
3058 cast<IntrinsicInst>(FrontV)->getIntrinsicID() &&
3059 !II->hasOperandBundles());
3060 };
3061 if (all_of(drop_begin(Item), CheckLaneIsEquivalentToFirst)) {
3062 // Check the operator is one that we support.
3063 if (isa<BinaryOperator, CmpInst>(FrontU)) {
3064 // We exclude div/rem in case they hit UB from poison lanes.
3065 if (auto *BO = dyn_cast<BinaryOperator>(FrontU);
3066 BO && BO->isIntDivRem())
3067 return false;
3068 Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0));
3069 Worklist.push_back(generateInstLaneVectorFromOperand(Item, 1));
3070 continue;
3071 } else if (isa<UnaryOperator, TruncInst, ZExtInst, SExtInst, FPToSIInst,
3072 FPToUIInst, SIToFPInst, UIToFPInst>(FrontU)) {
3073 Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0));
3074 continue;
3075 } else if (auto *BitCast = dyn_cast<BitCastInst>(FrontU)) {
3076 // TODO: Handle vector widening/narrowing bitcasts.
3077 auto *DstTy = dyn_cast<FixedVectorType>(BitCast->getDestTy());
3078 auto *SrcTy = dyn_cast<FixedVectorType>(BitCast->getSrcTy());
3079 if (DstTy && SrcTy &&
3080 SrcTy->getNumElements() == DstTy->getNumElements()) {
3081 Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0));
3082 continue;
3083 }
3084 } else if (isa<SelectInst>(FrontU)) {
3085 Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0));
3086 Worklist.push_back(generateInstLaneVectorFromOperand(Item, 1));
3087 Worklist.push_back(generateInstLaneVectorFromOperand(Item, 2));
3088 continue;
3089 } else if (auto *II = dyn_cast<IntrinsicInst>(FrontU);
3090 II && isTriviallyVectorizable(II->getIntrinsicID()) &&
3091 !II->hasOperandBundles()) {
3092 for (unsigned Op = 0, E = II->getNumOperands() - 1; Op < E; Op++) {
3093 if (isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Op,
3094 &TTI)) {
3095 if (!all_of(drop_begin(Item), [Item, Op](InstLane &IL) {
3096 Value *FrontV = Item.front().first->get();
3097 Use *U = IL.first;
3098 return !U || (cast<Instruction>(U->get())->getOperand(Op) ==
3099 cast<Instruction>(FrontV)->getOperand(Op));
3100 }))
3101 return false;
3102 continue;
3103 }
3104 Worklist.push_back(generateInstLaneVectorFromOperand(Item, Op));
3105 }
3106 continue;
3107 }
3108 }
3109
3110 if (isFreeConcat(Item, CostKind, TTI)) {
3111 ConcatLeafs.insert(FrontU);
3112 continue;
3113 }
3114
3115 return false;
3116 }
3117
3118 if (NumVisited <= 1)
3119 return false;
3120
3121 LLVM_DEBUG(dbgs() << "Found a superfluous identity shuffle: " << I << "\n");
3122
3123 // If we got this far, we know the shuffles are superfluous and can be
3124 // removed. Scan through again and generate the new tree of instructions.
3125 Builder.SetInsertPoint(&I);
3126 Value *V = generateNewInstTree(Start, Ty, IdentityLeafs, SplatLeafs,
3127 ConcatLeafs, Builder, &TTI);
3128 replaceValue(I, *V);
3129 return true;
3130}
3131
3132/// Given a commutative reduction, the order of the input lanes does not alter
3133/// the results. We can use this to remove certain shuffles feeding the
3134/// reduction, removing the need to shuffle at all.
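/// For example (an illustrative sketch; the rewrite is only performed when the
/// target cost model considers the sorted mask cheaper):
///   %s = shufflevector <4 x i32> %a, <4 x i32> poison, <4 x i32> <i32 3, i32 1, i32 2, i32 0>
///   %r = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
/// Sorting the mask yields the identity mask, so the shuffle can be dropped:
///   %r = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a)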
3135bool VectorCombine::foldShuffleFromReductions(Instruction &I) {
3136 auto *II = dyn_cast<IntrinsicInst>(&I);
3137 if (!II)
3138 return false;
3139 switch (II->getIntrinsicID()) {
3140 case Intrinsic::vector_reduce_add:
3141 case Intrinsic::vector_reduce_mul:
3142 case Intrinsic::vector_reduce_and:
3143 case Intrinsic::vector_reduce_or:
3144 case Intrinsic::vector_reduce_xor:
3145 case Intrinsic::vector_reduce_smin:
3146 case Intrinsic::vector_reduce_smax:
3147 case Intrinsic::vector_reduce_umin:
3148 case Intrinsic::vector_reduce_umax:
3149 break;
3150 default:
3151 return false;
3152 }
3153
3154 // Find all the inputs when looking through operations that do not alter the
3155 // lane order (binops, for example). Currently we look for a single shuffle,
3156 // and can ignore splat values.
3157 std::queue<Value *> Worklist;
3158 SmallPtrSet<Value *, 4> Visited;
3159 ShuffleVectorInst *Shuffle = nullptr;
3160 if (auto *Op = dyn_cast<Instruction>(I.getOperand(0)))
3161 Worklist.push(Op);
3162
3163 while (!Worklist.empty()) {
3164 Value *CV = Worklist.front();
3165 Worklist.pop();
3166 if (Visited.contains(CV))
3167 continue;
3168
3169 // Splats don't change the order, so can be safely ignored.
3170 if (isSplatValue(CV))
3171 continue;
3172
3173 Visited.insert(CV);
3174
3175 if (auto *CI = dyn_cast<Instruction>(CV)) {
3176 if (CI->isBinaryOp()) {
3177 for (auto *Op : CI->operand_values())
3178 Worklist.push(Op);
3179 continue;
3180 } else if (auto *SV = dyn_cast<ShuffleVectorInst>(CI)) {
3181 if (Shuffle && Shuffle != SV)
3182 return false;
3183 Shuffle = SV;
3184 continue;
3185 }
3186 }
3187
3188 // Anything else is currently an unknown node.
3189 return false;
3190 }
3191
3192 if (!Shuffle)
3193 return false;
3194
3195 // Check all uses of the binary ops and shuffles are also included in the
3196 // lane-invariant operations (Visited should be the list of lanewise
3197 // instructions, including the shuffle that we found).
3198 for (auto *V : Visited)
3199 for (auto *U : V->users())
3200 if (!Visited.contains(U) && U != &I)
3201 return false;
3202
3203 FixedVectorType *VecType =
3204 dyn_cast<FixedVectorType>(II->getOperand(0)->getType());
3205 if (!VecType)
3206 return false;
3207 FixedVectorType *ShuffleInputType =
3208 dyn_cast<FixedVectorType>(Shuffle->getOperand(0)->getType());
3209 if (!ShuffleInputType)
3210 return false;
3211 unsigned NumInputElts = ShuffleInputType->getNumElements();
3212
3213 // Find the mask from sorting the lanes into order. This is most likely to
3214 // become an identity or concat mask. Undef elements are pushed to the end.
3215 SmallVector<int> ConcatMask;
3216 Shuffle->getShuffleMask(ConcatMask);
3217 sort(ConcatMask, [](int X, int Y) { return (unsigned)X < (unsigned)Y; });
3218 bool UsesSecondVec =
3219 any_of(ConcatMask, [&](int M) { return M >= (int)NumInputElts; });
3220
3221 InstructionCost OldCost = TTI.getShuffleCost(
3222 UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc, VecType,
3223 ShuffleInputType, Shuffle->getShuffleMask(), CostKind);
3224 InstructionCost NewCost = TTI.getShuffleCost(
3225 UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc, VecType,
3226 ShuffleInputType, ConcatMask, CostKind);
3227
3228 LLVM_DEBUG(dbgs() << "Found a reduction feeding from a shuffle: " << *Shuffle
3229 << "\n");
3230 LLVM_DEBUG(dbgs() << " OldCost: " << OldCost << " vs NewCost: " << NewCost
3231 << "\n");
3232 bool MadeChanges = false;
3233 if (NewCost < OldCost) {
3234 Builder.SetInsertPoint(Shuffle);
3235 Value *NewShuffle = Builder.CreateShuffleVector(
3236 Shuffle->getOperand(0), Shuffle->getOperand(1), ConcatMask);
3237 LLVM_DEBUG(dbgs() << "Created new shuffle: " << *NewShuffle << "\n");
3238 replaceValue(*Shuffle, *NewShuffle);
3239 return true;
3240 }
3241
3242 // See if we can re-use foldSelectShuffle, getting it to reduce the size of
3243 // the shuffle into a nicer order, as it can ignore the order of the shuffles.
3244 MadeChanges |= foldSelectShuffle(*Shuffle, true);
3245 return MadeChanges;
3246}
3247
3248/// For a given chain of patterns of the following form:
3249///
3250/// ```
3251/// %1 = shufflevector <n x ty1> %0, <n x ty1> poison <n x ty2> mask
3252///
3253/// %2 = tail call <n x ty1> llvm.<umin/umax/smin/smax>(<n x ty1> %0, <n x
3254/// ty1> %1)
3255/// OR
3256/// %2 = add/mul/or/and/xor <n x ty1> %0, %1
3257///
3258/// %3 = shufflevector <n x ty1> %2, <n x ty1> poison <n x ty2> mask
3259/// ...
3260/// ...
3261/// %(i - 1) = tail call <n x ty1> llvm.<umin/umax/smin/smax>(<n x ty1> %(i -
3262/// 3), <n x ty1> %(i - 2)
3263/// OR
3264/// %(i - 1) = add/mul/or/and/xor <n x ty1> %(i - 3), %(i - 2)
3265///
3266/// %(i) = extractelement <n x ty1> %(i - 1), 0
3267/// ```
3268///
3269/// Where:
3270/// `mask` follows a partition pattern:
3271///
3272/// Ex:
3273/// [n = 8, p = poison]
3274///
3275/// 4 5 6 7 | p p p p
3276/// 2 3 | p p p p p p
3277/// 1 | p p p p p p p
3278///
3279/// For powers of 2, there's a consistent pattern, but for other cases
3280/// the parity of the current half value at each step decides the
3281/// next partition half (see `ExpectedParityMask` for more logical details
3282/// in generalising this).
3283///
3284/// Ex:
3285/// [n = 6]
3286///
3287/// 3 4 5 | p p p
3288/// 1 2 | p p p p
3289/// 1 | p p p p p
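///
/// A concrete instance (illustrative only; the rewrite is still subject to the
/// target cost model), for n = 4 with umin as the common operation:
///
/// ```
/// %s1 = shufflevector <4 x i32> %v, <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 poison, i32 poison>
/// %m1 = tail call <4 x i32> @llvm.umin.v4i32(<4 x i32> %v, <4 x i32> %s1)
/// %s2 = shufflevector <4 x i32> %m1, <4 x i32> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
/// %m2 = tail call <4 x i32> @llvm.umin.v4i32(<4 x i32> %m1, <4 x i32> %s2)
/// %r = extractelement <4 x i32> %m2, i64 0
/// ```
///
/// may be replaced by a single reduction:
///
/// ```
/// %r = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %v)
/// ```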
3290bool VectorCombine::foldShuffleChainsToReduce(Instruction &I) {
3291 // Going bottom-up for the pattern.
3292 std::queue<Value *> InstWorklist;
3293 InstructionCost OrigCost = 0;
3294
3295 // Common instruction operation after each shuffle op.
3296 std::optional<unsigned int> CommonCallOp = std::nullopt;
3297 std::optional<Instruction::BinaryOps> CommonBinOp = std::nullopt;
3298
3299 bool IsFirstCallOrBinInst = true;
3300 bool ShouldBeCallOrBinInst = true;
3301
3302 // This stores the last used instructions for shuffle/common op.
3303 //
3304 // PrevVecV[0] / PrevVecV[1] store the last two simultaneous
3305 // instructions from either shuffle/common op.
3306 SmallVector<Value *, 2> PrevVecV(2, nullptr);
3307
3308 Value *VecOpEE;
3309 if (!match(&I, m_ExtractElt(m_Value(VecOpEE), m_Zero())))
3310 return false;
3311
3312 auto *FVT = dyn_cast<FixedVectorType>(VecOpEE->getType());
3313 if (!FVT)
3314 return false;
3315
3316 int64_t VecSize = FVT->getNumElements();
3317 if (VecSize < 2)
3318 return false;
3319
3320 // Number of levels would be ~log2(n), considering we always partition
3321 // by half for this fold pattern.
3322 unsigned int NumLevels = Log2_64_Ceil(VecSize), VisitedCnt = 0;
3323 int64_t ShuffleMaskHalf = 1, ExpectedParityMask = 0;
3324
3325 // This is how we generalise for all element sizes.
3326 // At each step, if vector size is odd, we need non-poison
3327 // values to cover the dominant half so we don't miss out on any element.
3328 //
3329 // This mask will help us retrieve this as we go from bottom to top:
3330 //
3331 // Mask Set -> N = N * 2 - 1
3332 // Mask Unset -> N = N * 2
3333 for (int Cur = VecSize, Mask = NumLevels - 1; Cur > 1;
3334 Cur = (Cur + 1) / 2, --Mask) {
3335 if (Cur & 1)
3336 ExpectedParityMask |= (1ll << Mask);
3337 }
3338
3339 InstWorklist.push(VecOpEE);
3340
3341 while (!InstWorklist.empty()) {
3342 Value *CI = InstWorklist.front();
3343 InstWorklist.pop();
3344
3345 if (auto *II = dyn_cast<IntrinsicInst>(CI)) {
3346 if (!ShouldBeCallOrBinInst)
3347 return false;
3348
3349 if (!IsFirstCallOrBinInst &&
3350 any_of(PrevVecV, [](Value *VecV) { return VecV == nullptr; }))
3351 return false;
3352
3353 // For the first found call/bin op, the vector has to come from the
3354 // extract element op.
3355 if (II != (IsFirstCallOrBinInst ? VecOpEE : PrevVecV[0]))
3356 return false;
3357 IsFirstCallOrBinInst = false;
3358
3359 if (!CommonCallOp)
3360 CommonCallOp = II->getIntrinsicID();
3361 if (II->getIntrinsicID() != *CommonCallOp)
3362 return false;
3363
3364 switch (II->getIntrinsicID()) {
3365 case Intrinsic::umin:
3366 case Intrinsic::umax:
3367 case Intrinsic::smin:
3368 case Intrinsic::smax: {
3369 auto *Op0 = II->getOperand(0);
3370 auto *Op1 = II->getOperand(1);
3371 PrevVecV[0] = Op0;
3372 PrevVecV[1] = Op1;
3373 break;
3374 }
3375 default:
3376 return false;
3377 }
3378 ShouldBeCallOrBinInst ^= 1;
3379
3380 IntrinsicCostAttributes ICA(
3381 *CommonCallOp, II->getType(),
3382 {PrevVecV[0]->getType(), PrevVecV[1]->getType()});
3383 OrigCost += TTI.getIntrinsicInstrCost(ICA, CostKind);
3384
3385 // We may need a swap here since it can be (a, b) or (b, a)
3386 // and accordingly change as we go up.
3387 if (!isa<ShuffleVectorInst>(PrevVecV[1]))
3388 std::swap(PrevVecV[0], PrevVecV[1]);
3389 InstWorklist.push(PrevVecV[1]);
3390 InstWorklist.push(PrevVecV[0]);
3391 } else if (auto *BinOp = dyn_cast<BinaryOperator>(CI)) {
3392 // Similar logic for bin ops.
3393
3394 if (!ShouldBeCallOrBinInst)
3395 return false;
3396
3397 if (!IsFirstCallOrBinInst &&
3398 any_of(PrevVecV, [](Value *VecV) { return VecV == nullptr; }))
3399 return false;
3400
3401 if (BinOp != (IsFirstCallOrBinInst ? VecOpEE : PrevVecV[0]))
3402 return false;
3403 IsFirstCallOrBinInst = false;
3404
3405 if (!CommonBinOp)
3406 CommonBinOp = BinOp->getOpcode();
3407
3408 if (BinOp->getOpcode() != *CommonBinOp)
3409 return false;
3410
3411 switch (*CommonBinOp) {
3412 case BinaryOperator::Add:
3413 case BinaryOperator::Mul:
3414 case BinaryOperator::Or:
3415 case BinaryOperator::And:
3416 case BinaryOperator::Xor: {
3417 auto *Op0 = BinOp->getOperand(0);
3418 auto *Op1 = BinOp->getOperand(1);
3419 PrevVecV[0] = Op0;
3420 PrevVecV[1] = Op1;
3421 break;
3422 }
3423 default:
3424 return false;
3425 }
3426 ShouldBeCallOrBinInst ^= 1;
3427
3428 OrigCost +=
3429 TTI.getArithmeticInstrCost(*CommonBinOp, BinOp->getType(), CostKind);
3430
3431 if (!isa<ShuffleVectorInst>(PrevVecV[1]))
3432 std::swap(PrevVecV[0], PrevVecV[1]);
3433 InstWorklist.push(PrevVecV[1]);
3434 InstWorklist.push(PrevVecV[0]);
3435 } else if (auto *SVInst = dyn_cast<ShuffleVectorInst>(CI)) {
3436 // We shouldn't have any null values in the previous vectors;
3437 // if so, there was a mismatch in the pattern.
3438 if (ShouldBeCallOrBinInst ||
3439 any_of(PrevVecV, [](Value *VecV) { return VecV == nullptr; }))
3440 return false;
3441
3442 if (SVInst != PrevVecV[1])
3443 return false;
3444
3445 ArrayRef<int> CurMask;
3446 if (!match(SVInst, m_Shuffle(m_Specific(PrevVecV[0]), m_Poison(),
3447 m_Mask(CurMask))))
3448 return false;
3449
3450 // Subtract the parity mask when checking the condition.
3451 for (int Mask = 0, MaskSize = CurMask.size(); Mask != MaskSize; ++Mask) {
3452 if (Mask < ShuffleMaskHalf &&
3453 CurMask[Mask] != ShuffleMaskHalf + Mask - (ExpectedParityMask & 1))
3454 return false;
3455 if (Mask >= ShuffleMaskHalf && CurMask[Mask] != -1)
3456 return false;
3457 }
3458
3459 // Update mask values.
3460 ShuffleMaskHalf *= 2;
3461 ShuffleMaskHalf -= (ExpectedParityMask & 1);
3462 ExpectedParityMask >>= 1;
3463
3464 OrigCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc,
3465 SVInst->getType(), SVInst->getType(),
3466 CurMask, CostKind);
3467
3468 VisitedCnt += 1;
3469 if (!ExpectedParityMask && VisitedCnt == NumLevels)
3470 break;
3471
3472 ShouldBeCallOrBinInst ^= 1;
3473 } else {
3474 return false;
3475 }
3476 }
3477
3478 // Pattern should end with a shuffle op.
3479 if (ShouldBeCallOrBinInst)
3480 return false;
3481
3482 assert(VecSize != -1 && "Expected Match for Vector Size");
3483
3484 Value *FinalVecV = PrevVecV[0];
3485 if (!FinalVecV)
3486 return false;
3487
3488 auto *FinalVecVTy = cast<FixedVectorType>(FinalVecV->getType());
3489
3490 Intrinsic::ID ReducedOp =
3491 (CommonCallOp ? getMinMaxReductionIntrinsicID(*CommonCallOp)
3492 : getReductionForBinop(*CommonBinOp));
3493 if (!ReducedOp)
3494 return false;
3495
3496 IntrinsicCostAttributes ICA(ReducedOp, FinalVecVTy, {FinalVecV});
3497 InstructionCost NewCost = TTI.getIntrinsicInstrCost(ICA, CostKind);
3498
3499 if (NewCost >= OrigCost)
3500 return false;
3501
3502 auto *ReducedResult =
3503 Builder.CreateIntrinsic(ReducedOp, {FinalVecV->getType()}, {FinalVecV});
3504 replaceValue(I, *ReducedResult);
3505
3506 return true;
3507}
3508
3509/// Determine if it's more efficient to fold:
3510/// reduce(trunc(x)) -> trunc(reduce(x)).
3511/// reduce(sext(x)) -> sext(reduce(x)).
3512/// reduce(zext(x)) -> zext(reduce(x)).
3513bool VectorCombine::foldCastFromReductions(Instruction &I) {
3514 auto *II = dyn_cast<IntrinsicInst>(&I);
3515 if (!II)
3516 return false;
3517
3518 bool TruncOnly = false;
3519 Intrinsic::ID IID = II->getIntrinsicID();
3520 switch (IID) {
3521 case Intrinsic::vector_reduce_add:
3522 case Intrinsic::vector_reduce_mul:
3523 TruncOnly = true;
3524 break;
3525 case Intrinsic::vector_reduce_and:
3526 case Intrinsic::vector_reduce_or:
3527 case Intrinsic::vector_reduce_xor:
3528 break;
3529 default:
3530 return false;
3531 }
3532
3533 unsigned ReductionOpc = getArithmeticReductionInstruction(IID);
3534 Value *ReductionSrc = I.getOperand(0);
3535
3536 Value *Src;
3537 if (!match(ReductionSrc, m_OneUse(m_Trunc(m_Value(Src)))) &&
3538 (TruncOnly || !match(ReductionSrc, m_OneUse(m_ZExtOrSExt(m_Value(Src))))))
3539 return false;
3540
3541 auto CastOpc =
3542 (Instruction::CastOps)cast<Instruction>(ReductionSrc)->getOpcode();
3543
3544 auto *SrcTy = cast<VectorType>(Src->getType());
3545 auto *ReductionSrcTy = cast<VectorType>(ReductionSrc->getType());
3546 Type *ResultTy = I.getType();
3547
3548 InstructionCost OldCost = TTI.getArithmeticReductionCost(
3549 ReductionOpc, ReductionSrcTy, std::nullopt, CostKind);
3550 OldCost += TTI.getCastInstrCost(CastOpc, ReductionSrcTy, SrcTy,
3551 TTI::CastContextHint::None, CostKind,
3552 cast<CastInst>(ReductionSrc));
3553 InstructionCost NewCost =
3554 TTI.getArithmeticReductionCost(ReductionOpc, SrcTy, std::nullopt,
3555 CostKind) +
3556 TTI.getCastInstrCost(CastOpc, ResultTy, ReductionSrcTy->getScalarType(),
3557 TTI::CastContextHint::None, CostKind);
3558
3559 if (OldCost <= NewCost || !NewCost.isValid())
3560 return false;
3561
3562 Value *NewReduction = Builder.CreateIntrinsic(SrcTy->getScalarType(),
3563 II->getIntrinsicID(), {Src});
3564 Value *NewCast = Builder.CreateCast(CastOpc, NewReduction, ResultTy);
3565 replaceValue(I, *NewCast);
3566 return true;
3567}
3568
3569/// Returns true if this ShuffleVectorInst eventually feeds into a
3570/// vector reduction intrinsic (e.g., vector_reduce_add) by only following
3571/// chains of shuffles and binary operators (in any combination/order).
3572/// The search does not go deeper than the given Depth.
3573static bool feedsIntoVectorReduction(ShuffleVectorInst *SVI) {
3574 constexpr unsigned MaxVisited = 32;
3575 SmallPtrSet<Instruction *, 8> Visited;
3576 SmallVector<Instruction *> WorkList;
3577 bool FoundReduction = false;
3578
3579 WorkList.push_back(SVI);
3580 while (!WorkList.empty()) {
3581 Instruction *I = WorkList.pop_back_val();
3582 for (User *U : I->users()) {
3583 auto *UI = cast<Instruction>(U);
3584 if (!UI || !Visited.insert(UI).second)
3585 continue;
3586 if (Visited.size() > MaxVisited)
3587 return false;
3588 if (auto *II = dyn_cast<IntrinsicInst>(UI)) {
3589 // More than one reduction reached
3590 if (FoundReduction)
3591 return false;
3592 switch (II->getIntrinsicID()) {
3593 case Intrinsic::vector_reduce_add:
3594 case Intrinsic::vector_reduce_mul:
3595 case Intrinsic::vector_reduce_and:
3596 case Intrinsic::vector_reduce_or:
3597 case Intrinsic::vector_reduce_xor:
3598 case Intrinsic::vector_reduce_smin:
3599 case Intrinsic::vector_reduce_smax:
3600 case Intrinsic::vector_reduce_umin:
3601 case Intrinsic::vector_reduce_umax:
3602 FoundReduction = true;
3603 continue;
3604 default:
3605 return false;
3606 }
3607 }
3608
3609 if (!isa<BinaryOperator, ShuffleVectorInst>(UI))
3610 return false;
3611
3612 WorkList.emplace_back(UI);
3613 }
3614 }
3615 return FoundReduction;
3616}
3617
3618/// This method looks for groups of shuffles acting on binops, of the form:
3619/// %x = shuffle ...
3620/// %y = shuffle ...
3621/// %a = binop %x, %y
3622/// %b = binop %x, %y
3623/// shuffle %a, %b, selectmask
3624/// We may, especially if the shuffle is wider than legal, be able to convert
3625/// the shuffle to a form where only parts of a and b need to be computed. On
3626/// architectures with no obvious "select" shuffle, this can reduce the total
3627/// number of operations if the target reports them as cheaper.
3628bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
3629 auto *SVI = cast<ShuffleVectorInst>(&I);
3630 auto *VT = cast<FixedVectorType>(I.getType());
3631 auto *Op0 = dyn_cast<Instruction>(SVI->getOperand(0));
3632 auto *Op1 = dyn_cast<Instruction>(SVI->getOperand(1));
3633 if (!Op0 || !Op1 || Op0 == Op1 || !Op0->isBinaryOp() || !Op1->isBinaryOp() ||
3634 VT != Op0->getType())
3635 return false;
3636
3637 auto *SVI0A = dyn_cast<Instruction>(Op0->getOperand(0));
3638 auto *SVI0B = dyn_cast<Instruction>(Op0->getOperand(1));
3639 auto *SVI1A = dyn_cast<Instruction>(Op1->getOperand(0));
3640 auto *SVI1B = dyn_cast<Instruction>(Op1->getOperand(1));
3641 SmallPtrSet<Instruction *, 4> InputShuffles({SVI0A, SVI0B, SVI1A, SVI1B});
3642 auto checkSVNonOpUses = [&](Instruction *I) {
3643 if (!I || I->getOperand(0)->getType() != VT)
3644 return true;
3645 return any_of(I->users(), [&](User *U) {
3646 return U != Op0 && U != Op1 &&
3647 !(isa<ShuffleVectorInst>(U) &&
3648 (InputShuffles.contains(cast<Instruction>(U)) ||
3649 isInstructionTriviallyDead(cast<Instruction>(U))));
3650 });
3651 };
3652 if (checkSVNonOpUses(SVI0A) || checkSVNonOpUses(SVI0B) ||
3653 checkSVNonOpUses(SVI1A) || checkSVNonOpUses(SVI1B))
3654 return false;
3655
3656 // Collect all the uses that are shuffles that we can transform together. We
3657 // may not have a single shuffle, but a group that can all be transformed
3658 // together profitably.
3659 SmallVector<ShuffleVectorInst *> Shuffles;
3660 auto collectShuffles = [&](Instruction *I) {
3661 for (auto *U : I->users()) {
3662 auto *SV = dyn_cast<ShuffleVectorInst>(U);
3663 if (!SV || SV->getType() != VT)
3664 return false;
3665 if ((SV->getOperand(0) != Op0 && SV->getOperand(0) != Op1) ||
3666 (SV->getOperand(1) != Op0 && SV->getOperand(1) != Op1))
3667 return false;
3668 if (!llvm::is_contained(Shuffles, SV))
3669 Shuffles.push_back(SV);
3670 }
3671 return true;
3672 };
3673 if (!collectShuffles(Op0) || !collectShuffles(Op1))
3674 return false;
3675 // From a reduction, we need to be processing a single shuffle, otherwise the
3676 // other uses will not be lane-invariant.
3677 if (FromReduction && Shuffles.size() > 1)
3678 return false;
3679
3680 // Add any shuffle uses for the shuffles we have found, to include them in our
3681 // cost calculations.
3682 if (!FromReduction) {
3683 for (ShuffleVectorInst *SV : Shuffles) {
3684 for (auto *U : SV->users()) {
3685 ShuffleVectorInst *SSV = dyn_cast<ShuffleVectorInst>(U);
3686 if (SSV && isa<UndefValue>(SSV->getOperand(1)) && SSV->getType() == VT)
3687 Shuffles.push_back(SSV);
3688 }
3689 }
3690 }
3691
3692 // For each of the output shuffles, we try to sort all the first vector
3693 // elements to the beginning, followed by the second vector's elements at the
3694 // end. If the binops are legalized to smaller vectors, this may reduce total
3695 // number of binops. We compute the ReconstructMask mask needed to convert
3696 // back to the original lane order.
3697 SmallVector<std::pair<int, int>> V1, V2;
3698 SmallVector<SmallVector<int>> OrigReconstructMasks;
3699 int MaxV1Elt = 0, MaxV2Elt = 0;
3700 unsigned NumElts = VT->getNumElements();
3701 for (ShuffleVectorInst *SVN : Shuffles) {
3702 SmallVector<int> Mask;
3703 SVN->getShuffleMask(Mask);
3704
3705 // Check the operands are the same as the original, or reversed (in which
3706 // case we need to commute the mask).
3707 Value *SVOp0 = SVN->getOperand(0);
3708 Value *SVOp1 = SVN->getOperand(1);
3709 if (isa<UndefValue>(SVOp1)) {
3710 auto *SSV = cast<ShuffleVectorInst>(SVOp0);
3711 SVOp0 = SSV->getOperand(0);
3712 SVOp1 = SSV->getOperand(1);
3713 for (int &Elem : Mask) {
3714 if (Elem >= static_cast<int>(SSV->getShuffleMask().size()))
3715 return false;
3716 Elem = Elem < 0 ? Elem : SSV->getMaskValue(Elem);
3717 }
3718 }
3719 if (SVOp0 == Op1 && SVOp1 == Op0) {
3720 std::swap(SVOp0, SVOp1);
3721 ShuffleVectorInst::commuteShuffleMask(Mask, NumElts);
3722 }
3723 if (SVOp0 != Op0 || SVOp1 != Op1)
3724 return false;
3725
3726 // Calculate the reconstruction mask for this shuffle, as the mask needed to
3727 // take the packed values from Op0/Op1 and reconstructing to the original
3728 // order.
3729 SmallVector<int> ReconstructMask;
3730 for (unsigned I = 0; I < Mask.size(); I++) {
3731 if (Mask[I] < 0) {
3732 ReconstructMask.push_back(-1);
3733 } else if (Mask[I] < static_cast<int>(NumElts)) {
3734 MaxV1Elt = std::max(MaxV1Elt, Mask[I]);
3735 auto It = find_if(V1, [&](const std::pair<int, int> &A) {
3736 return Mask[I] == A.first;
3737 });
3738 if (It != V1.end())
3739 ReconstructMask.push_back(It - V1.begin());
3740 else {
3741 ReconstructMask.push_back(V1.size());
3742 V1.emplace_back(Mask[I], V1.size());
3743 }
3744 } else {
3745 MaxV2Elt = std::max<int>(MaxV2Elt, Mask[I] - NumElts);
3746 auto It = find_if(V2, [&](const std::pair<int, int> &A) {
3747 return Mask[I] - static_cast<int>(NumElts) == A.first;
3748 });
3749 if (It != V2.end())
3750 ReconstructMask.push_back(NumElts + It - V2.begin());
3751 else {
3752 ReconstructMask.push_back(NumElts + V2.size());
3753 V2.emplace_back(Mask[I] - NumElts, NumElts + V2.size());
3754 }
3755 }
3756 }
3757
3758 // For reductions, we know that the output lane ordering doesn't alter the
3759 // result. In-order can help simplify the shuffle away.
3760 if (FromReduction)
3761 sort(ReconstructMask);
3762 OrigReconstructMasks.push_back(std::move(ReconstructMask));
3763 }
3764
3765 // If the maximum elements used from V1 and V2 are not larger than the new
3766 // vectors, the vectors are already packed and performing the optimization
3767 // again will likely not help any further. This also prevents us from getting
3768 // stuck in a cycle in case the costs do not also rule it out.
3769 if (V1.empty() || V2.empty() ||
3770 (MaxV1Elt == static_cast<int>(V1.size()) - 1 &&
3771 MaxV2Elt == static_cast<int>(V2.size()) - 1))
3772 return false;
3773
3774 // GetBaseMaskValue takes one of the inputs, which may either be a shuffle, a
3775 // shuffle of another shuffle, or not a shuffle (that is treated like an
3776 // identity shuffle).
3777 auto GetBaseMaskValue = [&](Instruction *I, int M) {
3778 auto *SV = dyn_cast<ShuffleVectorInst>(I);
3779 if (!SV)
3780 return M;
3781 if (isa<UndefValue>(SV->getOperand(1)))
3782 if (auto *SSV = dyn_cast<ShuffleVectorInst>(SV->getOperand(0)))
3783 if (InputShuffles.contains(SSV))
3784 return SSV->getMaskValue(SV->getMaskValue(M));
3785 return SV->getMaskValue(M);
3786 };
3787
3788 // Attempt to sort the inputs by ascending mask values to make simpler input
3789 // shuffles and push complex shuffles down to the uses. We sort on the first
3790 // of the two input shuffle orders, to try and get at least one input into a
3791 // nice order.
3792 auto SortBase = [&](Instruction *A, std::pair<int, int> X,
3793 std::pair<int, int> Y) {
3794 int MXA = GetBaseMaskValue(A, X.first);
3795 int MYA = GetBaseMaskValue(A, Y.first);
3796 return MXA < MYA;
3797 };
3798 stable_sort(V1, [&](std::pair<int, int> A, std::pair<int, int> B) {
3799 return SortBase(SVI0A, A, B);
3800 });
3801 stable_sort(V2, [&](std::pair<int, int> A, std::pair<int, int> B) {
3802 return SortBase(SVI1A, A, B);
3803 });
3804 // Calculate our ReconstructMasks from the OrigReconstructMasks and the
3805 // modified order of the input shuffles.
3806 SmallVector<SmallVector<int>> ReconstructMasks;
3807 for (const auto &Mask : OrigReconstructMasks) {
3808 SmallVector<int> ReconstructMask;
3809 for (int M : Mask) {
3810 auto FindIndex = [](const SmallVector<std::pair<int, int>> &V, int M) {
3811 auto It = find_if(V, [M](auto A) { return A.second == M; });
3812 assert(It != V.end() && "Expected all entries in Mask");
3813 return std::distance(V.begin(), It);
3814 };
3815 if (M < 0)
3816 ReconstructMask.push_back(-1);
3817 else if (M < static_cast<int>(NumElts)) {
3818 ReconstructMask.push_back(FindIndex(V1, M));
3819 } else {
3820 ReconstructMask.push_back(NumElts + FindIndex(V2, M));
3821 }
3822 }
3823 ReconstructMasks.push_back(std::move(ReconstructMask));
3824 }
3825
3826 // Calculate the masks needed for the new input shuffles, which get padded
3827 // with undef
3828 SmallVector<int> V1A, V1B, V2A, V2B;
3829 for (unsigned I = 0; I < V1.size(); I++) {
3830 V1A.push_back(GetBaseMaskValue(SVI0A, V1[I].first));
3831 V1B.push_back(GetBaseMaskValue(SVI0B, V1[I].first));
3832 }
3833 for (unsigned I = 0; I < V2.size(); I++) {
3834 V2A.push_back(GetBaseMaskValue(SVI1A, V2[I].first));
3835 V2B.push_back(GetBaseMaskValue(SVI1B, V2[I].first));
3836 }
3837 while (V1A.size() < NumElts) {
3838 V1A.push_back(PoisonMaskElem);
3839 V1B.push_back(PoisonMaskElem);
3840 }
3841 while (V2A.size() < NumElts) {
3842 V2A.push_back(PoisonMaskElem);
3843 V2B.push_back(PoisonMaskElem);
3844 }
3845
3846 auto AddShuffleCost = [&](InstructionCost C, Instruction *I) {
3847 auto *SV = dyn_cast<ShuffleVectorInst>(I);
3848 if (!SV)
3849 return C;
3850 return C + TTI.getShuffleCost(isa<UndefValue>(SV->getOperand(1))
3851 ? TTI::SK_PermuteSingleSrc
3852 : TTI::SK_PermuteTwoSrc,
3853 VT, VT, SV->getShuffleMask(), CostKind);
3854 };
3855 auto AddShuffleMaskCost = [&](InstructionCost C, ArrayRef<int> Mask) {
3856 return C +
3857 TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, VT, VT, Mask, CostKind);
3858 };
3859
3860 unsigned ElementSize = VT->getElementType()->getPrimitiveSizeInBits();
3861 unsigned MaxVectorSize =
3862 TTI.getRegisterBitWidth(TTI::RGK_FixedWidthVector).getFixedValue();
3863 unsigned MaxElementsInVector = MaxVectorSize / ElementSize;
3864 if (MaxElementsInVector == 0)
3865 return false;
3866 // When there are multiple shufflevector operations on the same input,
3867 // especially when the vector length is larger than the register size,
3868 // identical shuffle patterns may occur across different groups of elements.
3869 // To avoid overestimating the cost by counting these repeated shuffles more
3870 // than once, we only account for unique shuffle patterns. This adjustment
3871 // prevents inflated costs in the cost model for wide vectors split into
3872 // several register-sized groups.
3873 std::set<SmallVector<int, 4>> UniqueShuffles;
3874 auto AddShuffleMaskAdjustedCost = [&](InstructionCost C, ArrayRef<int> Mask) {
3875 // Compute the cost for performing the shuffle over the full vector.
3876 auto ShuffleCost =
3877 TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, VT, VT, Mask, CostKind);
3878 unsigned NumFullVectors = Mask.size() / MaxElementsInVector;
3879 if (NumFullVectors < 2)
3880 return C + ShuffleCost;
3881 SmallVector<int, 4> SubShuffle(MaxElementsInVector);
3882 unsigned NumUniqueGroups = 0;
3883 unsigned NumGroups = Mask.size() / MaxElementsInVector;
3884 // For each group of MaxElementsInVector contiguous elements,
3885 // collect their shuffle pattern and insert into the set of unique patterns.
3886 for (unsigned I = 0; I < NumFullVectors; ++I) {
3887 for (unsigned J = 0; J < MaxElementsInVector; ++J)
3888 SubShuffle[J] = Mask[MaxElementsInVector * I + J];
3889 if (UniqueShuffles.insert(SubShuffle).second)
3890 NumUniqueGroups += 1;
3891 }
3892 return C + ShuffleCost * NumUniqueGroups / NumGroups;
3893 };
3894 auto AddShuffleAdjustedCost = [&](InstructionCost C, Instruction *I) {
3895 auto *SV = dyn_cast<ShuffleVectorInst>(I);
3896 if (!SV)
3897 return C;
3898 SmallVector<int, 16> Mask;
3899 SV->getShuffleMask(Mask);
3900 return AddShuffleMaskAdjustedCost(C, Mask);
3901 };
3902 // Check that input consists of ShuffleVectors applied to the same input
3903 auto AllShufflesHaveSameOperands =
3904 [](SmallPtrSetImpl<Instruction *> &InputShuffles) {
3905 if (InputShuffles.size() < 2)
3906 return false;
3907 ShuffleVectorInst *FirstSV =
3908 dyn_cast<ShuffleVectorInst>(*InputShuffles.begin());
3909 if (!FirstSV)
3910 return false;
3911
3912 Value *In0 = FirstSV->getOperand(0), *In1 = FirstSV->getOperand(1);
3913 return std::all_of(
3914 std::next(InputShuffles.begin()), InputShuffles.end(),
3915 [&](Instruction *I) {
3916 ShuffleVectorInst *SV = dyn_cast<ShuffleVectorInst>(I);
3917 return SV && SV->getOperand(0) == In0 && SV->getOperand(1) == In1;
3918 });
3919 };
3920
3921 // Get the costs of the shuffles + binops before and after with the new
3922 // shuffle masks.
3923 InstructionCost CostBefore =
3924 TTI.getArithmeticInstrCost(Op0->getOpcode(), VT, CostKind) +
3925 TTI.getArithmeticInstrCost(Op1->getOpcode(), VT, CostKind);
3926 CostBefore += std::accumulate(Shuffles.begin(), Shuffles.end(),
3927 InstructionCost(0), AddShuffleCost);
3928 if (AllShufflesHaveSameOperands(InputShuffles)) {
3929 UniqueShuffles.clear();
3930 CostBefore += std::accumulate(InputShuffles.begin(), InputShuffles.end(),
3931 InstructionCost(0), AddShuffleAdjustedCost);
3932 } else {
3933 CostBefore += std::accumulate(InputShuffles.begin(), InputShuffles.end(),
3934 InstructionCost(0), AddShuffleCost);
3935 }
3936
3937 // The new binops will be unused for lanes past the used shuffle lengths.
3938 // These types attempt to get the correct cost for that from the target.
3939 FixedVectorType *Op0SmallVT =
3940 FixedVectorType::get(VT->getScalarType(), V1.size());
3941 FixedVectorType *Op1SmallVT =
3942 FixedVectorType::get(VT->getScalarType(), V2.size());
3943 InstructionCost CostAfter =
3944 TTI.getArithmeticInstrCost(Op0->getOpcode(), Op0SmallVT, CostKind) +
3945 TTI.getArithmeticInstrCost(Op1->getOpcode(), Op1SmallVT, CostKind);
3946 UniqueShuffles.clear();
3947 CostAfter += std::accumulate(ReconstructMasks.begin(), ReconstructMasks.end(),
3948 InstructionCost(0), AddShuffleMaskAdjustedCost);
3949 std::set<SmallVector<int>> OutputShuffleMasks({V1A, V1B, V2A, V2B});
3950 CostAfter +=
3951 std::accumulate(OutputShuffleMasks.begin(), OutputShuffleMasks.end(),
3952 InstructionCost(0), AddShuffleMaskCost);
3953
3954 LLVM_DEBUG(dbgs() << "Found a binop select shuffle pattern: " << I << "\n");
3955 LLVM_DEBUG(dbgs() << " CostBefore: " << CostBefore
3956 << " vs CostAfter: " << CostAfter << "\n");
3957 if (CostBefore < CostAfter ||
3958 (CostBefore == CostAfter && !feedsIntoVectorReduction(SVI)))
3959 return false;
3960
3961 // The cost model has passed, create the new instructions.
3962 auto GetShuffleOperand = [&](Instruction *I, unsigned Op) -> Value * {
3963 auto *SV = dyn_cast<ShuffleVectorInst>(I);
3964 if (!SV)
3965 return I;
3966 if (isa<UndefValue>(SV->getOperand(1)))
3967 if (auto *SSV = dyn_cast<ShuffleVectorInst>(SV->getOperand(0)))
3968 if (InputShuffles.contains(SSV))
3969 return SSV->getOperand(Op);
3970 return SV->getOperand(Op);
3971 };
3972 Builder.SetInsertPoint(*SVI0A->getInsertionPointAfterDef());
3973 Value *NSV0A = Builder.CreateShuffleVector(GetShuffleOperand(SVI0A, 0),
3974 GetShuffleOperand(SVI0A, 1), V1A);
3975 Builder.SetInsertPoint(*SVI0B->getInsertionPointAfterDef());
3976 Value *NSV0B = Builder.CreateShuffleVector(GetShuffleOperand(SVI0B, 0),
3977 GetShuffleOperand(SVI0B, 1), V1B);
3978 Builder.SetInsertPoint(*SVI1A->getInsertionPointAfterDef());
3979 Value *NSV1A = Builder.CreateShuffleVector(GetShuffleOperand(SVI1A, 0),
3980 GetShuffleOperand(SVI1A, 1), V2A);
3981 Builder.SetInsertPoint(*SVI1B->getInsertionPointAfterDef());
3982 Value *NSV1B = Builder.CreateShuffleVector(GetShuffleOperand(SVI1B, 0),
3983 GetShuffleOperand(SVI1B, 1), V2B);
3984 Builder.SetInsertPoint(Op0);
3985 Value *NOp0 = Builder.CreateBinOp((Instruction::BinaryOps)Op0->getOpcode(),
3986 NSV0A, NSV0B);
3987 if (auto *I = dyn_cast<Instruction>(NOp0))
3988 I->copyIRFlags(Op0, true);
3989 Builder.SetInsertPoint(Op1);
3990 Value *NOp1 = Builder.CreateBinOp((Instruction::BinaryOps)Op1->getOpcode(),
3991 NSV1A, NSV1B);
3992 if (auto *I = dyn_cast<Instruction>(NOp1))
3993 I->copyIRFlags(Op1, true);
3994
3995 for (int S = 0, E = ReconstructMasks.size(); S != E; S++) {
3996 Builder.SetInsertPoint(Shuffles[S]);
3997 Value *NSV = Builder.CreateShuffleVector(NOp0, NOp1, ReconstructMasks[S]);
3998 replaceValue(*Shuffles[S], *NSV, false);
3999 }
4000
4001 Worklist.pushValue(NSV0A);
4002 Worklist.pushValue(NSV0B);
4003 Worklist.pushValue(NSV1A);
4004 Worklist.pushValue(NSV1B);
4005 return true;
4006}
4007
4008/// Check if instruction depends on ZExt and this ZExt can be moved after the
4009/// instruction. Move ZExt if it is profitable. For example:
4010/// logic(zext(x),y) -> zext(logic(x,trunc(y)))
4011/// lshr((zext(x),y) -> zext(lshr(x,trunc(y)))
4012/// Cost model calculations take into account whether zext(x) has other users
4013/// and whether it can be propagated through them too.
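/// For example (a sketch; the 'and' keeps the high bits known to be zero, so
/// the result still fits in the narrow type, and the fold is only applied if
/// the cost model agrees):
///   %z = zext <4 x i8> %x to <4 x i32>
///   %r = and <4 x i32> %z, %y
/// -->
///   %t = trunc <4 x i32> %y to <4 x i8>
///   %n = and <4 x i8> %x, %t
///   %r = zext <4 x i8> %n to <4 x i32>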
4014bool VectorCombine::shrinkType(Instruction &I) {
4015 Value *ZExted, *OtherOperand;
4016 if (!match(&I, m_c_BitwiseLogic(m_ZExt(m_Value(ZExted)),
4017 m_Value(OtherOperand))) &&
4018 !match(&I, m_LShr(m_ZExt(m_Value(ZExted)), m_Value(OtherOperand))))
4019 return false;
4020
4021 Value *ZExtOperand = I.getOperand(I.getOperand(0) == OtherOperand ? 1 : 0);
4022
4023 auto *BigTy = cast<FixedVectorType>(I.getType());
4024 auto *SmallTy = cast<FixedVectorType>(ZExted->getType());
4025 unsigned BW = SmallTy->getElementType()->getPrimitiveSizeInBits();
4026
4027 if (I.getOpcode() == Instruction::LShr) {
4028 // Check that the shift amount is less than the number of bits in the
4029 // smaller type. Otherwise, the smaller lshr will return a poison value.
4030 KnownBits ShAmtKB = computeKnownBits(I.getOperand(1), *DL);
4031 if (ShAmtKB.getMaxValue().uge(BW))
4032 return false;
4033 } else {
4034 // Check that the expression overall uses at most the same number of bits as
4035 // ZExted
4036 KnownBits KB = computeKnownBits(&I, *DL);
4037 if (KB.countMaxActiveBits() > BW)
4038 return false;
4039 }
4040
4041 // Calculate costs of leaving current IR as it is and moving ZExt operation
4042 // later, along with adding truncates if needed
4043 InstructionCost ZExtCost = TTI.getCastInstrCost(
4044 Instruction::ZExt, BigTy, SmallTy,
4045 TargetTransformInfo::CastContextHint::None, CostKind);
4046 InstructionCost CurrentCost = ZExtCost;
4047 InstructionCost ShrinkCost = 0;
4048
4049 // Calculate total cost and check that we can propagate through all ZExt users
4050 for (User *U : ZExtOperand->users()) {
4051 auto *UI = cast<Instruction>(U);
4052 if (UI == &I) {
4053 CurrentCost +=
4054 TTI.getArithmeticInstrCost(UI->getOpcode(), BigTy, CostKind);
4055 ShrinkCost +=
4056 TTI.getArithmeticInstrCost(UI->getOpcode(), SmallTy, CostKind);
4057 ShrinkCost += ZExtCost;
4058 continue;
4059 }
4060
4061 if (!Instruction::isBinaryOp(UI->getOpcode()))
4062 return false;
4063
4064 // Check if we can propagate ZExt through its other users
4065 KnownBits KB = computeKnownBits(UI, *DL);
4066 if (KB.countMaxActiveBits() > BW)
4067 return false;
4068
4069 CurrentCost += TTI.getArithmeticInstrCost(UI->getOpcode(), BigTy, CostKind);
4070 ShrinkCost +=
4071 TTI.getArithmeticInstrCost(UI->getOpcode(), SmallTy, CostKind);
4072 ShrinkCost += ZExtCost;
4073 }
4074
4075 // If the other instruction operand is not a constant, we'll need to
4076 // generate a truncate instruction. So we have to adjust cost
4077 if (!isa<Constant>(OtherOperand))
4078 ShrinkCost += TTI.getCastInstrCost(
4079 Instruction::Trunc, SmallTy, BigTy,
4080 TargetTransformInfo::CastContextHint::None, CostKind);
4081
4082 // If the cost of shrinking types and leaving the IR is the same, we'll lean
4083 // towards modifying the IR because shrinking opens opportunities for other
4084 // shrinking optimisations.
4085 if (ShrinkCost > CurrentCost)
4086 return false;
4087
4088 Builder.SetInsertPoint(&I);
4089 Value *Op0 = ZExted;
4090 Value *Op1 = Builder.CreateTrunc(OtherOperand, SmallTy);
4091 // Keep the order of operands the same
4092 if (I.getOperand(0) == OtherOperand)
4093 std::swap(Op0, Op1);
4094 Value *NewBinOp =
4095 Builder.CreateBinOp((Instruction::BinaryOps)I.getOpcode(), Op0, Op1);
4096 cast<Instruction>(NewBinOp)->copyIRFlags(&I);
4097 cast<Instruction>(NewBinOp)->copyMetadata(I);
4098 Value *NewZExtr = Builder.CreateZExt(NewBinOp, BigTy);
4099 replaceValue(I, *NewZExtr);
4100 return true;
4101}
4102
4103/// insert (DstVec, (extract SrcVec, ExtIdx), InsIdx) -->
4104/// shuffle (DstVec, SrcVec, Mask)
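/// For example (illustrative only; subject to the target cost model), with
/// ExtIdx = 2 and InsIdx = 1 on <4 x i32> vectors:
///   %e = extractelement <4 x i32> %src, i64 2
///   %i = insertelement <4 x i32> %dst, i32 %e, i64 1
/// -->
///   %i = shufflevector <4 x i32> %dst, <4 x i32> %src, <4 x i32> <i32 0, i32 6, i32 2, i32 3>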
4105bool VectorCombine::foldInsExtVectorToShuffle(Instruction &I) {
4106 Value *DstVec, *SrcVec;
4107 uint64_t ExtIdx, InsIdx;
4108 if (!match(&I,
4109 m_InsertElt(m_Value(DstVec),
4110 m_ExtractElt(m_Value(SrcVec), m_ConstantInt(ExtIdx)),
4111 m_ConstantInt(InsIdx))))
4112 return false;
4113
4114 auto *DstVecTy = dyn_cast<FixedVectorType>(I.getType());
4115 auto *SrcVecTy = dyn_cast<FixedVectorType>(SrcVec->getType());
4116 // We can try combining vectors with different element sizes.
4117 if (!DstVecTy || !SrcVecTy ||
4118 SrcVecTy->getElementType() != DstVecTy->getElementType())
4119 return false;
4120
4121 unsigned NumDstElts = DstVecTy->getNumElements();
4122 unsigned NumSrcElts = SrcVecTy->getNumElements();
4123 if (InsIdx >= NumDstElts || ExtIdx >= NumSrcElts || NumDstElts == 1)
4124 return false;
4125
4126 // Insertion into poison is a cheaper single operand shuffle.
4127 TTI::ShuffleKind SK = TTI::SK_PermuteTwoSrc;
4128 SmallVector<int> Mask(NumDstElts, PoisonMaskElem);
4129
4130 bool NeedExpOrNarrow = NumSrcElts != NumDstElts;
4131 bool IsExtIdxInBounds = ExtIdx < NumDstElts;
4132 bool NeedDstSrcSwap = isa<PoisonValue>(DstVec) && !isa<UndefValue>(SrcVec);
4133 if (NeedDstSrcSwap) {
4134 SK = TTI::SK_PermuteSingleSrc;
4135 if (!IsExtIdxInBounds && NeedExpOrNarrow)
4136 Mask[InsIdx] = 0;
4137 else
4138 Mask[InsIdx] = ExtIdx;
4139 std::swap(DstVec, SrcVec);
4140 } else {
4142 std::iota(Mask.begin(), Mask.end(), 0);
4143 if (!IsExtIdxInBounds && NeedExpOrNarrow)
4144 Mask[InsIdx] = NumDstElts;
4145 else
4146 Mask[InsIdx] = ExtIdx + NumDstElts;
4147 }
4148
4149 // Cost
4150 auto *Ins = cast<InsertElementInst>(&I);
4151 auto *Ext = cast<ExtractElementInst>(I.getOperand(1));
4152 InstructionCost InsCost =
4153 TTI.getVectorInstrCost(*Ins, DstVecTy, CostKind, InsIdx);
4154 InstructionCost ExtCost =
4155 TTI.getVectorInstrCost(*Ext, DstVecTy, CostKind, ExtIdx);
4156 InstructionCost OldCost = ExtCost + InsCost;
4157
4158 InstructionCost NewCost = 0;
4159 SmallVector<int> ExtToVecMask;
4160 if (!NeedExpOrNarrow) {
4161 // Ignore 'free' identity insertion shuffle.
4162 // TODO: getShuffleCost should return TCC_Free for Identity shuffles.
4163 if (!ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts))
4164 NewCost += TTI.getShuffleCost(SK, DstVecTy, DstVecTy, Mask, CostKind, 0,
4165 nullptr, {DstVec, SrcVec});
4166 } else {
4167 // When creating a length-changing vector, always create it with a mask whose
4168 // first element is ExtIdx, so that the first element of the vector being
4169 // created is always the target element to be extracted.
4170 ExtToVecMask.assign(NumDstElts, PoisonMaskElem);
4171 if (IsExtIdxInBounds)
4172 ExtToVecMask[ExtIdx] = ExtIdx;
4173 else
4174 ExtToVecMask[0] = ExtIdx;
4175 // Add cost for expanding or narrowing
4176 NewCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc,
4177 DstVecTy, SrcVecTy, ExtToVecMask, CostKind);
4178 NewCost += TTI.getShuffleCost(SK, DstVecTy, DstVecTy, Mask, CostKind);
4179 }
4180
4181 if (!Ext->hasOneUse())
4182 NewCost += ExtCost;
4183
4184 LLVM_DEBUG(dbgs() << "Found a insert/extract shuffle-like pair: " << I
4185 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
4186 << "\n");
4187
4188 if (OldCost < NewCost)
4189 return false;
4190
4191 if (NeedExpOrNarrow) {
4192 if (!NeedDstSrcSwap)
4193 SrcVec = Builder.CreateShuffleVector(SrcVec, ExtToVecMask);
4194 else
4195 DstVec = Builder.CreateShuffleVector(DstVec, ExtToVecMask);
4196 }
4197
4198 // Canonicalize undef param to RHS to help further folds.
4199 if (isa<UndefValue>(DstVec) && !isa<UndefValue>(SrcVec)) {
4200 ShuffleVectorInst::commuteShuffleMask(Mask, NumDstElts);
4201 std::swap(DstVec, SrcVec);
4202 }
4203
4204 Value *Shuf = Builder.CreateShuffleVector(DstVec, SrcVec, Mask);
4205 replaceValue(I, *Shuf);
4206
4207 return true;
4208}
4209
4210/// If we're interleaving 2 constant splats, for instance `<vscale x 8 x i32>
4211/// <splat of 666>` and `<vscale x 8 x i32> <splat of 777>`, we can create a
4212/// larger splat `<vscale x 8 x i64> <splat of ((777 << 32) | 666)>` first
4213/// before casting it back into `<vscale x 16 x i32>`.
4214bool VectorCombine::foldInterleaveIntrinsics(Instruction &I) {
4215 const APInt *SplatVal0, *SplatVal1;
4216 if (!match(&I, m_Intrinsic<Intrinsic::vector_interleave2>(
4217 m_APInt(SplatVal0), m_APInt(SplatVal1))))
4218 return false;
4219
4220 LLVM_DEBUG(dbgs() << "VC: Folding interleave2 with two splats: " << I
4221 << "\n");
4222
4223 auto *VTy =
4224 cast<VectorType>(cast<IntrinsicInst>(I).getArgOperand(0)->getType());
4225 auto *ExtVTy = VectorType::getExtendedElementVectorType(VTy);
4226 unsigned Width = VTy->getElementType()->getIntegerBitWidth();
4227
4228 // Just in case the costs of the interleave2 intrinsic and the bitcast are
4229 // both invalid, in which case we want to bail out, we use <= rather
4230 // than < here. Even if they both have valid and equal costs, it's probably
4231 // not a good idea to emit a high-cost constant splat.
4232 if (TTI.getInstructionCost(&I, CostKind) <=
4233 TTI.getCastInstrCost(Instruction::BitCast, I.getType(), ExtVTy,
4234 TTI::CastContextHint::None, CostKind)) {
4235 LLVM_DEBUG(dbgs() << "VC: The cost to cast from " << *ExtVTy << " to "
4236 << *I.getType() << " is too high.\n");
4237 return false;
4238 }
4239
4240 APInt NewSplatVal = SplatVal1->zext(Width * 2);
4241 NewSplatVal <<= Width;
4242 NewSplatVal |= SplatVal0->zext(Width * 2);
4243 auto *NewSplat = ConstantVector::getSplat(
4244 ExtVTy->getElementCount(), ConstantInt::get(F.getContext(), NewSplatVal));
4245
4246 IRBuilder<> Builder(&I);
4247 replaceValue(I, *Builder.CreateBitCast(NewSplat, I.getType()));
4248 return true;
4249}
4250
4251// Attempt to shrink loads that are only used by shufflevector instructions.
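// For example (an illustrative sketch; only done when the cost model considers
// the narrower load cheaper):
//   %l = load <8 x i32>, ptr %p
//   %s = shufflevector <8 x i32> %l, <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// Only elements 0..3 of %l are referenced, so the load can be shrunk:
//   %l2 = load <4 x i32>, ptr %p
//   %s2 = shufflevector <4 x i32> %l2, <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>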
4252bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
4253 auto *OldLoad = dyn_cast<LoadInst>(&I);
4254 if (!OldLoad || !OldLoad->isSimple())
4255 return false;
4256
4257 auto *OldLoadTy = dyn_cast<FixedVectorType>(OldLoad->getType());
4258 if (!OldLoadTy)
4259 return false;
4260
4261 unsigned const OldNumElements = OldLoadTy->getNumElements();
4262
4263 // Search all uses of load. If all uses are shufflevector instructions, and
4264 // the second operands are all poison values, find the minimum and maximum
4265 // indices of the vector elements referenced by all shuffle masks.
4266 // Otherwise return `std::nullopt`.
4267 using IndexRange = std::pair<int, int>;
4268 auto GetIndexRangeInShuffles = [&]() -> std::optional<IndexRange> {
4269 IndexRange OutputRange = IndexRange(OldNumElements, -1);
4270 for (llvm::Use &Use : I.uses()) {
4271 // Ensure all uses match the required pattern.
4272 User *Shuffle = Use.getUser();
4273 ArrayRef<int> Mask;
4274
4275 if (!match(Shuffle,
4276 m_Shuffle(m_Specific(OldLoad), m_Undef(), m_Mask(Mask))))
4277 return std::nullopt;
4278
4279 // Ignore shufflevector instructions that have no uses.
4280 if (Shuffle->use_empty())
4281 continue;
4282
4283 // Find the min and max indices used by the shufflevector instruction.
4284 for (int Index : Mask) {
4285 if (Index >= 0 && Index < static_cast<int>(OldNumElements)) {
4286 OutputRange.first = std::min(Index, OutputRange.first);
4287 OutputRange.second = std::max(Index, OutputRange.second);
4288 }
4289 }
4290 }
4291
4292 if (OutputRange.second < OutputRange.first)
4293 return std::nullopt;
4294
4295 return OutputRange;
4296 };
4297
4298 // Get the range of vector elements used by shufflevector instructions.
4299 if (std::optional<IndexRange> Indices = GetIndexRangeInShuffles()) {
4300 unsigned const NewNumElements = Indices->second + 1u;
4301
4302 // If the range of vector elements is smaller than the full load, attempt
4303 // to create a smaller load.
4304 if (NewNumElements < OldNumElements) {
4305 IRBuilder Builder(&I);
4306 Builder.SetCurrentDebugLocation(I.getDebugLoc());
4307
4308 // Calculate costs of old and new ops.
4309 Type *ElemTy = OldLoadTy->getElementType();
4310 FixedVectorType *NewLoadTy = FixedVectorType::get(ElemTy, NewNumElements);
4311 Value *PtrOp = OldLoad->getPointerOperand();
4312
4313 InstructionCost OldCost = TTI.getMemoryOpCost(
4314 Instruction::Load, OldLoad->getType(), OldLoad->getAlign(),
4315 OldLoad->getPointerAddressSpace(), CostKind);
4316 InstructionCost NewCost =
4317 TTI.getMemoryOpCost(Instruction::Load, NewLoadTy, OldLoad->getAlign(),
4318 OldLoad->getPointerAddressSpace(), CostKind);
4319
4320 using UseEntry = std::pair<ShuffleVectorInst *, std::vector<int>>;
4321 SmallVector<UseEntry, 4u> NewUses;
4322 unsigned const MaxIndex = NewNumElements * 2u;
4323
4324 for (llvm::Use &Use : I.uses()) {
4325 auto *Shuffle = cast<ShuffleVectorInst>(Use.getUser());
4326 ArrayRef<int> OldMask = Shuffle->getShuffleMask();
4327
4328 // Create entry for new use.
4329 NewUses.push_back({Shuffle, OldMask});
4330
4331 // Validate mask indices.
4332 for (int Index : OldMask) {
4333 if (Index >= static_cast<int>(MaxIndex))
4334 return false;
4335 }
4336
4337 // Update costs.
4338 OldCost +=
4339 TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, Shuffle->getType(),
4340 OldLoadTy, OldMask, CostKind);
4341 NewCost +=
4342 TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, Shuffle->getType(),
4343 NewLoadTy, OldMask, CostKind);
4344 }
4345
4346 LLVM_DEBUG(
4347 dbgs() << "Found a load used only by shufflevector instructions: "
4348 << I << "\n OldCost: " << OldCost
4349 << " vs NewCost: " << NewCost << "\n");
4350
4351 if (OldCost < NewCost || !NewCost.isValid())
4352 return false;
4353
4354 // Create new load of smaller vector.
4355 auto *NewLoad = cast<LoadInst>(
4356 Builder.CreateAlignedLoad(NewLoadTy, PtrOp, OldLoad->getAlign()));
4357 NewLoad->copyMetadata(I);
4358
4359 // Replace all uses.
4360 for (UseEntry &Use : NewUses) {
4361 ShuffleVectorInst *Shuffle = Use.first;
4362 std::vector<int> &NewMask = Use.second;
4363
4364 Builder.SetInsertPoint(Shuffle);
4365 Builder.SetCurrentDebugLocation(Shuffle->getDebugLoc());
4366 Value *NewShuffle = Builder.CreateShuffleVector(
4367 NewLoad, PoisonValue::get(NewLoadTy), NewMask);
4368
4369 replaceValue(*Shuffle, *NewShuffle, false);
4370 }
4371
4372 return true;
4373 }
4374 }
4375 return false;
4376}
4377
4378// Attempt to narrow a phi of shufflevector instructions where the two incoming
4379// values have the same operands but different masks. If the two shuffle masks
4380// are offsets of one another we can use one branch to rotate the incoming
4381// vector and perform one larger shuffle after the phi.
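//
// For example (an illustrative sketch; applied only when the cost model
// agrees), with a <4 x float> input %v and <8 x float> incoming shuffles using
// masks <2, 3, poison, ...> and <0, 1, poison, ...>: the masks differ by a
// constant offset of 2, so the first incoming value can be replaced by a
// rotate of %v with mask <2, 3, 0, 1>, the phi narrowed to <4 x float>, and a
// single shuffle with the second mask performed after the phi.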
4382bool VectorCombine::shrinkPhiOfShuffles(Instruction &I) {
4383 auto *Phi = dyn_cast<PHINode>(&I);
4384 if (!Phi || Phi->getNumIncomingValues() != 2u)
4385 return false;
4386
4387 Value *Op = nullptr;
4388 ArrayRef<int> Mask0;
4389 ArrayRef<int> Mask1;
4390
4391 if (!match(Phi->getOperand(0u),
4392 m_OneUse(m_Shuffle(m_Value(Op), m_Poison(), m_Mask(Mask0)))) ||
4393 !match(Phi->getOperand(1u),
4394 m_OneUse(m_Shuffle(m_Specific(Op), m_Poison(), m_Mask(Mask1)))))
4395 return false;
4396
4397 auto *Shuf = cast<ShuffleVectorInst>(Phi->getOperand(0u));
4398
4399 // Ensure result vectors are wider than the argument vector.
4400 auto *InputVT = cast<FixedVectorType>(Op->getType());
4401 auto *ResultVT = cast<FixedVectorType>(Shuf->getType());
4402 auto const InputNumElements = InputVT->getNumElements();
4403
4404 if (InputNumElements >= ResultVT->getNumElements())
4405 return false;
4406
4407 // Take the difference of the two shuffle masks at each index. Ignore poison
4408 // values at the same index in both masks.
4409 SmallVector<int, 16> NewMask;
4410 NewMask.reserve(Mask0.size());
4411
4412 for (auto [M0, M1] : zip(Mask0, Mask1)) {
4413 if (M0 >= 0 && M1 >= 0)
4414 NewMask.push_back(M0 - M1);
4415 else if (M0 == -1 && M1 == -1)
4416 continue;
4417 else
4418 return false;
4419 }
4420
4421 // Ensure all elements of the new mask are equal. If the difference between
4422 // the incoming mask elements is the same, the two must be constant offsets
4423 // of one another.
4424 if (NewMask.empty() || !all_equal(NewMask))
4425 return false;
4426
4427 // Create new mask using difference of the two incoming masks.
4428 int MaskOffset = NewMask[0u];
4429 unsigned Index = (InputNumElements - MaskOffset) % InputNumElements;
4430 NewMask.clear();
4431
4432 for (unsigned I = 0u; I < InputNumElements; ++I) {
4433 NewMask.push_back(Index);
4434 Index = (Index + 1u) % InputNumElements;
4435 }
4436
4437 // Calculate costs for worst cases and compare.
4438 auto const Kind = TTI::SK_PermuteSingleSrc;
4439 auto OldCost =
4440 std::max(TTI.getShuffleCost(Kind, ResultVT, InputVT, Mask0, CostKind),
4441 TTI.getShuffleCost(Kind, ResultVT, InputVT, Mask1, CostKind));
4442 auto NewCost = TTI.getShuffleCost(Kind, InputVT, InputVT, NewMask, CostKind) +
4443 TTI.getShuffleCost(Kind, ResultVT, InputVT, Mask1, CostKind);
4444
4445 LLVM_DEBUG(dbgs() << "Found a phi of mergeable shuffles: " << I
4446 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
4447 << "\n");
4448
4449 if (NewCost > OldCost)
4450 return false;
4451
4452 // Create new shuffles and narrowed phi.
4453 auto Builder = IRBuilder(Shuf);
4454 Builder.SetCurrentDebugLocation(Shuf->getDebugLoc());
4455 auto *PoisonVal = PoisonValue::get(InputVT);
4456 auto *NewShuf0 = Builder.CreateShuffleVector(Op, PoisonVal, NewMask);
4457 Worklist.push(cast<Instruction>(NewShuf0));
4458
4459 Builder.SetInsertPoint(Phi);
4460 Builder.SetCurrentDebugLocation(Phi->getDebugLoc());
4461 auto *NewPhi = Builder.CreatePHI(NewShuf0->getType(), 2u);
4462 NewPhi->addIncoming(NewShuf0, Phi->getIncomingBlock(0u));
4463 NewPhi->addIncoming(Op, Phi->getIncomingBlock(1u));
4464
4465 Builder.SetInsertPoint(*NewPhi->getInsertionPointAfterDef());
4466 PoisonVal = PoisonValue::get(NewPhi->getType());
4467 auto *NewShuf1 = Builder.CreateShuffleVector(NewPhi, PoisonVal, Mask1);
4468
4469 replaceValue(*Phi, *NewShuf1);
4470 return true;
4471}
4472
4473/// This is the entry point for all transforms. Pass manager differences are
4474/// handled in the callers of this function.
4475bool VectorCombine::run() {
4476 if (DisableVectorCombine)
4477 return false;
4478
4479 // Don't attempt vectorization if the target does not support vectors.
4480 if (!TTI.getNumberOfRegisters(TTI.getRegisterClassForType(/*Vector*/ true)))
4481 return false;
4482
4483 LLVM_DEBUG(dbgs() << "\n\nVECTORCOMBINE on " << F.getName() << "\n");
4484
4485 auto FoldInst = [this](Instruction &I) {
4486 Builder.SetInsertPoint(&I);
4487 bool IsVectorType = isa<VectorType>(I.getType());
4488 bool IsFixedVectorType = isa<FixedVectorType>(I.getType());
4489 auto Opcode = I.getOpcode();
4490
4491 LLVM_DEBUG(dbgs() << "VC: Visiting: " << I << '\n');
4492
4493 // These folds should be beneficial regardless of when this pass is run
4494 // in the optimization pipeline.
4495 // The type checking is for run-time efficiency. We can avoid wasting time
4496 // dispatching to folding functions if there's no chance of matching.
4497 if (IsFixedVectorType) {
4498 switch (Opcode) {
4499 case Instruction::InsertElement:
4500 if (vectorizeLoadInsert(I))
4501 return true;
4502 break;
4503 case Instruction::ShuffleVector:
4504 if (widenSubvectorLoad(I))
4505 return true;
4506 break;
4507 default:
4508 break;
4509 }
4510 }
4511
4512 // This transform works with scalable and fixed vectors
4513 // TODO: Identify and allow other scalable transforms
4514 if (IsVectorType) {
4515 if (scalarizeOpOrCmp(I))
4516 return true;
4517 if (scalarizeLoadExtract(I))
4518 return true;
4519 if (scalarizeExtExtract(I))
4520 return true;
4521 if (scalarizeVPIntrinsic(I))
4522 return true;
4523 if (foldInterleaveIntrinsics(I))
4524 return true;
4525 }
4526
4527 if (Opcode == Instruction::Store)
4528 if (foldSingleElementStore(I))
4529 return true;
4530
4531 // If this is an early pipeline invocation of this pass, we are done.
4532 if (TryEarlyFoldsOnly)
4533 return false;
4534
4535 // Otherwise, try folds that improve codegen but may interfere with
4536 // early IR canonicalizations.
4537 // The type checking is for run-time efficiency. We can avoid wasting time
4538 // dispatching to folding functions if there's no chance of matching.
4539 if (IsFixedVectorType) {
4540 switch (Opcode) {
4541 case Instruction::InsertElement:
4542 if (foldInsExtFNeg(I))
4543 return true;
4544 if (foldInsExtBinop(I))
4545 return true;
4546 if (foldInsExtVectorToShuffle(I))
4547 return true;
4548 break;
4549 case Instruction::ShuffleVector:
4550 if (foldPermuteOfBinops(I))
4551 return true;
4552 if (foldShuffleOfBinops(I))
4553 return true;
4554 if (foldShuffleOfSelects(I))
4555 return true;
4556 if (foldShuffleOfCastops(I))
4557 return true;
4558 if (foldShuffleOfShuffles(I))
4559 return true;
4560 if (foldShuffleOfIntrinsics(I))
4561 return true;
4562 if (foldSelectShuffle(I))
4563 return true;
4564 if (foldShuffleToIdentity(I))
4565 return true;
4566 break;
4567 case Instruction::Load:
4568 if (shrinkLoadForShuffles(I))
4569 return true;
4570 break;
4571 case Instruction::BitCast:
4572 if (foldBitcastShuffle(I))
4573 return true;
4574 break;
4575 case Instruction::And:
4576 case Instruction::Or:
4577 case Instruction::Xor:
4578 if (foldBitOpOfCastops(I))
4579 return true;
4580 if (foldBitOpOfCastConstant(I))
4581 return true;
4582 break;
4583 case Instruction::PHI:
4584 if (shrinkPhiOfShuffles(I))
4585 return true;
4586 break;
4587 default:
4588 if (shrinkType(I))
4589 return true;
4590 break;
4591 }
4592 } else {
4593 switch (Opcode) {
4594 case Instruction::Call:
4595 if (foldShuffleFromReductions(I))
4596 return true;
4597 if (foldCastFromReductions(I))
4598 return true;
4599 break;
4600 case Instruction::ExtractElement:
4601 if (foldShuffleChainsToReduce(I))
4602 return true;
4603 break;
4604 case Instruction::ICmp:
4605 case Instruction::FCmp:
4606 if (foldExtractExtract(I))
4607 return true;
4608 break;
4609 case Instruction::Or:
4610 if (foldConcatOfBoolMasks(I))
4611 return true;
4612 [[fallthrough]];
4613 default:
4614 if (Instruction::isBinaryOp(Opcode)) {
4615 if (foldExtractExtract(I))
4616 return true;
4617 if (foldExtractedCmps(I))
4618 return true;
4619 if (foldBinopOfReductions(I))
4620 return true;
4621 }
4622 break;
4623 }
4624 }
4625 return false;
4626 };
4627
4628 bool MadeChange = false;
4629 for (BasicBlock &BB : F) {
4630 // Ignore unreachable basic blocks.
4631 if (!DT.isReachableFromEntry(&BB))
4632 continue;
4633 // Iterate with a manually maintained "next" pointer so that we can erase
4634 // instructions inside the loop. make_early_inc_range is not applicable here,
4635 // as the next iterator may be invalidated by
4636 // RecursivelyDeleteTriviallyDeadInstructions; we update the saved next
4637 // instruction when it is about to be deleted.
4638 Instruction *I = &BB.front();
4639 while (I) {
4640 NextInst = I->getNextNode();
4641 if (!I->isDebugOrPseudoInst())
4642 MadeChange |= FoldInst(*I);
4643 I = NextInst;
4644 }
4645 }
4646
4647 NextInst = nullptr;
4648
4649 while (!Worklist.isEmpty()) {
4650 Instruction *I = Worklist.removeOne();
4651 if (!I)
4652 continue;
4653
4654 if (isInstructionTriviallyDead(I)) {
4655 eraseInstruction(*I);
4656 continue;
4657 }
4658
4659 MadeChange |= FoldInst(*I);
4660 }
4661
4662 return MadeChange;
4663}
4664
4665PreservedAnalyses VectorCombinePass::run(Function &F,
4666 FunctionAnalysisManager &FAM) {
4667 auto &AC = FAM.getResult<AssumptionAnalysis>(F);
4668 TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(F);
4669 DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
4670 AAResults &AA = FAM.getResult<AAManager>(F);
4671 const DataLayout *DL = &F.getDataLayout();
4672 VectorCombine Combiner(F, TTI, DT, AA, AC, DL, TTI::TCK_RecipThroughput,
4673 TryEarlyFoldsOnly);
4674 if (!Combiner.run())
4675 return PreservedAnalyses::all();
4676 PreservedAnalyses PA;
4677 PA.preserveSet<CFGAnalyses>();
4678 return PA;
4679}
static Value * generateNewInstTree(ArrayRef< InstLane > Item, FixedVectorType *Ty, const SmallPtrSet< Use *, 4 > &IdentityLeafs, const SmallPtrSet< Use *, 4 > &SplatLeafs, const SmallPtrSet< Use *, 4 > &ConcatLeafs, IRBuilderBase &Builder, const TargetTransformInfo *TTI)
static bool isFreeConcat(ArrayRef< InstLane > Item, TTI::TargetCostKind CostKind, const TargetTransformInfo &TTI)
Detect concat of multiple values into a vector.
static void analyzeCostOfVecReduction(const IntrinsicInst &II, TTI::TargetCostKind CostKind, const TargetTransformInfo &TTI, InstructionCost &CostBeforeReduction, InstructionCost &CostAfterReduction)
static SmallVector< InstLane > generateInstLaneVectorFromOperand(ArrayRef< InstLane > Item, int Op)
static Value * createShiftShuffle(Value *Vec, unsigned OldIndex, unsigned NewIndex, IRBuilderBase &Builder)
Create a shuffle that translates (shifts) 1 element from the input vector to a new element location.
static Align computeAlignmentAfterScalarization(Align VectorAlignment, Type *ScalarType, Value *Idx, const DataLayout &DL)
The memory operation on a vector of ScalarType had alignment of VectorAlignment.
static bool feedsIntoVectorReduction(ShuffleVectorInst *SVI)
Returns true if this ShuffleVectorInst eventually feeds into a vector reduction intrinsic (e....
static ScalarizationResult canScalarizeAccess(VectorType *VecTy, Value *Idx, Instruction *CtxI, AssumptionCache &AC, const DominatorTree &DT)
Check if it is legal to scalarize a memory access to VecTy at index Idx.
static cl::opt< bool > DisableVectorCombine("disable-vector-combine", cl::init(false), cl::Hidden, cl::desc("Disable all vector combine transforms"))
static InstLane lookThroughShuffles(Use *U, int Lane)
static bool canWidenLoad(LoadInst *Load, const TargetTransformInfo &TTI)
static const unsigned InvalidIndex
std::pair< Use *, int > InstLane
static Value * translateExtract(ExtractElementInst *ExtElt, unsigned NewIndex, IRBuilderBase &Builder)
Given an extract element instruction with constant index operand, shuffle the source vector (shift th...
static cl::opt< unsigned > MaxInstrsToScan("vector-combine-max-scan-instrs", cl::init(30), cl::Hidden, cl::desc("Max number of instructions to scan for vector combining."))
static cl::opt< bool > DisableBinopExtractShuffle("disable-binop-extract-shuffle", cl::init(false), cl::Hidden, cl::desc("Disable binop extract to shuffle transforms"))
static bool isMemModifiedBetween(BasicBlock::iterator Begin, BasicBlock::iterator End, const MemoryLocation &Loc, AAResults &AA)
static constexpr int Concat[]
Value * RHS
Value * LHS
A manager for alias analyses.
Class for arbitrary precision integers.
Definition APInt.h:78
LLVM_ABI APInt zext(unsigned width) const
Zero extend to a new width.
Definition APInt.cpp:1012
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition APInt.h:239
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1221
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
const T & front() const
front - Get the first element.
Definition ArrayRef.h:150
size_t size() const
size - Get the array size.
Definition ArrayRef.h:147
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
LLVM_ABI bool hasAttribute(Attribute::AttrKind Kind) const
Return true if the attribute exists in this set.
InstListType::iterator iterator
Instruction iterators...
Definition BasicBlock.h:170
BinaryOps getOpcode() const
Definition InstrTypes.h:374
Represents analyses that only rely on functions' control flow.
Definition Analysis.h:73
Value * getArgOperand(unsigned i) const
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
static LLVM_ABI CastInst * Create(Instruction::CastOps, Value *S, Type *Ty, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Provides a way to construct any of the CastInst subclasses using an opcode instead of the subclass's ...
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Definition InstrTypes.h:984
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:678
bool isFPPredicate() const
Definition InstrTypes.h:784
static LLVM_ABI std::optional< CmpPredicate > getMatching(CmpPredicate A, CmpPredicate B)
Compares two CmpPredicates taking samesign into account and returns the canonicalized CmpPredicate if...
Combiner implementation.
Definition Combiner.h:34
static LLVM_ABI Constant * getExtractElement(Constant *Vec, Constant *Idx, Type *OnlyIfReducedTy=nullptr)
This is the shared class of boolean and integer constants.
Definition Constants.h:87
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition Constants.h:154
This class represents a range of values.
LLVM_ABI ConstantRange urem(const ConstantRange &Other) const
Return a new range representing the possible values resulting from an unsigned remainder operation of...
LLVM_ABI ConstantRange binaryAnd(const ConstantRange &Other) const
Return a new range representing the possible values resulting from a binary-and of a value in this ra...
LLVM_ABI bool contains(const APInt &Val) const
Return true if the specified value is in the set.
static LLVM_ABI Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
static LLVM_ABI Constant * get(ArrayRef< Constant * > V)
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63
Analysis pass which computes a DominatorTree.
Definition Dominators.h:284
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:165
LLVM_ABI bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
This instruction extracts a single (scalar) element from a VectorType value.
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:22
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static FixedVectorType * getDoubleElementsVectorType(FixedVectorType *VTy)
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:803
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2571
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2559
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Definition IRBuilder.h:1864
LLVM_ABI Value * CreateSelectFMF(Value *C, Value *True, Value *False, FMFSource FMFSource, const Twine &Name="", Instruction *MDFrom=nullptr)
LLVM_ABI Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains.
LLVM_ABI Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Value * CreateFreeze(Value *V, const Twine &Name="")
Definition IRBuilder.h:2637
Value * CreateLShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition IRBuilder.h:1513
Value * CreateCast(Instruction::CastOps Op, Value *V, Type *DestTy, const Twine &Name="", MDNode *FPMathTag=nullptr, FMFSource FMFSource={})
Definition IRBuilder.h:2238
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Definition IRBuilder.h:247
Value * CreateInBoundsGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="")
Definition IRBuilder.h:1931
Value * CreatePointerBitCastOrAddrSpaceCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2263
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition IRBuilder.h:527
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Definition IRBuilder.h:522
Value * CreateCmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:2463
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition IRBuilder.h:2494
InstTy * Insert(InstTy *I, const Twine &Name="") const
Insert and return the specified instruction.
Definition IRBuilder.h:172
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2204
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool...
Definition IRBuilder.h:1847
Value * CreateShl(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1492
LLVM_ABI Value * CreateNAryOp(unsigned Opc, ArrayRef< Value * > Ops, const Twine &Name="", MDNode *FPMathTag=nullptr)
Create either a UnaryOperator or BinaryOperator depending on Opc.
Value * CreateZExt(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition IRBuilder.h:2082
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition IRBuilder.h:2593
Value * CreateAnd(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:1551
StoreInst * CreateStore(Value *Val, Value *Ptr, bool isVolatile=false)
Definition IRBuilder.h:1860
Value * CreateTrunc(Value *V, Type *DestTy, const Twine &Name="", bool IsNUW=false, bool IsNSW=false)
Definition IRBuilder.h:2068
PointerType * getPtrTy(unsigned AddrSpace=0)
Fetch the type representing a pointer.
Definition IRBuilder.h:605
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:1708
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition IRBuilder.h:207
Value * CreateFNegFMF(Value *V, FMFSource FMFSource, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:1795
Value * CreateOr(Value *LHS, Value *RHS, const Twine &Name="", bool IsDisjoint=false)
Definition IRBuilder.h:1573
InstSimplifyFolder - Use InstructionSimplify to fold operations to existing values.
void push(Instruction *I)
Push the instruction onto the worklist stack.
LLVM_ABI void copyIRFlags(const Value *V, bool IncludeWrapFlags=true)
Convenience method to copy supported exact, fast-math, and (optionally) wrapping flags from V to this...
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
LLVM_ABI void andIRFlags(const Value *V)
Logical 'and' of any supported wrapping, exact, and fast-math flags of V and this instruction.
bool isBinaryOp() const
LLVM_ABI bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instructi...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
bool isIntDivRem() const
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:319
A wrapper class for inspecting calls to intrinsic functions.
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
An instruction for reading from memory.
Representation for a specific memory location.
static LLVM_ABI MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
Definition Analysis.h:151
const SDValue & getOperand(unsigned Num) const
This instruction constructs a fixed permutation of two input vectors.
int getMaskValue(unsigned Elt) const
Return the shuffle mask value of this instruction for the given element index.
VectorType * getType() const
Overload to return most specific vector type.
static LLVM_ABI void getShuffleMask(const Constant *Mask, SmallVectorImpl< int > &Result)
Convert the input shuffle mask operand to a vector of integers.
static LLVM_ABI bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
static void commuteShuffleMask(MutableArrayRef< int > Mask, unsigned InVecNumElts)
Change values in a shuffle permute mask assuming the two vector operands of length InVecNumElts have ...
size_type size() const
Definition SmallPtrSet.h:99
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
void assign(size_type NumElts, ValueParamT Elt)
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
void setAlignment(Align Align)
Analysis pass providing the TargetTransformInfo.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
LLVM_ABI InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index=-1, const Value *Op0=nullptr, const Value *Op1=nullptr) const
LLVM_ABI InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}) const
Estimate the overhead of scalarizing an instruction.
LLVM_ABI InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo Op1Info={OK_AnyValue, OP_None}, OperandValueInfo Op2Info={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
LLVM_ABI TypeSize getRegisterBitWidth(RegisterKind K) const
LLVM_ABI InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo OpdInfo={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
LLVM_ABI bool allowVectorElementIndexingUsingGEP() const
Returns true if GEP should not be used to index into vectors for this target.
LLVM_ABI InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const
LLVM_ABI InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
LLVM_ABI InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of vector reduction intrinsics.
LLVM_ABI InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
LLVM_ABI unsigned getRegisterClassForType(bool Vector, Type *Ty=nullptr) const
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
LLVM_ABI InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr, const TargetLibraryInfo *TLibInfo=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
LLVM_ABI unsigned getMinVectorRegisterBitWidth() const
LLVM_ABI InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, const SCEV *Ptr, TTI::TargetCostKind CostKind) const
LLVM_ABI unsigned getNumberOfRegisters(unsigned ClassID) const
LLVM_ABI InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ None
The cast is not used with a load/store of any kind.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:198
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:128
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:231
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
op_range operands()
Definition User.h:292
Value * getOperand(unsigned i) const
Definition User.h:232
static LLVM_ABI bool isVPBinOp(Intrinsic::ID ID)
std::optional< unsigned > getFunctionalIntrinsicID() const
std::optional< unsigned > getFunctionalOpcode() const
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
const Value * stripAndAccumulateInBoundsConstantOffsets(const DataLayout &DL, APInt &Offset) const
This is a wrapper around stripAndAccumulateConstantOffsets with the in-bounds requirement set to fals...
Definition Value.h:759
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:546
iterator_range< user_iterator > users()
Definition Value.h:426
LLVM_ABI Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition Value.cpp:956
unsigned getValueID() const
Return an ID for the concrete type of this object.
Definition Value.h:543
LLVM_ABI bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition Value.cpp:150
bool use_empty() const
Definition Value.h:346
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
PreservedAnalyses run(Function &F, FunctionAnalysisManager &)
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
Definition ilist_node.h:359
Abstract Attribute helper functions.
Definition Attributor.h:165
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char Attrs[]
Key for Kernel::Metadata::mAttrs.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
LLVM_ABI AttributeSet getFnAttributes(LLVMContext &C, ID id)
Return the function attributes for an intrinsic.
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
class_match< PoisonValue > m_Poison()
Match an arbitrary poison constant.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
BinaryOp_match< LHS, RHS, Instruction::URem > m_URem(const LHS &L, const RHS &R)
class_match< Constant > m_Constant()
Match an arbitrary Constant and ignore it.
CastInst_match< OpTy, TruncInst > m_Trunc(const OpTy &Op)
Matches Trunc.
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
bool match(Val *V, const Pattern &P)
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
DisjointOr_match< LHS, RHS > m_DisjointOr(const LHS &L, const RHS &R)
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
match_combine_and< LTy, RTy > m_CombineAnd(const LTy &L, const RTy &R)
Combine two pattern matchers matching L && R.
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
TwoOps_match< V1_t, V2_t, Instruction::ShuffleVector > m_Shuffle(const V1_t &v1, const V2_t &v2)
Matches ShuffleVectorInst independently of mask value.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
BinOpPred_match< LHS, RHS, is_bitwiselogic_op, true > m_c_BitwiseLogic(const LHS &L, const RHS &R)
Matches bitwise logic operations in either order.
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
CastOperator_match< OpTy, Instruction::BitCast > m_BitCast(const OpTy &Op)
Matches BitCast.
match_combine_or< CastInst_match< OpTy, SExtInst >, NNegZExt_match< OpTy > > m_SExtLike(const OpTy &Op)
Match either "sext" or "zext nneg".
apint_match m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::LShr > m_LShr(const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
FNeg_match< OpTy > m_FNeg(const OpTy &X)
Match 'fneg X' as 'fsub -0.0, X'.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_Undef()
Match an arbitrary undef constant.
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
ThreeOps_match< Val_t, Elt_t, Idx_t, Instruction::InsertElement > m_InsertElt(const Val_t &Val, const Elt_t &Elt, const Idx_t &Idx)
Matches InsertElementInst.
initializer< Ty > init(const Ty &Val)
PointerTypeMap run(const Module &M)
Compute the PointerTypeMap for the module M.
@ User
could "use" a pointer
NodeAddr< PhiNode * > Phi
Definition RDFGraph.h:390
NodeAddr< UseNode * > Use
Definition RDFGraph.h:385
friend class Instruction
Iterator for Instructions in a `BasicBlock.
Definition BasicBlock.h:73
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:310
@ Offset
Definition DWP.cpp:477
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
Definition STLExtras.h:823
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
void stable_sort(R &&Range)
Definition STLExtras.h:2040
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1700
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1707
LLVM_ABI bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Definition Local.cpp:533
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
Definition ScopeExit.h:59
LLVM_ABI SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2454
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:649
unsigned Log2_64_Ceil(uint64_t Value)
Return the ceil log base 2 of the specified value, 64 if the value is zero.
Definition MathExtras.h:361
LLVM_ABI Value * simplifyUnOp(unsigned Opcode, Value *Op, const SimplifyQuery &Q)
Given operand for a UnaryOperator, fold the result or return null.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
LLVM_ABI unsigned getArithmeticReductionInstruction(Intrinsic::ID RdxID)
Returns the arithmetic instruction opcode used when expanding a reduction.
constexpr bool isUIntN(unsigned N, uint64_t x)
Checks if an unsigned integer fits into the given (dynamic) bit width.
Definition MathExtras.h:252
LLVM_ABI Value * simplifyCall(CallBase *Call, Value *Callee, ArrayRef< Value * > Args, const SimplifyQuery &Q)
Given a callsite, callee, and arguments, fold the result or return null.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:626
LLVM_ABI bool mustSuppressSpeculation(const LoadInst &LI)
Return true if speculation of the given load must be suppressed to avoid ordering or interfering with...
Definition Loads.cpp:416
LLVM_ABI bool widenShuffleMaskElts(int Scale, ArrayRef< int > Mask, SmallVectorImpl< int > &ScaledMask)
Try to transform a shuffle mask by replacing elements with the scaled index for an equivalent mask of...
LLVM_ABI bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true, bool IgnoreUBImplyingAttrs=true)
Return true if the instruction does not have any effects besides calculating the result and does not ...
LLVM_ABI Value * getSplatValue(const Value *V)
Get splat value if the input is a splat vector or return nullptr.
LLVM_ABI ConstantRange computeConstantRange(const Value *V, bool ForSigned, bool UseInstrInfo=true, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Determine the possible constant range of an integer or vector of integer value.
unsigned M1(unsigned Val)
Definition VE.h:377
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1714
LLVM_ABI bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction is not used, and the instruction will return.
Definition Local.cpp:402
LLVM_ABI bool isSplatValue(const Value *V, int Index=-1, unsigned Depth=0)
Return true if each element of the vector value V is poisoned or equal to every other non-poisoned el...
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:288
bool isModSet(const ModRefInfo MRI)
Definition ModRef.h:49
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1632
LLVM_ABI void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
LLVM_ABI bool isSafeToLoadUnconditionally(Value *V, Align Alignment, const APInt &Size, const DataLayout &DL, Instruction *ScanFrom, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr)
Return true if we know that executing a load from this value cannot trap.
Definition Loads.cpp:431
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:548
LLVM_ABI void propagateIRFlags(Value *I, ArrayRef< Value * > VL, Value *OpValue=nullptr, bool IncludeWrapFlags=true)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) tha...
LLVM_ABI bool isKnownNonZero(const Value *V, const SimplifyQuery &Q, unsigned Depth=0)
Return true if the given value is known to be non-zero when defined.
MutableArrayRef(T &OneElt) -> MutableArrayRef< T >
constexpr int PoisonMaskElem
LLVM_ABI bool isSafeToSpeculativelyExecuteWithOpcode(unsigned Opcode, const Instruction *Inst, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true, bool IgnoreUBImplyingAttrs=true)
This returns the same result as isSafeToSpeculativelyExecute if Opcode is the actual opcode of Inst.
@ Other
Any other memory.
Definition ModRef.h:68
TargetTransformInfo TTI
IRBuilder(LLVMContext &, FolderTy, InserterTy, MDNode *, ArrayRef< OperandBundleDef >) -> IRBuilder< FolderTy, InserterTy >
LLVM_ABI Value * simplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS, const SimplifyQuery &Q)
Given operands for a BinaryOperator, fold the result or return null.
LLVM_ABI void narrowShuffleMaskElts(int Scale, ArrayRef< int > Mask, SmallVectorImpl< int > &ScaledMask)
Replace each shuffle mask index with the scaled sequential indices for an equivalent mask of narrowed...
LLVM_ABI Intrinsic::ID getReductionForBinop(Instruction::BinaryOps Opc)
Returns the reduction intrinsic id corresponding to the binary operation.
@ And
Bitwise or logical AND of integers.
LLVM_ABI bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic has a scalar operand.
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
Definition VE.h:376
constexpr unsigned BitWidth
LLVM_ABI Constant * getLosslessInvCast(Constant *C, Type *InvCastTo, unsigned CastOp, const DataLayout &DL, PreservedCastFlags *Flags=nullptr)
Try to cast C to InvC losslessly, satisfying CastOp(InvC) equals C, or CastOp(InvC) is a refined valu...
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:565
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1740
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1879
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:212
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list.
Definition STLExtras.h:2090
LLVM_ABI Value * simplifyCmpInst(CmpPredicate Predicate, Value *LHS, Value *RHS, const SimplifyQuery &Q)
Given operands for a CmpInst, fold the result or return null.
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
LLVM_ABI bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
LLVM_ABI bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
LLVM_ABI Intrinsic::ID getMinMaxReductionIntrinsicID(Intrinsic::ID IID)
Returns the llvm.vector.reduce min/max intrinsic that corresponds to the intrinsic op.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:853
LLVM_ABI AAMDNodes adjustForAccess(unsigned AccessSize)
Create a new AAMDNode for accessing AccessSize bytes of this AAMDNode.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known ...
Definition KnownBits.h:289
APInt getMaxValue() const
Return the maximal unsigned value possible given these KnownBits.
Definition KnownBits.h:138