LLVM 22.0.0git
InterleavedAccessPass.cpp
Go to the documentation of this file.
1//===- InterleavedAccessPass.cpp ------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements the Interleaved Access pass, which identifies
10// interleaved memory accesses and transforms them into target specific
11// intrinsics.
12//
13// An interleaved load reads data from memory into several vectors, with
14// DE-interleaving the data on a factor. An interleaved store writes several
15// vectors to memory with RE-interleaving the data on a factor.
16//
17// As interleaved accesses are difficult to identify in CodeGen (mainly
18// because the VECTOR_SHUFFLE DAG node is quite different from the shufflevector
19// IR), we identify and transform them to intrinsics in this pass so the
20// intrinsics can be easily matched into target specific instructions later in
21// CodeGen.
22//
23// E.g. An interleaved load (Factor = 2):
24// %wide.vec = load <8 x i32>, <8 x i32>* %ptr
25// %v0 = shuffle <8 x i32> %wide.vec, <8 x i32> poison, <0, 2, 4, 6>
26// %v1 = shuffle <8 x i32> %wide.vec, <8 x i32> poison, <1, 3, 5, 7>
27//
28// It could be transformed into a ld2 intrinsic in AArch64 backend or a vld2
29// intrinsic in ARM backend.
30//
31// In X86, this can be further optimized into a set of target
32// specific loads followed by an optimized sequence of shuffles.
33//
34// E.g. An interleaved store (Factor = 3):
35// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
36// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
37// store <12 x i32> %i.vec, <12 x i32>* %ptr
38//
39// It could be transformed into a st3 intrinsic in AArch64 backend or a vst3
40// intrinsic in ARM backend.
41//
42// Similarly, a set of interleaved stores can be transformed into an optimized
43// sequence of shuffles followed by a set of target specific stores for X86.
44//
45//===----------------------------------------------------------------------===//
46
47#include "llvm/ADT/ArrayRef.h"
48#include "llvm/ADT/DenseMap.h"
49#include "llvm/ADT/SetVector.h"
56#include "llvm/IR/Constants.h"
57#include "llvm/IR/Dominators.h"
58#include "llvm/IR/Function.h"
59#include "llvm/IR/IRBuilder.h"
61#include "llvm/IR/Instruction.h"
66#include "llvm/Pass.h"
69#include "llvm/Support/Debug.h"
73#include <cassert>
74#include <utility>
75
76using namespace llvm;
77
78#define DEBUG_TYPE "interleaved-access"
79
81 "lower-interleaved-accesses",
82 cl::desc("Enable lowering interleaved accesses to intrinsics"),
83 cl::init(true), cl::Hidden);
84
85namespace {
86
87class InterleavedAccessImpl {
88 friend class InterleavedAccess;
89
90public:
91 InterleavedAccessImpl() = default;
92 InterleavedAccessImpl(DominatorTree *DT, const TargetLowering *TLI)
93 : DT(DT), TLI(TLI), MaxFactor(TLI->getMaxSupportedInterleaveFactor()) {}
94 bool runOnFunction(Function &F);
95
96private:
97 DominatorTree *DT = nullptr;
98 const TargetLowering *TLI = nullptr;
99
100 /// The maximum supported interleave factor.
101 unsigned MaxFactor = 0u;
102
103 /// Transform an interleaved load into target specific intrinsics.
104 bool lowerInterleavedLoad(Instruction *Load,
105 SmallSetVector<Instruction *, 32> &DeadInsts);
106
107 /// Transform an interleaved store into target specific intrinsics.
108 bool lowerInterleavedStore(Instruction *Store,
109 SmallSetVector<Instruction *, 32> &DeadInsts);
110
111 /// Transform a load and a deinterleave intrinsic into target specific
112 /// instructions.
113 bool lowerDeinterleaveIntrinsic(IntrinsicInst *II,
114 SmallSetVector<Instruction *, 32> &DeadInsts);
115
116 /// Transform an interleave intrinsic and a store into target specific
117 /// instructions.
118 bool lowerInterleaveIntrinsic(IntrinsicInst *II,
119 SmallSetVector<Instruction *, 32> &DeadInsts);
120
121 /// Returns true if the uses of an interleaved load by the
122 /// extractelement instructions in \p Extracts can be replaced by uses of the
123 /// shufflevector instructions in \p Shuffles instead. If so, the necessary
124 /// replacements are also performed.
125 bool tryReplaceExtracts(ArrayRef<ExtractElementInst *> Extracts,
127
128 /// Given a number of shuffles of the form shuffle(binop(x,y)), convert them
129 /// to binop(shuffle(x), shuffle(y)) to allow the formation of an
130 /// interleaving load. Any newly created shuffles that operate on \p LI will
131 /// be added to \p Shuffles. Returns true, if any changes to the IR have been
132 /// made.
133 bool replaceBinOpShuffles(ArrayRef<ShuffleVectorInst *> BinOpShuffles,
134 SmallVectorImpl<ShuffleVectorInst *> &Shuffles,
135 Instruction *LI);
136};
137
138class InterleavedAccess : public FunctionPass {
139 InterleavedAccessImpl Impl;
140
141public:
142 static char ID;
143
144 InterleavedAccess() : FunctionPass(ID) {
146 }
147
148 StringRef getPassName() const override { return "Interleaved Access Pass"; }
149
150 bool runOnFunction(Function &F) override;
151
152 void getAnalysisUsage(AnalysisUsage &AU) const override {
153 AU.addRequired<DominatorTreeWrapperPass>();
154 AU.setPreservesCFG();
155 }
156};
157
158} // end anonymous namespace.
159
162 auto *DT = &FAM.getResult<DominatorTreeAnalysis>(F);
163 auto *TLI = TM->getSubtargetImpl(F)->getTargetLowering();
164 InterleavedAccessImpl Impl(DT, TLI);
165 bool Changed = Impl.runOnFunction(F);
166
167 if (!Changed)
168 return PreservedAnalyses::all();
169
172 return PA;
173}
174
// Legacy pass identifier; its address (not its value) uniquely identifies
// the pass to the pass registry.
char InterleavedAccess::ID = 0;
176
177bool InterleavedAccess::runOnFunction(Function &F) {
178 if (skipFunction(F))
179 return false;
180
181 auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
182 if (!TPC || !LowerInterleavedAccesses)
183 return false;
184
185 LLVM_DEBUG(dbgs() << "*** " << getPassName() << ": " << F.getName() << "\n");
186
187 Impl.DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
188 auto &TM = TPC->getTM<TargetMachine>();
189 Impl.TLI = TM.getSubtargetImpl(F)->getTargetLowering();
190 Impl.MaxFactor = Impl.TLI->getMaxSupportedInterleaveFactor();
191
192 return Impl.runOnFunction(F);
193}
194
196 "Lower interleaved memory accesses to target specific intrinsics", false,
197 false)
200 "Lower interleaved memory accesses to target specific intrinsics", false,
201 false)
202
204 return new InterleavedAccess();
205}
206
207/// Check if the mask is a DE-interleave mask for an interleaved load.
208///
209/// E.g. DE-interleave masks (Factor = 2) could be:
210/// <0, 2, 4, 6> (mask of index 0 to extract even elements)
211/// <1, 3, 5, 7> (mask of index 1 to extract odd elements)
212static bool isDeInterleaveMask(ArrayRef<int> Mask, unsigned &Factor,
213 unsigned &Index, unsigned MaxFactor,
214 unsigned NumLoadElements) {
215 if (Mask.size() < 2)
216 return false;
217
218 // Check potential Factors.
219 for (Factor = 2; Factor <= MaxFactor; Factor++) {
220 // Make sure we don't produce a load wider than the input load.
221 if (Mask.size() * Factor > NumLoadElements)
222 return false;
223 if (ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, Factor, Index))
224 return true;
225 }
226
227 return false;
228}
229
230/// Check if the mask can be used in an interleaved store.
231//
232/// It checks for a more general pattern than the RE-interleave mask.
233/// I.e. <x, y, ... z, x+1, y+1, ...z+1, x+2, y+2, ...z+2, ...>
234/// E.g. For a Factor of 2 (LaneLen=4): <4, 32, 5, 33, 6, 34, 7, 35>
235/// E.g. For a Factor of 3 (LaneLen=4): <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
236/// E.g. For a Factor of 4 (LaneLen=2): <8, 2, 12, 4, 9, 3, 13, 5>
237///
238/// The particular case of an RE-interleave mask is:
239/// I.e. <0, LaneLen, ... , LaneLen*(Factor - 1), 1, LaneLen + 1, ...>
240/// E.g. For a Factor of 2 (LaneLen=4): <0, 4, 1, 5, 2, 6, 3, 7>
241static bool isReInterleaveMask(ShuffleVectorInst *SVI, unsigned &Factor,
242 unsigned MaxFactor) {
243 unsigned NumElts = SVI->getShuffleMask().size();
244 if (NumElts < 4)
245 return false;
246
247 // Check potential Factors.
248 for (Factor = 2; Factor <= MaxFactor; Factor++) {
249 if (SVI->isInterleave(Factor))
250 return true;
251 }
252
253 return false;
254}
255
257 switch (II->getIntrinsicID()) {
258 default:
259 llvm_unreachable("Unexpected intrinsic");
260 case Intrinsic::vp_load:
261 return II->getOperand(1);
262 case Intrinsic::masked_load:
263 return II->getOperand(2);
264 case Intrinsic::vp_store:
265 return II->getOperand(2);
266 case Intrinsic::masked_store:
267 return II->getOperand(3);
268 }
269}
270
// Return a pair of
// (1) The corresponded deinterleaved mask, or nullptr if there is no valid
// mask.
// (2) Some mask effectively skips a certain field, and this element is a mask
// in which inactive lanes represent fields that are skipped (i.e. "gaps").
static std::pair<Value *, APInt> getMask(Value *WideMask, unsigned Factor,
                                         ElementCount LeafValueEC);

// Convenience overload: derive the leaf element count from the leaf value's
// vector type and forward to the ElementCount-based implementation.
static std::pair<Value *, APInt> getMask(Value *WideMask, unsigned Factor,
                                         VectorType *LeafValueTy) {
  return getMask(WideMask, Factor, LeafValueTy->getElementCount());
}
283
// Lower an interleaved load: a plain LoadInst or a masked/vp load intrinsic
// whose users deinterleave the wide vector via shufflevector instructions.
// Returns true if the IR was changed.
//
// NOTE(review): this extraction dropped several original lines (285, 302-303,
// 306, 324, 345, 354, 364): the rest of the signature, the declarations of
// the Shuffles/Extracts/BinOpShuffles/Indices containers, one dyn_cast, and
// two `ShuffleVectorInst::isDeInterleaveMaskOfFactor(` call heads. Each gap
// is flagged in place below; restore from upstream before building.
bool InterleavedAccessImpl::lowerInterleavedLoad(
  // NOTE(review): dropped line — presumably `Instruction *Load,
  // SmallSetVector<Instruction *, 32> &DeadInsts) {` per the declaration.
  // The shuffle-based path only handles fixed-width vectors.
  if (isa<ScalableVectorType>(Load->getType()))
    return false;

  auto *LI = dyn_cast<LoadInst>(Load);
  auto *II = dyn_cast<IntrinsicInst>(Load);
  if (!LI && !II)
    return false;

  // Only simple (non-atomic, non-volatile) plain loads are eligible.
  if (LI && !LI->isSimple())
    return false;

  // Check if all users of this load are shufflevectors. If we encounter any
  // users that are extractelement instructions or binary operators, we save
  // them to later check if they can be modified to extract from one of the
  // shufflevectors instead of the load.

  // NOTE(review): dropped lines — declarations of the Shuffles and Extracts
  // containers filled by the loop below.
  // BinOpShuffles need to be handled a single time in case both operands of the
  // binop are the same load.
  // NOTE(review): dropped line — declaration of the BinOpShuffles set-vector.

  for (auto *User : Load->users()) {
    auto *Extract = dyn_cast<ExtractElementInst>(User);
    if (Extract && isa<ConstantInt>(Extract->getIndexOperand())) {
      Extracts.push_back(Extract);
      continue;
    }
    if (auto *BI = dyn_cast<BinaryOperator>(User)) {
      // A binop user is tolerated only if all of its own users are
      // single-source shuffles; those get rewritten by replaceBinOpShuffles.
      if (!BI->user_empty() && all_of(BI->users(), [](auto *U) {
            auto *SVI = dyn_cast<ShuffleVectorInst>(U);
            return SVI && isa<UndefValue>(SVI->getOperand(1));
          })) {
        for (auto *SVI : BI->users())
          BinOpShuffles.insert(cast<ShuffleVectorInst>(SVI));
        continue;
      }
    }
    // NOTE(review): dropped line — presumably
    // `auto *SVI = dyn_cast<ShuffleVectorInst>(User);`.
    if (!SVI || !isa<UndefValue>(SVI->getOperand(1)))
      return false;

    Shuffles.push_back(SVI);
  }

  if (Shuffles.empty() && BinOpShuffles.empty())
    return false;

  unsigned Factor, Index;

  unsigned NumLoadElements =
      cast<FixedVectorType>(Load->getType())->getNumElements();
  auto *FirstSVI = Shuffles.size() > 0 ? Shuffles[0] : BinOpShuffles[0];
  // Check if the first shufflevector is DE-interleave shuffle.
  if (!isDeInterleaveMask(FirstSVI->getShuffleMask(), Factor, Index, MaxFactor,
                          NumLoadElements))
    return false;

  // Holds the corresponding index for each DE-interleave shuffle.
  // NOTE(review): dropped line — declaration of the Indices container.

  VectorType *VecTy = cast<VectorType>(FirstSVI->getType());

  // Check if other shufflevectors are also DE-interleaved of the same type
  // and factor as the first shufflevector.
  for (auto *Shuffle : Shuffles) {
    if (Shuffle->getType() != VecTy)
      return false;
    // NOTE(review): dropped line — head of the
    // `ShuffleVectorInst::isDeInterleaveMaskOfFactor(` call.
        Shuffle->getShuffleMask(), Factor, Index))
      return false;

    assert(Shuffle->getShuffleMask().size() <= NumLoadElements);
    Indices.push_back(Index);
  }
  for (auto *Shuffle : BinOpShuffles) {
    if (Shuffle->getType() != VecTy)
      return false;
    // NOTE(review): dropped line — same isDeInterleaveMaskOfFactor call head.
        Shuffle->getShuffleMask(), Factor, Index))
      return false;

    assert(Shuffle->getShuffleMask().size() <= NumLoadElements);

    // A binop shuffle can reach the load through either binop operand; record
    // the index once per matching operand.
    if (cast<Instruction>(Shuffle->getOperand(0))->getOperand(0) == Load)
      Indices.push_back(Index);
    if (cast<Instruction>(Shuffle->getOperand(0))->getOperand(1) == Load)
      Indices.push_back(Index);
  }

  // Try and modify users of the load that are extractelement instructions to
  // use the shufflevector instructions instead of the load.
  if (!tryReplaceExtracts(Extracts, Shuffles))
    return false;

  bool BinOpShuffleChanged =
      replaceBinOpShuffles(BinOpShuffles.getArrayRef(), Shuffles, Load);

  Value *Mask = nullptr;
  auto GapMask = APInt::getAllOnes(Factor);
  if (LI) {
    LLVM_DEBUG(dbgs() << "IA: Found an interleaved load: " << *Load << "\n");
  } else {
    // Check mask operand. Handle both all-true/false and interleaved mask.
    std::tie(Mask, GapMask) = getMask(getMaskOperand(II), Factor, VecTy);
    if (!Mask)
      return false;

    LLVM_DEBUG(dbgs() << "IA: Found an interleaved vp.load or masked.load: "
                      << *Load << "\n");
    LLVM_DEBUG(dbgs() << "IA: With nominal factor " << Factor
                      << " and actual factor " << GapMask.popcount() << "\n");
  }

  // Try to create target specific intrinsics to replace the load and
  // shuffles.
  if (!TLI->lowerInterleavedLoad(cast<Instruction>(Load), Mask, Shuffles,
                                 Indices, Factor, GapMask))
    // If Extracts is not empty, tryReplaceExtracts made changes earlier.
    return !Extracts.empty() || BinOpShuffleChanged;

  DeadInsts.insert_range(Shuffles);

  DeadInsts.insert(Load);
  return true;
}
412
// Rewrite each shuffle(binop(x, y)) in \p BinOpShuffles as
// binop(shuffle(x), shuffle(y)) so the new shuffles can participate in
// interleaved-load formation; new shuffles that read the load are appended
// to \p Shuffles. Returns true if any shuffle was rewritten.
//
// NOTE(review): this extraction dropped original lines 415 (rest of the
// signature), 431 (head of the BinaryOperator::CreateWithCopiedFlags call
// producing NewBI), and 437 (content unknown — possibly a statement).
// Restore from upstream before building.
bool InterleavedAccessImpl::replaceBinOpShuffles(
    ArrayRef<ShuffleVectorInst *> BinOpShuffles,
  // NOTE(review): dropped line — remaining parameters and opening brace.
  for (auto *SVI : BinOpShuffles) {
    BinaryOperator *BI = cast<BinaryOperator>(SVI->getOperand(0));
    Type *BIOp0Ty = BI->getOperand(0)->getType();
    ArrayRef<int> Mask = SVI->getShuffleMask();
    // The shuffle must only select from the binop's first-operand width.
    assert(all_of(Mask, [&](int Idx) {
      return Idx < (int)cast<FixedVectorType>(BIOp0Ty)->getNumElements();
    }));

    // Build shuffle(x) and shuffle(y) right before the original shuffle.
    BasicBlock::iterator insertPos = SVI->getIterator();
    auto *NewSVI1 =
        new ShuffleVectorInst(BI->getOperand(0), PoisonValue::get(BIOp0Ty),
                              Mask, SVI->getName(), insertPos);
    auto *NewSVI2 = new ShuffleVectorInst(
        BI->getOperand(1), PoisonValue::get(BI->getOperand(1)->getType()), Mask,
        SVI->getName(), insertPos);
    // NOTE(review): dropped line — head of the
    // `BinaryOperator::CreateWithCopiedFlags(` call defining NewBI.
        BI->getOpcode(), NewSVI1, NewSVI2, BI, BI->getName(), insertPos);
    SVI->replaceAllUsesWith(NewBI);
    LLVM_DEBUG(dbgs() << "  Replaced: " << *BI << "\n  And   : " << *SVI
                      << "\n  With  : " << *NewSVI1 << "\n  And   : "
                      << *NewSVI2 << "\n  And   : " << *NewBI << "\n");
    // NOTE(review): dropped line (original 437) — content unknown.
    if (NewSVI1->getOperand(0) == Load)
      Shuffles.push_back(NewSVI1);
    if (NewSVI2->getOperand(0) == Load)
      Shuffles.push_back(NewSVI2);
  }

  return !BinOpShuffles.empty();
}
446
447bool InterleavedAccessImpl::tryReplaceExtracts(
450 // If there aren't any extractelement instructions to modify, there's nothing
451 // to do.
452 if (Extracts.empty())
453 return true;
454
455 // Maps extractelement instructions to vector-index pairs. The extractlement
456 // instructions will be modified to use the new vector and index operands.
458
459 for (auto *Extract : Extracts) {
460 // The vector index that is extracted.
461 auto *IndexOperand = cast<ConstantInt>(Extract->getIndexOperand());
462 auto Index = IndexOperand->getSExtValue();
463
464 // Look for a suitable shufflevector instruction. The goal is to modify the
465 // extractelement instruction (which uses an interleaved load) to use one
466 // of the shufflevector instructions instead of the load.
467 for (auto *Shuffle : Shuffles) {
468 // If the shufflevector instruction doesn't dominate the extract, we
469 // can't create a use of it.
470 if (!DT->dominates(Shuffle, Extract))
471 continue;
472
473 // Inspect the indices of the shufflevector instruction. If the shuffle
474 // selects the same index that is extracted, we can modify the
475 // extractelement instruction.
476 SmallVector<int, 4> Indices;
477 Shuffle->getShuffleMask(Indices);
478 for (unsigned I = 0; I < Indices.size(); ++I)
479 if (Indices[I] == Index) {
480 assert(Extract->getOperand(0) == Shuffle->getOperand(0) &&
481 "Vector operations do not match");
482 ReplacementMap[Extract] = std::make_pair(Shuffle, I);
483 break;
484 }
485
486 // If we found a suitable shufflevector instruction, stop looking.
487 if (ReplacementMap.count(Extract))
488 break;
489 }
490
491 // If we did not find a suitable shufflevector instruction, the
492 // extractelement instruction cannot be modified, so we must give up.
493 if (!ReplacementMap.count(Extract))
494 return false;
495 }
496
497 // Finally, perform the replacements.
498 IRBuilder<> Builder(Extracts[0]->getContext());
499 for (auto &Replacement : ReplacementMap) {
500 auto *Extract = Replacement.first;
501 auto *Vector = Replacement.second.first;
502 auto Index = Replacement.second.second;
503 Builder.SetInsertPoint(Extract);
504 Extract->replaceAllUsesWith(Builder.CreateExtractElement(Vector, Index));
505 Extract->eraseFromParent();
506 }
507
508 return true;
509}
510
511bool InterleavedAccessImpl::lowerInterleavedStore(
513 Value *StoredValue;
514 auto *SI = dyn_cast<StoreInst>(Store);
515 auto *II = dyn_cast<IntrinsicInst>(Store);
516 if (SI) {
517 if (!SI->isSimple())
518 return false;
519 StoredValue = SI->getValueOperand();
520 } else {
521 assert(II->getIntrinsicID() == Intrinsic::vp_store ||
522 II->getIntrinsicID() == Intrinsic::masked_store);
523 StoredValue = II->getArgOperand(0);
524 }
525
526 auto *SVI = dyn_cast<ShuffleVectorInst>(StoredValue);
527 if (!SVI || !SVI->hasOneUse() || isa<ScalableVectorType>(SVI->getType()))
528 return false;
529
530 unsigned NumStoredElements =
531 cast<FixedVectorType>(SVI->getType())->getNumElements();
532 // Check if the shufflevector is RE-interleave shuffle.
533 unsigned Factor;
534 if (!isReInterleaveMask(SVI, Factor, MaxFactor))
535 return false;
536 assert(NumStoredElements % Factor == 0 &&
537 "number of stored element should be a multiple of Factor");
538
539 Value *Mask = nullptr;
540 auto GapMask = APInt::getAllOnes(Factor);
541 if (SI) {
542 LLVM_DEBUG(dbgs() << "IA: Found an interleaved store: " << *Store << "\n");
543 } else {
544 // Check mask operand. Handle both all-true/false and interleaved mask.
545 unsigned LaneMaskLen = NumStoredElements / Factor;
546 std::tie(Mask, GapMask) = getMask(getMaskOperand(II), Factor,
547 ElementCount::getFixed(LaneMaskLen));
548 if (!Mask)
549 return false;
550
551 LLVM_DEBUG(dbgs() << "IA: Found an interleaved vp.store or masked.store: "
552 << *Store << "\n");
553 LLVM_DEBUG(dbgs() << "IA: With nominal factor " << Factor
554 << " and actual factor " << GapMask.popcount() << "\n");
555 }
556
557 // Try to create target specific intrinsics to replace the store and
558 // shuffle.
559 if (!TLI->lowerInterleavedStore(Store, Mask, SVI, Factor, GapMask))
560 return false;
561
562 // Already have a new target specific interleaved store. Erase the old store.
563 DeadInsts.insert(Store);
564 DeadInsts.insert(SVI);
565 return true;
566}
567
568// A wide mask <1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0> could be used to skip the
569// last field in a factor-of-three interleaved store or deinterleaved load (in
570// which case LeafMaskLen is 4). Such (wide) mask is also known as gap mask.
571// This helper function tries to detect this pattern and return the actual
572// factor we're accessing, which is 2 in this example.
573static void getGapMask(const Constant &MaskConst, unsigned Factor,
574 unsigned LeafMaskLen, APInt &GapMask) {
575 assert(GapMask.getBitWidth() == Factor);
576 for (unsigned F = 0U; F < Factor; ++F) {
577 bool AllZero = true;
578 for (unsigned Idx = 0U; Idx < LeafMaskLen; ++Idx) {
579 Constant *C = MaskConst.getAggregateElement(F + Idx * Factor);
580 if (!C->isZeroValue()) {
581 AllZero = false;
582 break;
583 }
584 }
585 // All mask bits on this field are zero, skipping it.
586 if (AllZero)
587 GapMask.clearBit(F);
588 }
589}
590
// Core implementation of getMask (see the forward declaration's contract):
// given the wide mask operand of a masked/vp memory intrinsic and the
// nominal interleave factor, derive the per-lane ("leaf") mask shared by all
// fields, plus a gap mask whose cleared bits mark fields skipped entirely.
// Returns {nullptr, gap} when no common leaf mask exists.
static std::pair<Value *, APInt> getMask(Value *WideMask, unsigned Factor,
                                         ElementCount LeafValueEC) {
  auto GapMask = APInt::getAllOnes(Factor);

  // Case 1: the wide mask was itself built by an interleave intrinsic of the
  // same factor.
  if (auto *IMI = dyn_cast<IntrinsicInst>(WideMask)) {
    if (unsigned F = getInterleaveIntrinsicFactor(IMI->getIntrinsicID());
        F && F == Factor) {
      Value *RefArg = nullptr;
      // Check if all the intrinsic arguments are the same, except those that
      // are zeros, which we mark as gaps in the gap mask.
      for (auto [Idx, Arg] : enumerate(IMI->args())) {
        if (auto *C = dyn_cast<Constant>(Arg); C && C->isZeroValue()) {
          GapMask.clearBit(Idx);
          continue;
        }

        if (!RefArg)
          RefArg = Arg;
        else if (RefArg != Arg)
          return {nullptr, GapMask};
      }

      // In a very rare occasion, all the intrinsic arguments might be zeros,
      // in which case we still want to return an all-zeros constant instead of
      // nullptr.
      return {RefArg ? RefArg : IMI->getArgOperand(0), GapMask};
    }
  }

  // Case 2: masks that are assembled from bitwise AND. Recurse into both
  // operands and combine the results.
  if (auto *AndOp = dyn_cast<BinaryOperator>(WideMask);
      AndOp && AndOp->getOpcode() == Instruction::And) {
    auto [MaskLHS, GapMaskLHS] =
        getMask(AndOp->getOperand(0), Factor, LeafValueEC);
    auto [MaskRHS, GapMaskRHS] =
        getMask(AndOp->getOperand(1), Factor, LeafValueEC);
    if (!MaskLHS || !MaskRHS)
      return {nullptr, GapMask};
    // Using IRBuilder here so that any trivial constants could be folded right
    // away.
    return {IRBuilder<>(AndOp).CreateAnd(MaskLHS, MaskRHS),
            GapMaskLHS & GapMaskRHS};
  }

  // Case 3: constant wide masks.
  if (auto *ConstMask = dyn_cast<Constant>(WideMask)) {
    if (auto *Splat = ConstMask->getSplatValue())
      // All-ones or all-zeros mask.
      return {ConstantVector::getSplat(LeafValueEC, Splat), GapMask};

    if (LeafValueEC.isFixed()) {
      unsigned LeafMaskLen = LeafValueEC.getFixedValue();
      // First, check if we use a gap mask to skip some of the factors / fields.
      getGapMask(*ConstMask, Factor, LeafMaskLen, GapMask);

      SmallVector<Constant *, 8> LeafMask(LeafMaskLen, nullptr);
      // If this is a fixed-length constant mask, each lane / leaf has to
      // use the same mask. This is done by checking if every group with Factor
      // number of elements in the interleaved mask has homogeneous values.
      for (unsigned Idx = 0U; Idx < LeafMaskLen * Factor; ++Idx) {
        // Elements of gapped (skipped) fields don't constrain the leaf mask.
        if (!GapMask[Idx % Factor])
          continue;
        Constant *C = ConstMask->getAggregateElement(Idx);
        if (LeafMask[Idx / Factor] && LeafMask[Idx / Factor] != C)
          return {nullptr, GapMask};
        LeafMask[Idx / Factor] = C;
      }

      return {ConstantVector::get(LeafMask), GapMask};
    }
  }

  // Case 4: the wide mask is an interleaving shufflevector of a narrower
  // mask; recover the leaf mask by extracting the low subvector.
  if (auto *SVI = dyn_cast<ShuffleVectorInst>(WideMask)) {
    Type *Op1Ty = SVI->getOperand(1)->getType();
    if (!isa<FixedVectorType>(Op1Ty))
      return {nullptr, GapMask};

    // Check that the shuffle mask is: a) an interleave, b) all of the same
    // set of the elements, and c) contained by the first source. (c) could
    // be relaxed if desired.
    unsigned NumSrcElts =
        cast<FixedVectorType>(SVI->getOperand(1)->getType())->getNumElements();
    SmallVector<unsigned> StartIndexes;
    if (ShuffleVectorInst::isInterleaveMask(SVI->getShuffleMask(), Factor,
                                            NumSrcElts * 2, StartIndexes) &&
        llvm::all_of(StartIndexes, [](unsigned Start) { return Start == 0; }) &&
        llvm::all_of(SVI->getShuffleMask(), [&NumSrcElts](int Idx) {
          return Idx < (int)NumSrcElts;
        })) {
      auto *LeafMaskTy =
          VectorType::get(Type::getInt1Ty(SVI->getContext()), LeafValueEC);
      IRBuilder<> Builder(SVI);
      return {Builder.CreateExtractVector(LeafMaskTy, SVI->getOperand(0),
                                          uint64_t(0)),
              GapMask};
    }
  }

  // No supported pattern matched.
  return {nullptr, GapMask};
}
690
// Match deinterleaveN(load) — where the load is a plain, masked, or vp load
// feeding only the deinterleave intrinsic — and hand the pair to the target.
// Returns true if the IR was changed.
//
// NOTE(review): this extraction dropped original lines 692 (rest of the
// signature — presumably `IntrinsicInst *DI, SmallSetVector<Instruction *,
// 32> &DeadInsts) {` given the body's use of DI) and 721 (the argument list
// of the getMask call). Restore from upstream before building.
bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic(
  // The loaded value must feed only this deinterleave.
  Instruction *LoadedVal = dyn_cast<Instruction>(DI->getOperand(0));
  if (!LoadedVal || !LoadedVal->hasOneUse())
    return false;

  auto *LI = dyn_cast<LoadInst>(LoadedVal);
  auto *II = dyn_cast<IntrinsicInst>(LoadedVal);
  if (!LI && !II)
    return false;

  const unsigned Factor = getDeinterleaveIntrinsicFactor(DI->getIntrinsicID());
  assert(Factor && "unexpected deinterleave intrinsic");

  Value *Mask = nullptr;
  if (LI) {
    // Only simple (non-atomic, non-volatile) plain loads are eligible.
    if (!LI->isSimple())
      return false;

    LLVM_DEBUG(dbgs() << "IA: Found a load with deinterleave intrinsic " << *DI
                      << " and factor = " << Factor << "\n");
  } else {
    assert(II);
    if (II->getIntrinsicID() != Intrinsic::masked_load &&
        II->getIntrinsicID() != Intrinsic::vp_load)
      return false;

    // Check mask operand. Handle both all-true/false and interleaved mask.
    APInt GapMask(Factor, 0);
    std::tie(Mask, GapMask) =
    // NOTE(review): dropped line — the getMask(...) argument list.
    if (!Mask)
      return false;
    // We haven't supported gap mask if it's deinterleaving using intrinsics.
    // Yet it is possible that we already changed the IR, hence returning true
    // here.
    if (GapMask.popcount() != Factor)
      return true;

    LLVM_DEBUG(dbgs() << "IA: Found a vp.load or masked.load with deinterleave"
                      << " intrinsic " << *DI << " and factor = "
                      << Factor << "\n");
  }

  // Try and match this with target specific intrinsics.
  if (!TLI->lowerDeinterleaveIntrinsicToLoad(LoadedVal, Mask, DI))
    return false;

  DeadInsts.insert(DI);
  // We now have a target-specific load, so delete the old one.
  DeadInsts.insert(LoadedVal);
  return true;
}
744
745bool InterleavedAccessImpl::lowerInterleaveIntrinsic(
747 if (!IntII->hasOneUse())
748 return false;
749 Instruction *StoredBy = dyn_cast<Instruction>(IntII->user_back());
750 if (!StoredBy)
751 return false;
752 auto *SI = dyn_cast<StoreInst>(StoredBy);
753 auto *II = dyn_cast<IntrinsicInst>(StoredBy);
754 if (!SI && !II)
755 return false;
756
757 SmallVector<Value *, 8> InterleaveValues(IntII->args());
758 const unsigned Factor = getInterleaveIntrinsicFactor(IntII->getIntrinsicID());
759 assert(Factor && "unexpected interleave intrinsic");
760
761 Value *Mask = nullptr;
762 if (II) {
763 if (II->getIntrinsicID() != Intrinsic::masked_store &&
764 II->getIntrinsicID() != Intrinsic::vp_store)
765 return false;
766 // Check mask operand. Handle both all-true/false and interleaved mask.
767 APInt GapMask(Factor, 0);
768 std::tie(Mask, GapMask) =
769 getMask(getMaskOperand(II), Factor,
770 cast<VectorType>(InterleaveValues[0]->getType()));
771 if (!Mask)
772 return false;
773 // We haven't supported gap mask if it's interleaving using intrinsics. Yet
774 // it is possible that we already changed the IR, hence returning true here.
775 if (GapMask.popcount() != Factor)
776 return true;
777
778 LLVM_DEBUG(dbgs() << "IA: Found a vp.store or masked.store with interleave"
779 << " intrinsic " << *IntII << " and factor = "
780 << Factor << "\n");
781 } else {
782 if (!SI->isSimple())
783 return false;
784
785 LLVM_DEBUG(dbgs() << "IA: Found a store with interleave intrinsic "
786 << *IntII << " and factor = " << Factor << "\n");
787 }
788
789 // Try and match this with target specific intrinsics.
790 if (!TLI->lowerInterleaveIntrinsicToStore(StoredBy, Mask, InterleaveValues))
791 return false;
792
793 // We now have a target-specific store, so delete the old one.
794 DeadInsts.insert(StoredBy);
795 DeadInsts.insert(IntII);
796 return true;
797}
798
// Walk every instruction in \p F, dispatching candidate loads/stores and
// (de)interleave intrinsics to the matching lowering routine; instructions
// made dead by successful lowering are erased at the end, after iteration.
//
// NOTE(review): this extraction dropped original lines 801 (the DeadInsts
// declaration) and 806-808 / 811-813 (the PatternMatch `match(...)`
// conditions that guard the two lowering calls below). Restore from upstream
// before building.
bool InterleavedAccessImpl::runOnFunction(Function &F) {
  // Holds dead instructions that will be erased later.
  bool Changed = false;

  using namespace PatternMatch;
  for (auto &I : instructions(F)) {
    // NOTE(review): dropped match() condition for load-like instructions.
    Changed |= lowerInterleavedLoad(&I, DeadInsts);

    // NOTE(review): dropped match() condition for store-like instructions.
    Changed |= lowerInterleavedStore(&I, DeadInsts);

    if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
      // Dispatch on whether the intrinsic deinterleaves or interleaves; the
      // two factor queries are mutually exclusive.
      if (getDeinterleaveIntrinsicFactor(II->getIntrinsicID()))
        Changed |= lowerDeinterleaveIntrinsic(II, DeadInsts);
      else if (getInterleaveIntrinsicFactor(II->getIntrinsicID()))
        Changed |= lowerInterleaveIntrinsic(II, DeadInsts);
    }
  }

  // Deferred erasure keeps the instruction iteration above safe.
  for (auto *I : DeadInsts)
    I->eraseFromParent();

  return Changed;
}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Expand Atomic instructions
This file contains the declarations for the subclasses of Constant, which represent the different fla...
This file defines the DenseMap class.
static bool runOnFunction(Function &F, bool PostInlining)
#define DEBUG_TYPE
static bool isDeInterleaveMask(ArrayRef< int > Mask, unsigned &Factor, unsigned &Index, unsigned MaxFactor, unsigned NumLoadElements)
Check if the mask is a DE-interleave mask for an interleaved load.
static void getGapMask(const Constant &MaskConst, unsigned Factor, unsigned LeafMaskLen, APInt &GapMask)
static cl::opt< bool > LowerInterleavedAccesses("lower-interleaved-accesses", cl::desc("Enable lowering interleaved accesses to intrinsics"), cl::init(true), cl::Hidden)
static bool isReInterleaveMask(ShuffleVectorInst *SVI, unsigned &Factor, unsigned MaxFactor)
Check if the mask can be used in an interleaved store.
static Value * getMaskOperand(IntrinsicInst *II)
static std::pair< Value *, APInt > getMask(Value *WideMask, unsigned Factor, ElementCount LeafValueEC)
This file contains the declaration of the InterleavedAccessPass class, its corresponding pass name is...
#define F(x, y, z)
Definition MD5.cpp:55
#define I(x, y, z)
Definition MD5.cpp:58
uint64_t IntrinsicInst * II
FunctionAnalysisManager FAM
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition PassSupport.h:42
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition PassSupport.h:44
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition PassSupport.h:39
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallVector class.
#define LLVM_DEBUG(...)
Definition Debug.h:114
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This file describes how to lower LLVM code to machine code.
Target-Independent Code Generator Pass Configuration Options pass.
Class for arbitrary precision integers.
Definition APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition APInt.h:234
void clearBit(unsigned BitPosition)
Set a given bit to 0.
Definition APInt.h:1406
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1488
AnalysisUsage & addRequired()
LLVM_ABI void setPreservesCFG()
This function should be called by the pass, iff they do not:
Definition Pass.cpp:270
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:142
InstListType::iterator iterator
Instruction iterators...
Definition BasicBlock.h:170
BinaryOps getOpcode() const
Definition InstrTypes.h:374
static BinaryOperator * CreateWithCopiedFlags(BinaryOps Opc, Value *V1, Value *V2, Value *CopyO, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Definition InstrTypes.h:219
Represents analyses that only rely on functions' control flow.
Definition Analysis.h:73
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
static LLVM_ABI Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
static LLVM_ABI Constant * get(ArrayRef< Constant * > V)
This is an important base class in LLVM.
Definition Constant.h:43
LLVM_ABI Constant * getAggregateElement(unsigned Elt) const
For aggregates (struct/array/vector) return the constant that corresponds to the specified element if...
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition DenseMap.h:161
Analysis pass which computes a DominatorTree.
Definition Dominators.h:284
Legacy analysis pass which computes a DominatorTree.
Definition Dominators.h:322
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:165
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition TypeSize.h:309
FunctionPass class - This class is used to implement most global optimizations.
Definition Pass.h:314
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2780
Instruction * user_back()
Specialize the methods defined in Value, as we know that an instruction can only be used by other ins...
PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM)
A wrapper class for inspecting calls to intrinsic functions.
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
static LLVM_ABI PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
Definition Analysis.h:151
void insert_range(Range &&R)
Definition SetVector.h:193
bool empty() const
Determine if the SetVector is empty or not.
Definition SetVector.h:99
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:168
This instruction constructs a fixed permutation of two input vectors.
static LLVM_ABI void getShuffleMask(const Constant *Mask, SmallVectorImpl< int > &Result)
Convert the input shuffle mask operand to a vector of integers.
static LLVM_ABI bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
LLVM_ABI bool isInterleave(unsigned Factor)
Return if this shuffle interleaves its two input vectors together.
static LLVM_ABI bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
A SetVector that performs no allocations if smaller than a certain size.
Definition SetVector.h:356
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
Primary interface to the complete machine description for the target machine.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:294
Value * getOperand(unsigned i) const
Definition User.h:232
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
Base class of all SIMD vector types.
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
constexpr bool isFixed() const
Returns true if the quantity is not scaled by vscale.
Definition TypeSize.h:172
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
TwoOps_match< ValueOpTy, PointerOpTy, Instruction::Store > m_Store(const ValueOpTy &ValueOp, const PointerOpTy &PointerOp)
Matches StoreInst.
bool match(Val *V, const Pattern &P)
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
initializer< Ty > init(const Ty &Val)
Context & getContext() const
Definition BasicBlock.h:99
This is an optimization pass for GlobalISel generic memory operations.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1707
LLVM_ABI bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Definition Local.cpp:533
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2454
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:649
LLVM_ABI void initializeInterleavedAccessPass(PassRegistry &)
LLVM_ABI unsigned getDeinterleaveIntrinsicFactor(Intrinsic::ID ID)
Returns the corresponding factor of llvm.vector.deinterleaveN intrinsics.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
LLVM_ABI unsigned getInterleaveIntrinsicFactor(Intrinsic::ID ID)
Returns the corresponding factor of llvm.vector.interleaveN intrinsics.
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:548
LLVM_ABI FunctionPass * createInterleavedAccessPass()
InterleavedAccess Pass - This pass identifies and matches interleaved memory accesses to target speci...
IRBuilder(LLVMContext &, FolderTy, InserterTy, MDNode *, ArrayRef< OperandBundleDef >) -> IRBuilder< FolderTy, InserterTy >
LLVM_ABI VectorType * getDeinterleavedVectorType(IntrinsicInst *DI)
Given a deinterleaveN intrinsic, return the (narrow) vector type of each factor.
ArrayRef(const T &OneElt) -> ArrayRef< T >
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:565
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.