LoadStoreVectorizer.cpp
1//===- LoadStoreVectorizer.cpp - GPU Load & Store Vectorizer --------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass merges loads/stores to/from sequential memory addresses into vector
10// loads/stores. Although there's nothing GPU-specific in here, this pass is
11// motivated by the microarchitectural quirks of nVidia and AMD GPUs.
12//
13// (For simplicity below we talk about loads only, but everything also applies
14// to stores.)
15//
16// This pass is intended to be run late in the pipeline, after other
17// vectorization opportunities have been exploited. So the assumption here is
18// that immediately following our new vector load we'll need to extract out the
19// individual elements of the load, so we can operate on them individually.
20//
21// On CPUs this transformation is usually not beneficial, because extracting the
22// elements of a vector register is expensive on most architectures. It's
23// usually better just to load each element individually into its own scalar
24// register.
25//
26// However, nVidia and AMD GPUs don't have proper vector registers. Instead, a
27// "vector load" loads directly into a series of scalar registers. In effect,
28// extracting the elements of the vector is free. It's therefore always
29// beneficial to vectorize a sequence of loads on these architectures.
30//
31// Vectorizing (perhaps a better name might be "coalescing") loads can have
32// large performance impacts on GPU kernels, and opportunities for vectorizing
33// are common in GPU code. This pass tries very hard to find such
34// opportunities; its runtime is quadratic in the number of loads in a BB.
35//
36// Some CPU architectures, such as ARM, have instructions that load into
37// multiple scalar registers, similar to a GPU vectorized load. In theory ARM
38// could use this pass (with some modifications), but currently it implements
39// its own pass to do something similar to what we do here.
40//
41// Overview of the algorithm and terminology in this pass:
42//
43// - Break up each basic block into pseudo-BBs, composed of instructions which
44// are guaranteed to transfer control to their successors.
45// - Within a single pseudo-BB, find all loads, and group them into
46// "equivalence classes" according to getUnderlyingObject() and loaded
47// element size. Do the same for stores.
48// - For each equivalence class, greedily build "chains". Each chain has a
49// leader instruction, and every other member of the chain has a known
50// constant offset from the first instr in the chain.
51// - Break up chains so that they contain only contiguous accesses of legal
52// size with no intervening may-alias instrs.
53// - Convert each chain to vector instructions.
54//
55// The O(n^2) behavior of this pass comes from initially building the chains.
56// In the worst case we have to compare each new instruction to all of those
57// that came before. To limit this, we only calculate the offset to the leaders
58// of the N most recently-used chains.
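//
// As a purely illustrative sketch (not taken from an actual test), IR like
//
//   %p0 = getelementptr i8, ptr %base, i64 0
//   %p1 = getelementptr i8, ptr %base, i64 4
//   %x0 = load i32, ptr %p0, align 8
//   %x1 = load i32, ptr %p1, align 4
//
// would become a single vector access plus per-element extracts:
//
//   %v  = load <2 x i32>, ptr %p0, align 8
//   %x0 = extractelement <2 x i32> %v, i32 0
//   %x1 = extractelement <2 x i32> %v, i32 1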
59
61#include "llvm/ADT/APInt.h"
62#include "llvm/ADT/ArrayRef.h"
63#include "llvm/ADT/DenseMap.h"
64#include "llvm/ADT/MapVector.h"
66#include "llvm/ADT/STLExtras.h"
67#include "llvm/ADT/Sequence.h"
70#include "llvm/ADT/Statistic.h"
79#include "llvm/IR/Attributes.h"
80#include "llvm/IR/BasicBlock.h"
82#include "llvm/IR/Constants.h"
83#include "llvm/IR/DataLayout.h"
85#include "llvm/IR/Dominators.h"
86#include "llvm/IR/Function.h"
88#include "llvm/IR/IRBuilder.h"
89#include "llvm/IR/InstrTypes.h"
90#include "llvm/IR/Instruction.h"
92#include "llvm/IR/LLVMContext.h"
93#include "llvm/IR/Module.h"
94#include "llvm/IR/Type.h"
95#include "llvm/IR/Value.h"
97#include "llvm/Pass.h"
100#include "llvm/Support/Debug.h"
103#include "llvm/Support/ModRef.h"
106#include <algorithm>
107#include <cassert>
108#include <cstdint>
109#include <cstdlib>
110#include <iterator>
111#include <numeric>
112#include <optional>
113#include <tuple>
114#include <type_traits>
115#include <utility>
116#include <vector>
117
118using namespace llvm;
119
120#define DEBUG_TYPE "load-store-vectorizer"
121
122STATISTIC(NumVectorInstructions, "Number of vector accesses generated");
123STATISTIC(NumScalarsVectorized, "Number of scalar accesses vectorized");
124
125namespace {
126
127// Equivalence class key, the initial tuple by which we group loads/stores.
128// Loads/stores with different EqClassKeys are never merged.
129//
130// (We could in theory remove element-size from this tuple. We'd just need
131// to fix up the vector packing/unpacking code.)
132using EqClassKey =
133 std::tuple<const Value * /* result of getUnderlyingObject() */,
134 unsigned /* AddrSpace */,
135 unsigned /* Load/Store element size bits */,
136 char /* IsLoad; char b/c bool can't be a DenseMap key */
137 >;
138llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
139 const EqClassKey &K) {
140 const auto &[UnderlyingObject, AddrSpace, ElementSize, IsLoad] = K;
141 return OS << (IsLoad ? "load" : "store") << " of " << *UnderlyingObject
142 << " of element size " << ElementSize << " bits in addrspace "
143 << AddrSpace;
144}
145
146// A Chain is a set of instructions such that:
147// - All instructions have the same equivalence class, so in particular all are
148// loads, or all are stores.
149// - We know the address accessed by the i'th chain elem relative to the
150// chain's leader instruction, which is the first instr of the chain in BB
151// order.
152//
153// Chains have two canonical orderings:
154// - BB order, sorted by Instr->comesBefore.
155// - Offset order, sorted by OffsetFromLeader.
156// This pass switches back and forth between these orders.
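//
// For example (hypothetical): if the leader loads from %base+8 and a later
// instruction loads from %base+0, BB order is {leader (offset 0), later
// (offset -8)}, while offset order is {later (offset -8), leader (offset 0)}.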
157struct ChainElem {
158 Instruction *Inst;
159 APInt OffsetFromLeader;
160 ChainElem(Instruction *Inst, APInt OffsetFromLeader)
161 : Inst(std::move(Inst)), OffsetFromLeader(std::move(OffsetFromLeader)) {}
162};
163using Chain = SmallVector<ChainElem, 1>;
164
165void sortChainInBBOrder(Chain &C) {
166 sort(C, [](auto &A, auto &B) { return A.Inst->comesBefore(B.Inst); });
167}
168
169void sortChainInOffsetOrder(Chain &C) {
170 sort(C, [](const auto &A, const auto &B) {
171 if (A.OffsetFromLeader != B.OffsetFromLeader)
172 return A.OffsetFromLeader.slt(B.OffsetFromLeader);
173 return A.Inst->comesBefore(B.Inst); // stable tiebreaker
174 });
175}
176
177[[maybe_unused]] void dumpChain(ArrayRef<ChainElem> C) {
178 for (const auto &E : C) {
179 dbgs() << " " << *E.Inst << " (offset " << E.OffsetFromLeader << ")\n";
180 }
181}
182
183using EquivalenceClassMap =
184 MapVector<EqClassKey, SmallVector<Instruction *, 8>>;
185
186// FIXME: Assuming stack alignment of 4 is always good enough
187constexpr unsigned StackAdjustedAlignment = 4;
188
189Instruction *propagateMetadata(Instruction *I, const Chain &C) {
190 SmallVector<Value *, 8> Values;
191 for (const ChainElem &E : C)
192 Values.emplace_back(E.Inst);
193 return propagateMetadata(I, Values);
194}
195
196bool isInvariantLoad(const Instruction *I) {
197 const LoadInst *LI = dyn_cast<LoadInst>(I);
198 return LI != nullptr && LI->hasMetadata(LLVMContext::MD_invariant_load);
199}
200
201/// Reorders the instructions that I depends on (the instructions defining its
202/// operands), to ensure they dominate I.
203void reorder(Instruction *I) {
204 SmallPtrSet<Instruction *, 16> InstructionsToMove;
205 SmallVector<Instruction *, 16> Worklist;
206
207 Worklist.emplace_back(I);
208 while (!Worklist.empty()) {
209 Instruction *IW = Worklist.pop_back_val();
210 int NumOperands = IW->getNumOperands();
211 for (int Idx = 0; Idx < NumOperands; Idx++) {
212 Instruction *IM = dyn_cast<Instruction>(IW->getOperand(Idx));
213 if (!IM || IM->getOpcode() == Instruction::PHI)
214 continue;
215
216 // If IM is in another BB, no need to move it, because this pass only
217 // vectorizes instructions within one BB.
218 if (IM->getParent() != I->getParent())
219 continue;
220
221 assert(IM != I && "Unexpected cycle while re-ordering instructions");
222
223 if (!IM->comesBefore(I)) {
224 InstructionsToMove.insert(IM);
225 Worklist.emplace_back(IM);
226 }
227 }
228 }
229
230 // All instructions to move should follow I. Start from I, not from begin().
231 for (auto BBI = I->getIterator(), E = I->getParent()->end(); BBI != E;) {
232 Instruction *IM = &*(BBI++);
233 if (!InstructionsToMove.contains(IM))
234 continue;
235 IM->moveBefore(I->getIterator());
236 }
237}
238
239class Vectorizer {
240 Function &F;
241 AliasAnalysis &AA;
242 AssumptionCache &AC;
243 DominatorTree &DT;
244 ScalarEvolution &SE;
245 TargetTransformInfo &TTI;
246 const DataLayout &DL;
247 IRBuilder<> Builder;
248
249 // We could erase instrs right after vectorizing them, but that can mess up
250 // our BB iterators, and also can make the equivalence class keys point to
251 // freed memory. This is fixable, but it's simpler just to wait until we're
252 // done with the BB and erase all at once.
253 SmallVector<Instruction *, 128> ToErase;
254
255public:
256 Vectorizer(Function &F, AliasAnalysis &AA, AssumptionCache &AC,
257 DominatorTree &DT, ScalarEvolution &SE, TargetTransformInfo &TTI)
258 : F(F), AA(AA), AC(AC), DT(DT), SE(SE), TTI(TTI),
259 DL(F.getDataLayout()), Builder(SE.getContext()) {}
260
261 bool run();
262
263private:
264 static const unsigned MaxDepth = 3;
265
266 /// Runs the vectorizer on a "pseudo basic block", which is a range of
267 /// instructions [Begin, End) within one BB all of which have
268 /// isGuaranteedToTransferExecutionToSuccessor(I) == true.
269 bool runOnPseudoBB(BasicBlock::iterator Begin, BasicBlock::iterator End);
270
271 /// Runs the vectorizer on one equivalence class, i.e. one set of loads/stores
272 /// in the same BB with the same value for getUnderlyingObject() etc.
273 bool runOnEquivalenceClass(const EqClassKey &EqClassKey,
274 ArrayRef<Instruction *> EqClass);
275
276 /// Runs the vectorizer on one chain, i.e. a subset of an equivalence class
277 /// where all instructions access a known, constant offset from the first
278 /// instruction.
279 bool runOnChain(Chain &C);
280
281 /// Splits the chain into subchains of instructions which read/write a
282 /// contiguous block of memory. Discards any length-1 subchains (because
283 /// there's nothing to vectorize in there).
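  /// For example (hypothetical), 4-byte accesses at offsets {0, 4, 8, 16, 20}
  /// split into {0, 4, 8} and {16, 20} because of the hole at offset 12.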
284 std::vector<Chain> splitChainByContiguity(Chain &C);
285
286 /// Splits the chain into subchains where it's safe to hoist loads up to the
287 /// beginning of the sub-chain and it's safe to sink loads up to the end of
288 /// the sub-chain. Discards any length-1 subchains.
289 std::vector<Chain> splitChainByMayAliasInstrs(Chain &C);
290
291 /// Splits the chain into subchains that make legal, aligned accesses.
292 /// Discards any length-1 subchains.
293 std::vector<Chain> splitChainByAlignment(Chain &C);
294
295 /// Converts the instrs in the chain into a single vectorized load or store.
296 /// Adds the old scalar loads/stores to ToErase.
297 bool vectorizeChain(Chain &C);
298
299 /// Tries to compute the offset in bytes PtrB - PtrA.
300 std::optional<APInt> getConstantOffset(Value *PtrA, Value *PtrB,
301 Instruction *ContextInst,
302 unsigned Depth = 0);
303 std::optional<APInt> getConstantOffsetComplexAddrs(Value *PtrA, Value *PtrB,
304 Instruction *ContextInst,
305 unsigned Depth);
306 std::optional<APInt> getConstantOffsetSelects(Value *PtrA, Value *PtrB,
307 Instruction *ContextInst,
308 unsigned Depth);
309
310 /// Gets the element type of the vector that the chain will load or store.
311 /// This is nontrivial because the chain may contain elements of different
312 /// types; e.g. it's legal to have a chain that contains both i32 and float.
313 Type *getChainElemTy(const Chain &C);
314
315 /// Determines whether ChainElem can be moved up (if IsLoad) or down (if
316 /// !IsLoad) to ChainBegin -- i.e. there are no intervening may-alias
317 /// instructions.
318 ///
319 /// The map ChainElemOffsets must contain all of the elements in
320 /// [ChainBegin, ChainElem] and their offsets from some arbitrary base
321 /// address. It's ok if it contains additional entries.
322 template <bool IsLoadChain>
323 bool isSafeToMove(
324 Instruction *ChainElem, Instruction *ChainBegin,
325 const DenseMap<Instruction *, APInt /*OffsetFromLeader*/> &ChainOffsets,
326 BatchAAResults &BatchAA);
327
328 /// Merges the equivalence classes if they have underlying objects that differ
329 /// by one level of indirection (i.e., one is a getelementptr and the other is
330 /// the base pointer in that getelementptr).
331 void mergeEquivalenceClasses(EquivalenceClassMap &EQClasses) const;
332
333 /// Collects loads and stores grouped by "equivalence class", where:
334 /// - all elements in an eq class are a load or all are a store,
335 /// - they all load/store the same element size (it's OK to have e.g. i8 and
336 /// <4 x i8> in the same class, but not i32 and <4 x i8>), and
337 /// - they all have the same value for getUnderlyingObject().
338 EquivalenceClassMap collectEquivalenceClasses(BasicBlock::iterator Begin,
339 BasicBlock::iterator End);
340
341 /// Partitions Instrs into "chains" where every instruction has a known
342 /// constant offset from the first instr in the chain.
343 ///
344 /// Postcondition: For all i, ret[i][0].second == 0, because the first instr
345 /// in the chain is the leader, and an instr touches distance 0 from itself.
346 std::vector<Chain> gatherChains(ArrayRef<Instruction *> Instrs);
347};
348
349class LoadStoreVectorizerLegacyPass : public FunctionPass {
350public:
351 static char ID;
352
353 LoadStoreVectorizerLegacyPass() : FunctionPass(ID) {
354 initializeLoadStoreVectorizerLegacyPassPass(
355 *PassRegistry::getPassRegistry());
356 }
357
358 bool runOnFunction(Function &F) override;
359
360 StringRef getPassName() const override {
361 return "GPU Load and Store Vectorizer";
362 }
363
364 void getAnalysisUsage(AnalysisUsage &AU) const override {
365 AU.addRequired<AAResultsWrapperPass>();
366 AU.addRequired<AssumptionCacheTracker>();
367 AU.addRequired<ScalarEvolutionWrapperPass>();
368 AU.addRequired<DominatorTreeWrapperPass>();
369 AU.addRequired<TargetTransformInfoWrapperPass>();
370 AU.setPreservesCFG();
371 }
372};
373
374} // end anonymous namespace
375
376char LoadStoreVectorizerLegacyPass::ID = 0;
377
378INITIALIZE_PASS_BEGIN(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
379 "Vectorize load and Store instructions", false, false)
380INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
381INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
382INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass)
383INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
384INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
385INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
386INITIALIZE_PASS_END(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
387 "Vectorize load and store instructions", false, false)
388
389Pass *llvm::createLoadStoreVectorizerPass() {
390 return new LoadStoreVectorizerLegacyPass();
391}
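// (For reference, assuming the standard pass registration: with the new pass
// manager this runs as "opt -passes=load-store-vectorizer"; the legacy wrapper
// above serves the old pass manager.)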
392
393bool LoadStoreVectorizerLegacyPass::runOnFunction(Function &F) {
394 // Don't vectorize when the attribute NoImplicitFloat is used.
395 if (skipFunction(F) || F.hasFnAttribute(Attribute::NoImplicitFloat))
396 return false;
397
398 AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
399 DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
400 ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
401 TargetTransformInfo &TTI =
402 getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
403
404 AssumptionCache &AC =
405 getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
406
407 return Vectorizer(F, AA, AC, DT, SE, TTI).run();
408}
409
410PreservedAnalyses LoadStoreVectorizerPass::run(Function &F,
411 FunctionAnalysisManager &AM) {
412 // Don't vectorize when the attribute NoImplicitFloat is used.
413 if (F.hasFnAttribute(Attribute::NoImplicitFloat))
414 return PreservedAnalyses::all();
415
416 AliasAnalysis &AA = AM.getResult<AAManager>(F);
417 AssumptionCache &AC = AM.getResult<AssumptionAnalysis>(F);
418 DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
419 ScalarEvolution &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
420 TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
421
422 bool Changed = Vectorizer(F, AA, AC, DT, SE, TTI).run();
423 PreservedAnalyses PA;
424 PA.preserveSet<CFGAnalyses>();
425 return Changed ? PA : PreservedAnalyses::all();
426}
427
428bool Vectorizer::run() {
429 bool Changed = false;
430 // Break up the BB if there are any instrs which aren't guaranteed to transfer
431 // execution to their successor.
432 //
433 // Consider, for example:
434 //
435 // def assert_arr_len(int n) { if (n < 2) exit(); }
436 //
437 // load arr[0]
438 // call assert_arr_len(arr.length)
439 // load arr[1]
440 //
441 // Even though assert_arr_len does not read or write any memory, we can't
442 // speculate the second load before the call. More info at
443 // https://github.com/llvm/llvm-project/issues/52950.
444 for (BasicBlock *BB : post_order(&F)) {
445 // BB must at least have a terminator.
446 assert(!BB->empty());
447
448 SmallVector<BasicBlock::iterator, 8> Barriers;
449 Barriers.emplace_back(BB->begin());
450 for (Instruction &I : *BB)
451 if (!isGuaranteedToTransferExecutionToSuccessor(&I))
452 Barriers.emplace_back(I.getIterator());
453 Barriers.emplace_back(BB->end());
454
455 for (auto It = Barriers.begin(), End = std::prev(Barriers.end()); It != End;
456 ++It)
457 Changed |= runOnPseudoBB(*It, *std::next(It));
458
459 for (Instruction *I : ToErase) {
460 auto *PtrOperand = getLoadStorePointerOperand(I);
461 if (I->use_empty())
462 I->eraseFromParent();
463 RecursivelyDeleteTriviallyDeadInstructions(PtrOperand);
464 }
465 ToErase.clear();
466 }
467
468 return Changed;
469}
470
471bool Vectorizer::runOnPseudoBB(BasicBlock::iterator Begin,
472 BasicBlock::iterator End) {
473 LLVM_DEBUG({
474 dbgs() << "LSV: Running on pseudo-BB [" << *Begin << " ... ";
475 if (End != Begin->getParent()->end())
476 dbgs() << *End;
477 else
478 dbgs() << "<BB end>";
479 dbgs() << ")\n";
480 });
481
482 bool Changed = false;
483 for (const auto &[EqClassKey, EqClass] :
484 collectEquivalenceClasses(Begin, End))
485 Changed |= runOnEquivalenceClass(EqClassKey, EqClass);
486
487 return Changed;
488}
489
490bool Vectorizer::runOnEquivalenceClass(const EqClassKey &EqClassKey,
491 ArrayRef<Instruction *> EqClass) {
492 bool Changed = false;
493
494 LLVM_DEBUG({
495 dbgs() << "LSV: Running on equivalence class of size " << EqClass.size()
496 << " keyed on " << EqClassKey << ":\n";
497 for (Instruction *I : EqClass)
498 dbgs() << " " << *I << "\n";
499 });
500
501 std::vector<Chain> Chains = gatherChains(EqClass);
502 LLVM_DEBUG(dbgs() << "LSV: Got " << Chains.size()
503 << " nontrivial chains.\n";);
504 for (Chain &C : Chains)
505 Changed |= runOnChain(C);
506 return Changed;
507}
508
509bool Vectorizer::runOnChain(Chain &C) {
510 LLVM_DEBUG({
511 dbgs() << "LSV: Running on chain with " << C.size() << " instructions:\n";
512 dumpChain(C);
513 });
514
515 // Split up the chain into increasingly smaller chains, until we can finally
516 // vectorize the chains.
517 //
518 // (Don't be scared by the depth of the loop nest here. These operations are
519 // all at worst O(n lg n) in the number of instructions, and splitting chains
520 // doesn't change the number of instrs. So the whole loop nest is O(n lg n).)
521 bool Changed = false;
522 for (auto &C : splitChainByMayAliasInstrs(C))
523 for (auto &C : splitChainByContiguity(C))
524 for (auto &C : splitChainByAlignment(C))
525 Changed |= vectorizeChain(C);
526 return Changed;
527}
528
529std::vector<Chain> Vectorizer::splitChainByMayAliasInstrs(Chain &C) {
530 if (C.empty())
531 return {};
532
533 sortChainInBBOrder(C);
534
535 LLVM_DEBUG({
536 dbgs() << "LSV: splitChainByMayAliasInstrs considering chain:\n";
537 dumpChain(C);
538 });
539
540 // We know that elements in the chain with non-overlapping offsets can't
541 // alias, but AA may not be smart enough to figure this out. Use a
542 // hashtable.
543 DenseMap<Instruction *, APInt /*OffsetFromLeader*/> ChainOffsets;
544 for (const auto &E : C)
545 ChainOffsets.insert({&*E.Inst, E.OffsetFromLeader});
546
547 // Across a single invocation of this function the IR is not changing, so
548 // using a batched Alias Analysis is safe and can reduce compile time.
549 BatchAAResults BatchAA(AA);
550
551 // Loads get hoisted up to the first load in the chain. Stores get sunk
552 // down to the last store in the chain. Our algorithm for loads is:
553 //
554 // - Take the first element of the chain. This is the start of a new chain.
555 // - Take the next element of `Chain` and check for may-alias instructions
556 // up to the start of NewChain. If no may-alias instrs, add it to
557 // NewChain. Otherwise, start a new NewChain.
558 //
559 // For stores it's the same except in the reverse direction.
560 //
561 // We expect IsLoad to be an std::bool_constant.
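  //
  // As a hypothetical illustration of the splitting: for a load chain
  // {L0, L1, L2} with an intervening store that may alias L2 (but not L0 or
  // L1), we emit {L0, L1} as one sub-chain and start a new sub-chain at L2.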
562 auto Impl = [&](auto IsLoad) {
563 // MSVC is unhappy if IsLoad is a capture, so pass it as an arg.
564 auto [ChainBegin, ChainEnd] = [&](auto IsLoad) {
565 if constexpr (IsLoad())
566 return std::make_pair(C.begin(), C.end());
567 else
568 return std::make_pair(C.rbegin(), C.rend());
569 }(IsLoad);
570 assert(ChainBegin != ChainEnd);
571
572 std::vector<Chain> Chains;
573 SmallVector<ChainElem, 1> NewChain;
574 NewChain.emplace_back(*ChainBegin);
575 for (auto ChainIt = std::next(ChainBegin); ChainIt != ChainEnd; ++ChainIt) {
576 if (isSafeToMove<IsLoad>(ChainIt->Inst, NewChain.front().Inst,
577 ChainOffsets, BatchAA)) {
578 LLVM_DEBUG(dbgs() << "LSV: No intervening may-alias instrs; can merge "
579 << *ChainIt->Inst << " into " << *ChainBegin->Inst
580 << "\n");
581 NewChain.emplace_back(*ChainIt);
582 } else {
583 LLVM_DEBUG(
584 dbgs() << "LSV: Found intervening may-alias instrs; cannot merge "
585 << *ChainIt->Inst << " into " << *ChainBegin->Inst << "\n");
586 if (NewChain.size() > 1) {
587 LLVM_DEBUG({
588 dbgs() << "LSV: got nontrivial chain without aliasing instrs:\n";
589 dumpChain(NewChain);
590 });
591 Chains.emplace_back(std::move(NewChain));
592 }
593
594 // Start a new chain.
595 NewChain = SmallVector<ChainElem, 1>({*ChainIt});
596 }
597 }
598 if (NewChain.size() > 1) {
599 LLVM_DEBUG({
600 dbgs() << "LSV: got nontrivial chain without aliasing instrs:\n";
601 dumpChain(NewChain);
602 });
603 Chains.emplace_back(std::move(NewChain));
604 }
605 return Chains;
606 };
607
608 if (isa<LoadInst>(C[0].Inst))
609 return Impl(/*IsLoad=*/std::bool_constant<true>());
610
611 assert(isa<StoreInst>(C[0].Inst));
612 return Impl(/*IsLoad=*/std::bool_constant<false>());
613}
614
615std::vector<Chain> Vectorizer::splitChainByContiguity(Chain &C) {
616 if (C.empty())
617 return {};
618
619 sortChainInOffsetOrder(C);
620
621 LLVM_DEBUG({
622 dbgs() << "LSV: splitChainByContiguity considering chain:\n";
623 dumpChain(C);
624 });
625
626 std::vector<Chain> Ret;
627 Ret.push_back({C.front()});
628
629 for (auto It = std::next(C.begin()), End = C.end(); It != End; ++It) {
630 // `prev` accesses offsets [PrevDistFromBase, PrevReadEnd).
631 auto &CurChain = Ret.back();
632 const ChainElem &Prev = CurChain.back();
633 unsigned SzBits = DL.getTypeSizeInBits(getLoadStoreType(&*Prev.Inst));
634 assert(SzBits % 8 == 0 && "Non-byte sizes should have been filtered out by "
635 "collectEquivalenceClasses");
636 APInt PrevReadEnd = Prev.OffsetFromLeader + SzBits / 8;
637
638 // Add this instruction to the end of the current chain, or start a new one.
639 bool AreContiguous = It->OffsetFromLeader == PrevReadEnd;
640 LLVM_DEBUG(dbgs() << "LSV: Instructions are "
641 << (AreContiguous ? "" : "not ") << "contiguous: "
642 << *Prev.Inst << " (ends at offset " << PrevReadEnd
643 << ") -> " << *It->Inst << " (starts at offset "
644 << It->OffsetFromLeader << ")\n");
645 if (AreContiguous)
646 CurChain.push_back(*It);
647 else
648 Ret.push_back({*It});
649 }
650
651 // Filter out length-1 chains, these are uninteresting.
652 llvm::erase_if(Ret, [](const auto &Chain) { return Chain.size() <= 1; });
653 return Ret;
654}
655
656Type *Vectorizer::getChainElemTy(const Chain &C) {
657 assert(!C.empty());
658 // The rules are:
659 // - If there are any pointer types in the chain, use an integer type.
660 // - Prefer an integer type if it appears in the chain.
661 // - Otherwise, use the first type in the chain.
662 //
663 // The rule about pointer types is a simplification when we merge e.g. a load
664 // of a ptr and a double. There's no direct conversion from a ptr to a
665 // double; it requires a ptrtoint followed by a bitcast.
666 //
667 // It's unclear to me if the other rules have any practical effect, but we do
668 // it to match this pass's previous behavior.
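  //
  // For example (hypothetical, assuming 64-bit pointers): {float, i32, float}
  // yields i32, {ptr, double} yields i64, and {float, float} yields float.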
669 if (any_of(C, [](const ChainElem &E) {
670 return getLoadStoreType(E.Inst)->getScalarType()->isPointerTy();
671 })) {
672 return Type::getIntNTy(
673 F.getContext(),
674 DL.getTypeSizeInBits(getLoadStoreType(C[0].Inst)->getScalarType()));
675 }
676
677 for (const ChainElem &E : C)
678 if (Type *T = getLoadStoreType(E.Inst)->getScalarType(); T->isIntegerTy())
679 return T;
680 return getLoadStoreType(C[0].Inst)->getScalarType();
681}
682
683std::vector<Chain> Vectorizer::splitChainByAlignment(Chain &C) {
684 // We use a simple greedy algorithm.
685 // - Given a chain of length N, find all prefixes that
686 // (a) are not longer than the max register length, and
687 // (b) are a power of 2.
688 // - Starting from the longest prefix, try to create a vector of that length.
689 // - If one of them works, great. Repeat the algorithm on any remaining
690 // elements in the chain.
691 // - If none of them work, discard the first element and repeat on a chain
692 // of length N-1.
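  //
  // Hypothetical walk-through: with 16-byte vector registers and a chain of
  // six contiguous i32 accesses, the longest candidate starting at element 0
  // covers elements [0, 3] (16 bytes). If TTI accepts it, we emit that vector
  // and continue at element 4; otherwise we try shorter candidates, or drop
  // element 0 and retry from element 1.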
693 if (C.empty())
694 return {};
695
696 sortChainInOffsetOrder(C);
697
698 LLVM_DEBUG({
699 dbgs() << "LSV: splitChainByAlignment considering chain:\n";
700 dumpChain(C);
701 });
702
703 bool IsLoadChain = isa<LoadInst>(C[0].Inst);
704 auto GetVectorFactor = [&](unsigned VF, unsigned LoadStoreSize,
705 unsigned ChainSizeBytes, VectorType *VecTy) {
706 return IsLoadChain ? TTI.getLoadVectorFactor(VF, LoadStoreSize,
707 ChainSizeBytes, VecTy)
708 : TTI.getStoreVectorFactor(VF, LoadStoreSize,
709 ChainSizeBytes, VecTy);
710 };
711
712#ifndef NDEBUG
713 for (const auto &E : C) {
714 Type *Ty = getLoadStoreType(E.Inst)->getScalarType();
715 assert(isPowerOf2_32(DL.getTypeSizeInBits(Ty)) &&
716 "Should have filtered out non-power-of-two elements in "
717 "collectEquivalenceClasses.");
718 }
719#endif
720
721 unsigned AS = getLoadStoreAddressSpace(C[0].Inst);
722 unsigned VecRegBytes = TTI.getLoadStoreVecRegBitWidth(AS) / 8;
723
724 std::vector<Chain> Ret;
725 for (unsigned CBegin = 0; CBegin < C.size(); ++CBegin) {
726 // Find candidate chains of size not greater than the largest vector reg.
727 // These chains are over the closed interval [CBegin, CEnd].
728 SmallVector<std::pair<unsigned /*CEnd*/, unsigned /*SizeBytes*/>, 8>
729 CandidateChains;
730 for (unsigned CEnd = CBegin + 1, Size = C.size(); CEnd < Size; ++CEnd) {
731 APInt Sz = C[CEnd].OffsetFromLeader +
732 DL.getTypeStoreSize(getLoadStoreType(C[CEnd].Inst)) -
733 C[CBegin].OffsetFromLeader;
734 if (Sz.sgt(VecRegBytes))
735 break;
736 CandidateChains.emplace_back(CEnd,
737 static_cast<unsigned>(Sz.getLimitedValue()));
738 }
739
740 // Consider the longest chain first.
741 for (auto It = CandidateChains.rbegin(), End = CandidateChains.rend();
742 It != End; ++It) {
743 auto [CEnd, SizeBytes] = *It;
744 LLVM_DEBUG(
745 dbgs() << "LSV: splitChainByAlignment considering candidate chain ["
746 << *C[CBegin].Inst << " ... " << *C[CEnd].Inst << "]\n");
747
748 Type *VecElemTy = getChainElemTy(C);
749 // Note, VecElemTy is a power of 2, but might be less than one byte. For
750 // example, we can vectorize 2 x <2 x i4> to <4 x i4>, and in this case
751 // VecElemTy would be i4.
752 unsigned VecElemBits = DL.getTypeSizeInBits(VecElemTy);
753
754 // SizeBytes and VecElemBits are powers of 2, so they divide evenly.
755 assert((8 * SizeBytes) % VecElemBits == 0);
756 unsigned NumVecElems = 8 * SizeBytes / VecElemBits;
757 FixedVectorType *VecTy = FixedVectorType::get(VecElemTy, NumVecElems);
758 unsigned VF = 8 * VecRegBytes / VecElemBits;
759
760 // Check that TTI is happy with this vectorization factor.
761 unsigned TargetVF = GetVectorFactor(VF, VecElemBits,
762 VecElemBits * NumVecElems / 8, VecTy);
763 if (TargetVF != VF && TargetVF < NumVecElems) {
764 LLVM_DEBUG(
765 dbgs() << "LSV: splitChainByAlignment discarding candidate chain "
766 "because TargetVF="
767 << TargetVF << " != VF=" << VF
768 << " and TargetVF < NumVecElems=" << NumVecElems << "\n");
769 continue;
770 }
771
772 // Is a load/store with this alignment allowed by TTI and at least as fast
773 // as an unvectorized load/store?
774 //
775 // TTI and F are passed as explicit captures to work around an MSVC misparse (??).
776 auto IsAllowedAndFast = [&, SizeBytes = SizeBytes, &TTI = TTI,
777 &F = F](Align Alignment) {
778 if (Alignment.value() % SizeBytes == 0)
779 return true;
780 unsigned VectorizedSpeed = 0;
781 bool AllowsMisaligned = TTI.allowsMisalignedMemoryAccesses(
782 F.getContext(), SizeBytes * 8, AS, Alignment, &VectorizedSpeed);
783 if (!AllowsMisaligned) {
784 LLVM_DEBUG(dbgs()
785 << "LSV: Access of " << SizeBytes << "B in addrspace "
786 << AS << " with alignment " << Alignment.value()
787 << " is misaligned, and therefore can't be vectorized.\n");
788 return false;
789 }
790
791 unsigned ElementwiseSpeed = 0;
792 (TTI).allowsMisalignedMemoryAccesses((F).getContext(), VecElemBits, AS,
793 Alignment, &ElementwiseSpeed);
794 if (VectorizedSpeed < ElementwiseSpeed) {
795 LLVM_DEBUG(dbgs()
796 << "LSV: Access of " << SizeBytes << "B in addrspace "
797 << AS << " with alignment " << Alignment.value()
798 << " has relative speed " << VectorizedSpeed
799 << ", which is lower than the elementwise speed of "
800 << ElementwiseSpeed
801 << ". Therefore this access won't be vectorized.\n");
802 return false;
803 }
804 return true;
805 };
806
807 // If we're loading/storing from an alloca, align it if possible.
808 //
809 // FIXME: We eagerly upgrade the alignment, regardless of whether TTI
810 // tells us this is beneficial. This feels a bit odd, but it matches
811 // existing tests. This isn't *so* bad, because at most we align to 4
812 // bytes (current value of StackAdjustedAlignment).
813 //
814 // FIXME: We will upgrade the alignment of the alloca even if it turns out
815 // we can't vectorize for some other reason.
816 Value *PtrOperand = getLoadStorePointerOperand(C[CBegin].Inst);
817 bool IsAllocaAccess = AS == DL.getAllocaAddrSpace() &&
818 isa<AllocaInst>(PtrOperand->stripPointerCasts());
819 Align Alignment = getLoadStoreAlignment(C[CBegin].Inst);
820 Align PrefAlign = Align(StackAdjustedAlignment);
821 if (IsAllocaAccess && Alignment.value() % SizeBytes != 0 &&
822 IsAllowedAndFast(PrefAlign)) {
823 Align NewAlign = getOrEnforceKnownAlignment(
824 PtrOperand, PrefAlign, DL, C[CBegin].Inst, nullptr, &DT);
825 if (NewAlign >= Alignment) {
826 LLVM_DEBUG(dbgs()
827 << "LSV: splitByChain upgrading alloca alignment from "
828 << Alignment.value() << " to " << NewAlign.value()
829 << "\n");
830 Alignment = NewAlign;
831 }
832 }
833
834 if (!IsAllowedAndFast(Alignment)) {
835 LLVM_DEBUG(
836 dbgs() << "LSV: splitChainByAlignment discarding candidate chain "
837 "because its alignment is not AllowedAndFast: "
838 << Alignment.value() << "\n");
839 continue;
840 }
841
842 if ((IsLoadChain &&
843 !TTI.isLegalToVectorizeLoadChain(SizeBytes, Alignment, AS)) ||
844 (!IsLoadChain &&
845 !TTI.isLegalToVectorizeStoreChain(SizeBytes, Alignment, AS))) {
846 LLVM_DEBUG(
847 dbgs() << "LSV: splitChainByAlignment discarding candidate chain "
848 "because !isLegalToVectorizeLoad/StoreChain.");
849 continue;
850 }
851
852 // Hooray, we can vectorize this chain!
853 Chain &NewChain = Ret.emplace_back();
854 for (unsigned I = CBegin; I <= CEnd; ++I)
855 NewChain.emplace_back(C[I]);
856 CBegin = CEnd; // Skip over the instructions we've added to the chain.
857 break;
858 }
859 }
860 return Ret;
861}
862
863bool Vectorizer::vectorizeChain(Chain &C) {
864 if (C.size() < 2)
865 return false;
866
867 sortChainInOffsetOrder(C);
868
869 LLVM_DEBUG({
870 dbgs() << "LSV: Vectorizing chain of " << C.size() << " instructions:\n";
871 dumpChain(C);
872 });
873
874 Type *VecElemTy = getChainElemTy(C);
875 bool IsLoadChain = isa<LoadInst>(C[0].Inst);
876 unsigned AS = getLoadStoreAddressSpace(C[0].Inst);
877 unsigned ChainBytes = std::accumulate(
878 C.begin(), C.end(), 0u, [&](unsigned Bytes, const ChainElem &E) {
879 return Bytes + DL.getTypeStoreSize(getLoadStoreType(E.Inst));
880 });
881 assert(ChainBytes % DL.getTypeStoreSize(VecElemTy) == 0);
882 // VecTy is a power of 2 and 1 byte at smallest, but VecElemTy may be smaller
883 // than 1 byte (e.g. VecTy == <32 x i1>).
884 Type *VecTy = FixedVectorType::get(
885 VecElemTy, 8 * ChainBytes / DL.getTypeSizeInBits(VecElemTy));
886
887 Align Alignment = getLoadStoreAlignment(C[0].Inst);
888 // If this is a load/store of an alloca, we might have upgraded the alloca's
889 // alignment earlier. Get the new alignment.
890 if (AS == DL.getAllocaAddrSpace()) {
891 Alignment = std::max(
892 Alignment,
893 getOrEnforceKnownAlignment(getLoadStorePointerOperand(C[0].Inst),
894 MaybeAlign(), DL, C[0].Inst, nullptr, &DT));
895 }
896
897 // All elements of the chain must have the same scalar-type size.
898#ifndef NDEBUG
899 for (const ChainElem &E : C)
900 assert(DL.getTypeStoreSize(getLoadStoreType(E.Inst)->getScalarType()) ==
901 DL.getTypeStoreSize(VecElemTy));
902#endif
903
904 Instruction *VecInst;
905 if (IsLoadChain) {
906 // Loads get hoisted to the location of the first load in the chain. We may
907 // also need to hoist the (transitive) operands of the loads.
908 Builder.SetInsertPoint(
909 llvm::min_element(C, [](const auto &A, const auto &B) {
910 return A.Inst->comesBefore(B.Inst);
911 })->Inst);
912
913 // Chain is in offset order, so C[0] is the instr with the lowest offset,
914 // i.e. the root of the vector.
915 VecInst = Builder.CreateAlignedLoad(VecTy,
916 getLoadStorePointerOperand(C[0].Inst),
917 Alignment);
918
919 unsigned VecIdx = 0;
920 for (const ChainElem &E : C) {
921 Instruction *I = E.Inst;
922 Value *V;
923 Type *T = getLoadStoreType(I);
924 if (auto *VT = dyn_cast<FixedVectorType>(T)) {
925 auto Mask = llvm::to_vector<8>(
926 llvm::seq<int>(VecIdx, VecIdx + VT->getNumElements()));
927 V = Builder.CreateShuffleVector(VecInst, Mask, I->getName());
928 VecIdx += VT->getNumElements();
929 } else {
930 V = Builder.CreateExtractElement(VecInst, Builder.getInt32(VecIdx),
931 I->getName());
932 ++VecIdx;
933 }
934 if (V->getType() != I->getType())
935 V = Builder.CreateBitOrPointerCast(V, I->getType());
936 I->replaceAllUsesWith(V);
937 }
938
939 // Finally, we need to reorder the instrs in the BB so that the (transitive)
940 // operands of VecInst appear before it. To see why, suppose we have
941 // vectorized the following code:
942 //
943 // ptr1 = gep a, 1
944 // load1 = load i32 ptr1
945 // ptr0 = gep a, 0
946 // load0 = load i32 ptr0
947 //
948 // We will put the vectorized load at the location of the earliest load in
949 // the BB, i.e. load1. We get:
950 //
951 // ptr1 = gep a, 1
952 // loadv = load <2 x i32> ptr0
953 // load0 = extractelement loadv, 0
954 // load1 = extractelement loadv, 1
955 // ptr0 = gep a, 0
956 //
957 // Notice that loadv uses ptr0, which is defined *after* it!
958 reorder(VecInst);
959 } else {
960 // Stores get sunk to the location of the last store in the chain.
961 Builder.SetInsertPoint(llvm::max_element(C, [](auto &A, auto &B) {
962 return A.Inst->comesBefore(B.Inst);
963 })->Inst);
964
965 // Build the vector to store.
966 Value *Vec = PoisonValue::get(VecTy);
967 unsigned VecIdx = 0;
968 auto InsertElem = [&](Value *V) {
969 if (V->getType() != VecElemTy)
970 V = Builder.CreateBitOrPointerCast(V, VecElemTy);
971 Vec = Builder.CreateInsertElement(Vec, V, Builder.getInt32(VecIdx++));
972 };
973 for (const ChainElem &E : C) {
974 auto *I = cast<StoreInst>(E.Inst);
975 if (FixedVectorType *VT =
976 dyn_cast<FixedVectorType>(getLoadStoreType(I))) {
977 for (int J = 0, JE = VT->getNumElements(); J < JE; ++J) {
978 InsertElem(Builder.CreateExtractElement(I->getValueOperand(),
979 Builder.getInt32(J)));
980 }
981 } else {
982 InsertElem(I->getValueOperand());
983 }
984 }
985
986 // Chain is in offset order, so C[0] is the instr with the lowest offset,
987 // i.e. the root of the vector.
988 VecInst = Builder.CreateAlignedStore(
989 Vec,
990 getLoadStorePointerOperand(C[0].Inst),
991 Alignment);
992 }
993
994 propagateMetadata(VecInst, C);
995
996 for (const ChainElem &E : C)
997 ToErase.emplace_back(E.Inst);
998
999 ++NumVectorInstructions;
1000 NumScalarsVectorized += C.size();
1001 return true;
1002}
1003
1004template <bool IsLoadChain>
1005bool Vectorizer::isSafeToMove(
1006 Instruction *ChainElem, Instruction *ChainBegin,
1007 const DenseMap<Instruction *, APInt /*OffsetFromLeader*/> &ChainOffsets,
1008 BatchAAResults &BatchAA) {
1009 LLVM_DEBUG(dbgs() << "LSV: isSafeToMove(" << *ChainElem << " -> "
1010 << *ChainBegin << ")\n");
1011
1012 assert(isa<LoadInst>(ChainElem) == IsLoadChain);
1013 if (ChainElem == ChainBegin)
1014 return true;
1015
1016 // Invariant loads can always be reordered; by definition they are not
1017 // clobbered by stores.
1018 if (isInvariantLoad(ChainElem))
1019 return true;
1020
1021 auto BBIt = std::next([&] {
1022 if constexpr (IsLoadChain)
1023 return BasicBlock::reverse_iterator(ChainElem);
1024 else
1025 return BasicBlock::iterator(ChainElem);
1026 }());
1027 auto BBItEnd = std::next([&] {
1028 if constexpr (IsLoadChain)
1029 return BasicBlock::reverse_iterator(ChainBegin);
1030 else
1031 return BasicBlock::iterator(ChainBegin);
1032 }());
1033
1034 const APInt &ChainElemOffset = ChainOffsets.at(ChainElem);
1035 const unsigned ChainElemSize =
1036 DL.getTypeStoreSize(getLoadStoreType(ChainElem));
1037
1038 for (; BBIt != BBItEnd; ++BBIt) {
1039 Instruction *I = &*BBIt;
1040
1041 if (!I->mayReadOrWriteMemory())
1042 continue;
1043
1044 // Loads can be reordered with other loads.
1045 if (IsLoadChain && isa<LoadInst>(I))
1046 continue;
1047
1048 // Stores can be sunk below invariant loads.
1049 if (!IsLoadChain && isInvariantLoad(I))
1050 continue;
1051
1052 // If I is in the chain, we can tell whether it aliases ChainElem by
1053 // checking what offset I accesses. This may be better than AA is able to do.
1054 //
1055 // We should really only have duplicate offsets for stores (the duplicate
1056 // loads should be CSE'ed), but in case we have a duplicate load, we'll
1057 // split the chain so we don't have to handle this case specially.
1058 if (auto OffsetIt = ChainOffsets.find(I); OffsetIt != ChainOffsets.end()) {
1059 // I and ChainElem overlap if:
1060 // - I and ChainElem have the same offset, OR
1061 // - I's offset is less than ChainElem's, but I touches past the
1062 // beginning of ChainElem, OR
1063 // - ChainElem's offset is less than I's, but ChainElem touches past the
1064 // beginning of I.
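      // For example (hypothetical offsets): a 4-byte ChainElem at offset 4
      // overlaps a 4-byte I starting anywhere in offsets 1..7, but not one
      // starting at offset 0 or at offset 8.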
1065 const APInt &IOffset = OffsetIt->second;
1066 unsigned IElemSize = DL.getTypeStoreSize(getLoadStoreType(I));
1067 if (IOffset == ChainElemOffset ||
1068 (IOffset.sle(ChainElemOffset) &&
1069 (IOffset + IElemSize).sgt(ChainElemOffset)) ||
1070 (ChainElemOffset.sle(IOffset) &&
1071 (ChainElemOffset + ChainElemSize).sgt(OffsetIt->second))) {
1072 LLVM_DEBUG({
1073 // Double check that AA also sees this alias. If not, we probably
1074 // have a bug.
1075 ModRefInfo MR =
1076 BatchAA.getModRefInfo(I, MemoryLocation::get(ChainElem));
1077 assert(IsLoadChain ? isModSet(MR) : isModOrRefSet(MR));
1078 dbgs() << "LSV: Found alias in chain: " << *I << "\n";
1079 });
1080 return false; // We found an aliasing instruction; bail.
1081 }
1082
1083 continue; // We're confident there's no alias.
1084 }
1085
1086 LLVM_DEBUG(dbgs() << "LSV: Querying AA for " << *I << "\n");
1087 ModRefInfo MR = BatchAA.getModRefInfo(I, MemoryLocation::get(ChainElem));
1088 if (IsLoadChain ? isModSet(MR) : isModOrRefSet(MR)) {
1089 LLVM_DEBUG(dbgs() << "LSV: Found alias in chain:\n"
1090 << " Aliasing instruction:\n"
1091 << " " << *I << '\n'
1092 << " Aliased instruction and pointer:\n"
1093 << " " << *ChainElem << '\n'
1094 << " " << *getLoadStorePointerOperand(ChainElem)
1095 << '\n');
1096
1097 return false;
1098 }
1099 }
1100 return true;
1101}
1102
1103static bool checkNoWrapFlags(Instruction *I, bool Signed) {
1104 BinaryOperator *BinOpI = cast<BinaryOperator>(I);
1105 return (Signed && BinOpI->hasNoSignedWrap()) ||
1106 (!Signed && BinOpI->hasNoUnsignedWrap());
1107}
1108
1109static bool checkIfSafeAddSequence(const APInt &IdxDiff, Instruction *AddOpA,
1110 unsigned MatchingOpIdxA, Instruction *AddOpB,
1111 unsigned MatchingOpIdxB, bool Signed) {
1112 LLVM_DEBUG(dbgs() << "LSV: checkIfSafeAddSequence IdxDiff=" << IdxDiff
1113 << ", AddOpA=" << *AddOpA << ", MatchingOpIdxA="
1114 << MatchingOpIdxA << ", AddOpB=" << *AddOpB
1115 << ", MatchingOpIdxB=" << MatchingOpIdxB
1116 << ", Signed=" << Signed << "\n");
1117 // If both OpA and OpB are adds with NSW/NUW and with one of the operands
1118 // being the same, we can guarantee that the transformation is safe if we can
1119 // prove that OpA won't overflow when Ret added to the other operand of OpA.
1120 // For example:
1121 // %tmp7 = add nsw i32 %tmp2, %v0
1122 // %tmp8 = sext i32 %tmp7 to i64
1123 // ...
1124 // %tmp11 = add nsw i32 %v0, 1
1125 // %tmp12 = add nsw i32 %tmp2, %tmp11
1126 // %tmp13 = sext i32 %tmp12 to i64
1127 //
1128 // Both %tmp7 and %tmp12 have the nsw flag and the first operand is %tmp2.
1129 // It's guaranteed that adding 1 to %tmp7 won't overflow because %tmp11 adds
1130 // 1 to %v0 and both %tmp11 and %tmp12 have the nsw flag.
1131 assert(AddOpA->getOpcode() == Instruction::Add &&
1132 AddOpB->getOpcode() == Instruction::Add &&
1133 checkNoWrapFlags(AddOpA, Signed) && checkNoWrapFlags(AddOpB, Signed));
1134 if (AddOpA->getOperand(MatchingOpIdxA) ==
1135 AddOpB->getOperand(MatchingOpIdxB)) {
1136 Value *OtherOperandA = AddOpA->getOperand(MatchingOpIdxA == 1 ? 0 : 1);
1137 Value *OtherOperandB = AddOpB->getOperand(MatchingOpIdxB == 1 ? 0 : 1);
1138 Instruction *OtherInstrA = dyn_cast<Instruction>(OtherOperandA);
1139 Instruction *OtherInstrB = dyn_cast<Instruction>(OtherOperandB);
1140 // Match `x +nsw/nuw y` and `x +nsw/nuw (y +nsw/nuw IdxDiff)`.
1141 if (OtherInstrB && OtherInstrB->getOpcode() == Instruction::Add &&
1142 checkNoWrapFlags(OtherInstrB, Signed) &&
1143 isa<ConstantInt>(OtherInstrB->getOperand(1))) {
1144 int64_t CstVal =
1145 cast<ConstantInt>(OtherInstrB->getOperand(1))->getSExtValue();
1146 if (OtherInstrB->getOperand(0) == OtherOperandA &&
1147 IdxDiff.getSExtValue() == CstVal)
1148 return true;
1149 }
1150 // Match `x +nsw/nuw (y +nsw/nuw -IdxDiff)` and `x +nsw/nuw y`.
1151 if (OtherInstrA && OtherInstrA->getOpcode() == Instruction::Add &&
1152 checkNoWrapFlags(OtherInstrA, Signed) &&
1153 isa<ConstantInt>(OtherInstrA->getOperand(1))) {
1154 int64_t CstVal =
1155 cast<ConstantInt>(OtherInstrA->getOperand(1))->getSExtValue();
1156 if (OtherInstrA->getOperand(0) == OtherOperandB &&
1157 IdxDiff.getSExtValue() == -CstVal)
1158 return true;
1159 }
1160 // Match `x +nsw/nuw (y +nsw/nuw c)` and
1161 // `x +nsw/nuw (y +nsw/nuw (c + IdxDiff))`.
1162 if (OtherInstrA && OtherInstrB &&
1163 OtherInstrA->getOpcode() == Instruction::Add &&
1164 OtherInstrB->getOpcode() == Instruction::Add &&
1165 checkNoWrapFlags(OtherInstrA, Signed) &&
1166 checkNoWrapFlags(OtherInstrB, Signed) &&
1167 isa<ConstantInt>(OtherInstrA->getOperand(1)) &&
1168 isa<ConstantInt>(OtherInstrB->getOperand(1))) {
1169 int64_t CstValA =
1170 cast<ConstantInt>(OtherInstrA->getOperand(1))->getSExtValue();
1171 int64_t CstValB =
1172 cast<ConstantInt>(OtherInstrB->getOperand(1))->getSExtValue();
1173 if (OtherInstrA->getOperand(0) == OtherInstrB->getOperand(0) &&
1174 IdxDiff.getSExtValue() == (CstValB - CstValA))
1175 return true;
1176 }
1177 }
1178 return false;
1179}
1180
1181std::optional<APInt> Vectorizer::getConstantOffsetComplexAddrs(
1182 Value *PtrA, Value *PtrB, Instruction *ContextInst, unsigned Depth) {
1183 LLVM_DEBUG(dbgs() << "LSV: getConstantOffsetComplexAddrs PtrA=" << *PtrA
1184 << " PtrB=" << *PtrB << " ContextInst=" << *ContextInst
1185 << " Depth=" << Depth << "\n");
1186 auto *GEPA = dyn_cast<GetElementPtrInst>(PtrA);
1187 auto *GEPB = dyn_cast<GetElementPtrInst>(PtrB);
1188 if (!GEPA || !GEPB)
1189 return getConstantOffsetSelects(PtrA, PtrB, ContextInst, Depth);
1190
1191 // Look through GEPs after checking they're the same except for the last
1192 // index.
1193 if (GEPA->getNumOperands() != GEPB->getNumOperands() ||
1194 GEPA->getPointerOperand() != GEPB->getPointerOperand())
1195 return std::nullopt;
1196 gep_type_iterator GTIA = gep_type_begin(GEPA);
1197 gep_type_iterator GTIB = gep_type_begin(GEPB);
1198 for (unsigned I = 0, E = GEPA->getNumIndices() - 1; I < E; ++I) {
1199 if (GTIA.getOperand() != GTIB.getOperand())
1200 return std::nullopt;
1201 ++GTIA;
1202 ++GTIB;
1203 }
1204
1205 Instruction *OpA = dyn_cast<Instruction>(GTIA.getOperand());
1206 Instruction *OpB = dyn_cast<Instruction>(GTIB.getOperand());
1207 if (!OpA || !OpB || OpA->getOpcode() != OpB->getOpcode() ||
1208 OpA->getType() != OpB->getType())
1209 return std::nullopt;
1210
1211 uint64_t Stride = GTIA.getSequentialElementStride(DL);
1212
1213 // Only look through a ZExt/SExt.
1214 if (!isa<SExtInst>(OpA) && !isa<ZExtInst>(OpA))
1215 return std::nullopt;
1216
1217 bool Signed = isa<SExtInst>(OpA);
1218
1219 // At this point A could be a function parameter, i.e. not an instruction
1220 Value *ValA = OpA->getOperand(0);
1221 OpB = dyn_cast<Instruction>(OpB->getOperand(0));
1222 if (!OpB || ValA->getType() != OpB->getType())
1223 return std::nullopt;
1224
1225 const SCEV *OffsetSCEVA = SE.getSCEV(ValA);
1226 const SCEV *OffsetSCEVB = SE.getSCEV(OpB);
1227 const SCEV *IdxDiffSCEV = SE.getMinusSCEV(OffsetSCEVB, OffsetSCEVA);
1228 if (IdxDiffSCEV == SE.getCouldNotCompute())
1229 return std::nullopt;
1230
1231 ConstantRange IdxDiffRange = SE.getSignedRange(IdxDiffSCEV);
1232 if (!IdxDiffRange.isSingleElement())
1233 return std::nullopt;
1234 APInt IdxDiff = *IdxDiffRange.getSingleElement();
1235
1236 LLVM_DEBUG(dbgs() << "LSV: getConstantOffsetComplexAddrs IdxDiff=" << IdxDiff
1237 << "\n");
1238
1239 // Now we need to prove that adding IdxDiff to ValA won't overflow.
1240 bool Safe = false;
1241
1242 // First attempt: if OpB is an add with NSW/NUW, and OpB is IdxDiff added to
1243 // ValA, we're okay.
1244 if (OpB->getOpcode() == Instruction::Add &&
1245 isa<ConstantInt>(OpB->getOperand(1)) &&
1246 IdxDiff.sle(cast<ConstantInt>(OpB->getOperand(1))->getSExtValue()) &&
1247 checkNoWrapFlags(OpB, Signed))
1248 Safe = true;
1249
1250 // Second attempt: check if we have eligible add NSW/NUW instruction
1251 // sequences.
1252 OpA = dyn_cast<Instruction>(ValA);
1253 if (!Safe && OpA && OpA->getOpcode() == Instruction::Add &&
1254 OpB->getOpcode() == Instruction::Add && checkNoWrapFlags(OpA, Signed) &&
1255 checkNoWrapFlags(OpB, Signed)) {
1256 // In the checks below a matching operand in OpA and OpB is an operand which
1257 // is the same in those two instructions. Below we account for possible
1258 // orders of the operands of these add instructions.
1259 for (unsigned MatchingOpIdxA : {0, 1})
1260 for (unsigned MatchingOpIdxB : {0, 1})
1261 if (!Safe)
1262 Safe = checkIfSafeAddSequence(IdxDiff, OpA, MatchingOpIdxA, OpB,
1263 MatchingOpIdxB, Signed);
1264 }
1265
1266 unsigned BitWidth = ValA->getType()->getScalarSizeInBits();
1267
1268 // Third attempt:
1269 //
1270 // Assuming IdxDiff is positive: If all set bits of IdxDiff or any higher
1271 // order bit other than the sign bit are known to be zero in ValA, we can add
1272 // Diff to it while guaranteeing no overflow of any sort.
1273 //
1274 // If IdxDiff is negative, do the same, but swap ValA and ValB.
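  //
  // For example (hypothetical): if IdxDiff is 1 and ValA's low bit is known
  // zero, adding 1 merely sets that bit and cannot carry into higher bits, so
  // neither signed nor unsigned overflow can occur.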
1275 if (!Safe) {
1276 // When computing known bits, use the GEPs as context instructions, since
1277 // they likely are in the same BB as the load/store.
1278 KnownBits Known(BitWidth);
1279 computeKnownBits((IdxDiff.sge(0) ? ValA : OpB), Known, DL, &AC, ContextInst,
1280 &DT);
1281 APInt BitsAllowedToBeSet = Known.Zero.zext(IdxDiff.getBitWidth());
1282 if (Signed)
1283 BitsAllowedToBeSet.clearBit(BitWidth - 1);
1284 Safe = BitsAllowedToBeSet.uge(IdxDiff.abs());
1285 }
1286
1287 if (Safe)
1288 return IdxDiff * Stride;
1289 return std::nullopt;
1290}
1291
1292std::optional<APInt> Vectorizer::getConstantOffsetSelects(
1293 Value *PtrA, Value *PtrB, Instruction *ContextInst, unsigned Depth) {
1294 if (Depth++ == MaxDepth)
1295 return std::nullopt;
1296
1297 if (auto *SelectA = dyn_cast<SelectInst>(PtrA)) {
1298 if (auto *SelectB = dyn_cast<SelectInst>(PtrB)) {
1299 if (SelectA->getCondition() != SelectB->getCondition())
1300 return std::nullopt;
1301 LLVM_DEBUG(dbgs() << "LSV: getConstantOffsetSelects, PtrA=" << *PtrA
1302 << ", PtrB=" << *PtrB << ", ContextInst="
1303 << *ContextInst << ", Depth=" << Depth << "\n");
1304 std::optional<APInt> TrueDiff = getConstantOffset(
1305 SelectA->getTrueValue(), SelectB->getTrueValue(), ContextInst, Depth);
1306 if (!TrueDiff)
1307 return std::nullopt;
1308 std::optional<APInt> FalseDiff =
1309 getConstantOffset(SelectA->getFalseValue(), SelectB->getFalseValue(),
1310 ContextInst, Depth);
1311 if (TrueDiff == FalseDiff)
1312 return TrueDiff;
1313 }
1314 }
1315 return std::nullopt;
1316}
1317
1318void Vectorizer::mergeEquivalenceClasses(EquivalenceClassMap &EQClasses) const {
1319 if (EQClasses.size() < 2) // There is nothing to merge.
1320 return;
1321
1322 // The reduced key has all elements of the EqClassKey except the underlying
1323 // object. Check that EqClassKey has 4 elements and define the reduced key.
1324 static_assert(std::tuple_size_v<EqClassKey> == 4,
1325 "EqClassKey has changed - EqClassReducedKey needs changes too");
1326 using EqClassReducedKey =
1327 std::tuple<std::tuple_element_t<1, EqClassKey> /* AddrSpace */,
1328 std::tuple_element_t<2, EqClassKey> /* Element size */,
1329 std::tuple_element_t<3, EqClassKey> /* IsLoad; */>;
1330 using ECReducedKeyToUnderlyingObjectMap =
1331 MapVector<EqClassReducedKey,
1332 SmallPtrSet<std::tuple_element_t<0, EqClassKey>, 4>>;
1333
1334 // Form a map from the reduced key (without the underlying object) to the
1335 // underlying objects: 1 reduced key to many underlying objects, to form
1336 // groups of potentially merge-able equivalence classes.
1337 ECReducedKeyToUnderlyingObjectMap RedKeyToUOMap;
1338 bool FoundPotentiallyOptimizableEC = false;
1339 for (const auto &EC : EQClasses) {
1340 const auto &Key = EC.first;
1341 EqClassReducedKey RedKey{std::get<1>(Key), std::get<2>(Key),
1342 std::get<3>(Key)};
1343 auto &UOMap = RedKeyToUOMap[RedKey];
1344 UOMap.insert(std::get<0>(Key));
1345 if (UOMap.size() > 1)
1346 FoundPotentiallyOptimizableEC = true;
1347 }
1348 if (!FoundPotentiallyOptimizableEC)
1349 return;
1350
1351 LLVM_DEBUG({
1352 dbgs() << "LSV: mergeEquivalenceClasses: before merging:\n";
1353 for (const auto &EC : EQClasses) {
1354 dbgs() << " Key: {" << EC.first << "}\n";
1355 for (const auto &Inst : EC.second)
1356 dbgs() << " Inst: " << *Inst << '\n';
1357 }
1358 });
1359 LLVM_DEBUG({
1360 dbgs() << "LSV: mergeEquivalenceClasses: RedKeyToUOMap:\n";
1361 for (const auto &RedKeyToUO : RedKeyToUOMap) {
1362 dbgs() << " Reduced key: {" << std::get<0>(RedKeyToUO.first) << ", "
1363 << std::get<1>(RedKeyToUO.first) << ", "
1364 << static_cast<int>(std::get<2>(RedKeyToUO.first)) << "} --> "
1365 << RedKeyToUO.second.size() << " underlying objects:\n";
1366 for (auto UObject : RedKeyToUO.second)
1367 dbgs() << " " << *UObject << '\n';
1368 }
1369 });
1370
1371 using UObjectToUObjectMap = DenseMap<const Value *, const Value *>;
1372
1373 // Compute the ultimate targets for a set of underlying objects.
1374 auto GetUltimateTargets =
1375 [](SmallPtrSetImpl<const Value *> &UObjects) -> UObjectToUObjectMap {
1376 UObjectToUObjectMap IndirectionMap;
1377 for (const auto *UObject : UObjects) {
1378 const unsigned MaxLookupDepth = 1; // look for 1-level indirections only
1379 const auto *UltimateTarget = getUnderlyingObject(UObject, MaxLookupDepth);
1380 if (UltimateTarget != UObject)
1381 IndirectionMap[UObject] = UltimateTarget;
1382 }
1383 UObjectToUObjectMap UltimateTargetsMap;
1384 for (const auto *UObject : UObjects) {
1385 auto Target = UObject;
1386 auto It = IndirectionMap.find(Target);
1387 for (; It != IndirectionMap.end(); It = IndirectionMap.find(Target))
1388 Target = It->second;
1389 UltimateTargetsMap[UObject] = Target;
1390 }
1391 return UltimateTargetsMap;
1392 };
1393
1394 // For each item in RedKeyToUOMap, if it has more than one underlying object,
1395 // try to merge the equivalence classes.
1396 for (auto &[RedKey, UObjects] : RedKeyToUOMap) {
1397 if (UObjects.size() < 2)
1398 continue;
1399 auto UTMap = GetUltimateTargets(UObjects);
1400 for (const auto &[UObject, UltimateTarget] : UTMap) {
1401 if (UObject == UltimateTarget)
1402 continue;
1403
1404 EqClassKey KeyFrom{UObject, std::get<0>(RedKey), std::get<1>(RedKey),
1405 std::get<2>(RedKey)};
1406 EqClassKey KeyTo{UltimateTarget, std::get<0>(RedKey), std::get<1>(RedKey),
1407 std::get<2>(RedKey)};
1408 // The entry for KeyFrom is guaranteed to exist, unlike KeyTo. Thus,
1409 // request the reference to the instructions vector for KeyTo first.
1410 const auto &VecTo = EQClasses[KeyTo];
1411 const auto &VecFrom = EQClasses[KeyFrom];
1412 SmallVector<Instruction *, 8> MergedVec;
1413 std::merge(VecFrom.begin(), VecFrom.end(), VecTo.begin(), VecTo.end(),
1414 std::back_inserter(MergedVec),
1415 [](Instruction *A, Instruction *B) {
1416 return A && B && A->comesBefore(B);
1417 });
1418 EQClasses[KeyTo] = std::move(MergedVec);
1419 EQClasses.erase(KeyFrom);
1420 }
1421 }
1422 LLVM_DEBUG({
1423 dbgs() << "LSV: mergeEquivalenceClasses: after merging:\n";
1424 for (const auto &EC : EQClasses) {
1425 dbgs() << " Key: {" << EC.first << "}\n";
1426 for (const auto &Inst : EC.second)
1427 dbgs() << " Inst: " << *Inst << '\n';
1428 }
1429 });
1430}
1431
1432EquivalenceClassMap
1433Vectorizer::collectEquivalenceClasses(BasicBlock::iterator Begin,
1434 BasicBlock::iterator End) {
1435 EquivalenceClassMap Ret;
1436
1437 auto GetUnderlyingObject = [](const Value *Ptr) -> const Value * {
1438 const Value *ObjPtr = llvm::getUnderlyingObject(Ptr);
1439 if (const auto *Sel = dyn_cast<SelectInst>(ObjPtr)) {
1440 // The selects themselves are distinct instructions even if they share
1441 // the same condition and evaluate to consecutive pointers for true and
1442 // false values of the condition. Therefore using the selects themselves
1443 // for grouping instructions would put consecutive accesses into different
1444 // lists and they won't even be checked for being consecutive, and won't
1445 // be vectorized.
1446 return Sel->getCondition();
1447 }
1448 return ObjPtr;
1449 };
1450
1451 for (Instruction &I : make_range(Begin, End)) {
1452 auto *LI = dyn_cast<LoadInst>(&I);
1453 auto *SI = dyn_cast<StoreInst>(&I);
1454 if (!LI && !SI)
1455 continue;
1456
1457 if ((LI && !LI->isSimple()) || (SI && !SI->isSimple()))
1458 continue;
1459
1460 if ((LI && !TTI.isLegalToVectorizeLoad(LI)) ||
1461 (SI && !TTI.isLegalToVectorizeStore(SI)))
1462 continue;
1463
1464 Type *Ty = getLoadStoreType(&I);
1465 if (!VectorType::isValidElementType(Ty->getScalarType()))
1466 continue;
1467
1468 // Skip weird non-byte sizes. They probably aren't worth the effort of
1469 // handling correctly.
1470 unsigned TySize = DL.getTypeSizeInBits(Ty);
1471 if ((TySize % 8) != 0)
1472 continue;
1473
1474 // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain
1475 // functions are currently using an integer type for the vectorized
1476 // load/store, and do not support casting between the integer type and a
1477 // vector of pointers (e.g. i64 to <2 x i16*>).
1478 if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())
1479 continue;
1480
1481 Value *Ptr = getLoadStorePointerOperand(&I);
1482 unsigned AS = Ptr->getType()->getPointerAddressSpace();
1483 unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
1484
1485 unsigned VF = VecRegSize / TySize;
1486 VectorType *VecTy = dyn_cast<VectorType>(Ty);
1487
1488 // Only handle power-of-two sized elements.
1489 if ((!VecTy && !isPowerOf2_32(DL.getTypeSizeInBits(Ty))) ||
1490 (VecTy && !isPowerOf2_32(DL.getTypeSizeInBits(VecTy->getScalarType()))))
1491 continue;
1492
1493 // No point in looking at these if they're too big to vectorize.
1494 if (TySize > VecRegSize / 2 ||
1495 (VecTy && TTI.getLoadVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
1496 continue;
1497
1498 Ret[{GetUnderlyingObject(Ptr), AS,
1499 DL.getTypeSizeInBits(getLoadStoreType(&I)->getScalarType()),
1500 /*IsLoad=*/LI != nullptr}]
1501 .emplace_back(&I);
1502 }
1503
1504 mergeEquivalenceClasses(Ret);
1505 return Ret;
1506}
1507
1508std::vector<Chain> Vectorizer::gatherChains(ArrayRef<Instruction *> Instrs) {
1509 if (Instrs.empty())
1510 return {};
1511
1512 unsigned AS = getLoadStoreAddressSpace(Instrs[0]);
1513 unsigned ASPtrBits = DL.getIndexSizeInBits(AS);
1514
1515#ifndef NDEBUG
1516 // Check that Instrs is in BB order and all have the same addr space.
1517 for (size_t I = 1; I < Instrs.size(); ++I) {
1518 assert(Instrs[I - 1]->comesBefore(Instrs[I]));
1519 assert(getLoadStoreAddressSpace(Instrs[I]) == AS);
1520 }
1521#endif
1522
1523 // Machinery to build an MRU-hashtable of Chains.
1524 //
1525 // (Ideally this could be done with MapVector, but as currently implemented,
1526 // moving an element to the front of a MapVector is O(n).)
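  //
  // Roughly: an intrusive list of (leader instruction, Chain) pairs kept in
  // most-recently-used order, with entries bump-allocated and also tracked in
  // a DenseSet keyed on the leader instruction.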
1527 struct InstrListElem : ilist_node<InstrListElem>,
1528 std::pair<Instruction *, Chain> {
1529 explicit InstrListElem(Instruction *I)
1530 : std::pair<Instruction *, Chain>(I, {}) {}
1531 };
1532 struct InstrListElemDenseMapInfo {
1533 using PtrInfo = DenseMapInfo<InstrListElem *>;
1534 using IInfo = DenseMapInfo<Instruction *>;
1535 static InstrListElem *getEmptyKey() { return PtrInfo::getEmptyKey(); }
1536 static InstrListElem *getTombstoneKey() {
1537 return PtrInfo::getTombstoneKey();
1538 }
1539 static unsigned getHashValue(const InstrListElem *E) {
1540 return IInfo::getHashValue(E->first);
1541 }
1542 static bool isEqual(const InstrListElem *A, const InstrListElem *B) {
1543 if (A == getEmptyKey() || B == getEmptyKey())
1544 return A == getEmptyKey() && B == getEmptyKey();
1545 if (A == getTombstoneKey() || B == getTombstoneKey())
1546 return A == getTombstoneKey() && B == getTombstoneKey();
1547 return IInfo::isEqual(A->first, B->first);
1548 }
1549 };
1550 SpecificBumpPtrAllocator<InstrListElem> Allocator;
1551 simple_ilist<InstrListElem> MRU;
1552 DenseSet<InstrListElem *, InstrListElemDenseMapInfo> Chains;
1553
1554 // Compare each instruction in `Instrs` to the leaders of the N most recently-used
1555 // chains. This limits the O(n^2) behavior of this pass while also allowing
1556 // us to build arbitrarily long chains.
1557 for (Instruction *I : Instrs) {
1558 constexpr int MaxChainsToTry = 64;
1559
1560 bool MatchFound = false;
1561 auto ChainIter = MRU.begin();
1562 for (size_t J = 0; J < MaxChainsToTry && ChainIter != MRU.end();
1563 ++J, ++ChainIter) {
1564 if (std::optional<APInt> Offset = getConstantOffset(
1565 getLoadStorePointerOperand(ChainIter->first),
1566 getLoadStorePointerOperand(I),
1567 /*ContextInst=*/
1568 (ChainIter->first->comesBefore(I) ? I : ChainIter->first))) {
1569 // `Offset` might not have the expected number of bits, if e.g. AS has a
1570 // different number of bits than opaque pointers.
1571 ChainIter->second.emplace_back(I, Offset.value());
1572 // Move ChainIter to the front of the MRU list.
1573 MRU.remove(*ChainIter);
1574 MRU.push_front(*ChainIter);
1575 MatchFound = true;
1576 break;
1577 }
1578 }
1579
1580 if (!MatchFound) {
1581 APInt ZeroOffset(ASPtrBits, 0);
1582 InstrListElem *E = new (Allocator.Allocate()) InstrListElem(I);
1583 E->second.emplace_back(I, ZeroOffset);
1584 MRU.push_front(*E);
1585 Chains.insert(E);
1586 }
1587 }
1588
1589 std::vector<Chain> Ret;
1590 Ret.reserve(Chains.size());
1591 // Iterate over MRU rather than Chains so the order is deterministic.
1592 for (auto &E : MRU)
1593 if (E.second.size() > 1)
1594 Ret.emplace_back(std::move(E.second));
1595 return Ret;
1596}
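To make the MRU chain-building concrete, here is a minimal sketch under toy assumptions: plain integer addresses instead of pointers, and an arbitrary "same aligned 64-byte block" test (OffsetKnown) standing in for getConstantOffset succeeding. The names Access, ToyChain, and the addresses are all hypothetical; the sketch only shows the compare-against-recent-leaders loop and the move-to-front step used above:

#include <cstddef>
#include <cstdint>
#include <list>
#include <utility>
#include <vector>

struct Access { int Id; int64_t Addr; }; // toy stand-in for a load/store instr

int main() {
  constexpr std::size_t MaxChainsToTry = 64;
  // Toy accesses: 100, 104, 108 are mutually "offsetable"; 200 is not.
  std::vector<Access> Accesses = {{0, 100}, {1, 104}, {2, 200}, {3, 108}};

  using Member = std::pair<int, int64_t>;                   // (id, offset from leader)
  using ToyChain = std::pair<int64_t, std::vector<Member>>; // (leader addr, members)
  std::list<ToyChain> MRU;                                  // front = most recently used

  // Stand-in for getConstantOffset succeeding: an offset is "known" only when
  // both addresses fall in the same aligned 64-byte block (arbitrary toy rule).
  auto OffsetKnown = [](int64_t A, int64_t B) { return A / 64 == B / 64; };

  for (const Access &A : Accesses) {
    bool Matched = false;
    std::size_t Tried = 0;
    for (auto It = MRU.begin(); It != MRU.end() && Tried < MaxChainsToTry;
         ++It, ++Tried) {
      if (!OffsetKnown(It->first, A.Addr))
        continue;
      It->second.emplace_back(A.Id, A.Addr - It->first); // join the chain
      MRU.splice(MRU.begin(), MRU, It);                   // move chain to MRU front
      Matched = true;
      break;
    }
    if (!Matched) // no recent leader matched: start a new chain led by A
      MRU.push_front({A.Addr, {{A.Id, 0}}});
  }
  // Result: one chain with ids {0, 1, 3} and a singleton with id {2};
  // singleton chains are dropped, as in gatherChains above.
  return 0;
}

The move-to-front step is what keeps the search bounded: a hot chain stays among the first MaxChainsToTry leaders, so long contiguous runs keep extending one chain instead of spawning many singletons.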
1597
1598std::optional<APInt> Vectorizer::getConstantOffset(Value *PtrA, Value *PtrB,
1599 Instruction *ContextInst,
1600 unsigned Depth) {
1601 LLVM_DEBUG(dbgs() << "LSV: getConstantOffset, PtrA=" << *PtrA
1602 << ", PtrB=" << *PtrB << ", ContextInst= " << *ContextInst
1603 << ", Depth=" << Depth << "\n");
1604 // We'll ultimately return a value of this bit width, even if computations
1605 // happen in a different width.
1606 unsigned OrigBitWidth = DL.getIndexTypeSizeInBits(PtrA->getType());
1607 APInt OffsetA(OrigBitWidth, 0);
1608 APInt OffsetB(OrigBitWidth, 0);
1609 PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
1610 PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
1611 unsigned NewPtrBitWidth = DL.getTypeStoreSizeInBits(PtrA->getType());
1612 if (NewPtrBitWidth != DL.getTypeStoreSizeInBits(PtrB->getType()))
1613 return std::nullopt;
1614
1615 // If we have to shrink the pointer, stripAndAccumulateInBoundsConstantOffsets
1616 // should properly handle a possible overflow and the value should fit into
1617 // the smallest data type used in the cast/gep chain.
1618 assert(OffsetA.getSignificantBits() <= NewPtrBitWidth &&
1619 OffsetB.getSignificantBits() <= NewPtrBitWidth);
1620
1621 OffsetA = OffsetA.sextOrTrunc(NewPtrBitWidth);
1622 OffsetB = OffsetB.sextOrTrunc(NewPtrBitWidth);
1623 if (PtrA == PtrB)
1624 return (OffsetB - OffsetA).sextOrTrunc(OrigBitWidth);
1625
1626 // Try to compute B - A.
1627 const SCEV *DistScev = SE.getMinusSCEV(SE.getSCEV(PtrB), SE.getSCEV(PtrA));
1628 if (DistScev != SE.getCouldNotCompute()) {
1629 LLVM_DEBUG(dbgs() << "LSV: SCEV PtrB - PtrA =" << *DistScev << "\n");
1630 ConstantRange DistRange = SE.getSignedRange(DistScev);
1631 if (DistRange.isSingleElement()) {
1632 // Handle index width (the width of Dist) != pointer width (the width of
1633 // the Offset*s at this point).
1634 APInt Dist = DistRange.getSingleElement()->sextOrTrunc(NewPtrBitWidth);
1635 return (OffsetB - OffsetA + Dist).sextOrTrunc(OrigBitWidth);
1636 }
1637 }
1638 if (std::optional<APInt> Diff =
1639 getConstantOffsetComplexAddrs(PtrA, PtrB, ContextInst, Depth))
1640 return (OffsetB - OffsetA + Diff->sext(OffsetB.getBitWidth()))
1641 .sextOrTrunc(OrigBitWidth);
1642 return std::nullopt;
1643}