1//===- InterleavedLoadCombine.cpp - Combine Interleaved Loads ---*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// \file
10//
11// This file defines the interleaved-load-combine pass. The pass searches for
12// ShuffleVectorInst instructions that execute interleaving loads. If a matching
13// pattern is found, it adds a combined load and further instructions in a
14// pattern that is detectable by InterleavedAccessPass. The old instructions are
15// left dead to be removed later. The pass is specifically designed to be
16// executed just before InterleavedAccessPass to find any left-over instances
17// that are not detected by earlier passes.
18//
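// As an illustration (a hedged sketch, not an in-tree test case): for an
// interleave factor of two the pass looks for IR of roughly this shape, where
// two shufflevector instructions pick the even and the odd lanes out of a pair
// of adjacent <4 x float> loads (%p0, %p1, %v0 and %v1 are placeholder names):
//
//   %v0 = load <4 x float>, ptr %p0
//   %v1 = load <4 x float>, ptr %p1   ; %p1 == %p0 plus 16 bytes
//   %even = shufflevector <4 x float> %v0, <4 x float> %v1,
//                         <4 x i32> <i32 0, i32 2, i32 4, i32 6>
//   %odd  = shufflevector <4 x float> %v0, <4 x float> %v1,
//                         <4 x i32> <i32 1, i32 3, i32 5, i32 7>
//
// Such shuffles are replaced by one wide load plus strided shuffles; see
// InterleavedLoadCombineImpl::combine for the emitted form.
//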
19//===----------------------------------------------------------------------===//
20
21#include "llvm/ADT/Statistic.h"
22#include "llvm/Analysis/MemorySSA.h"
23#include "llvm/Analysis/MemorySSAUpdater.h"
24#include "llvm/Analysis/OptimizationRemarkEmitter.h"
25#include "llvm/Analysis/TargetTransformInfo.h"
26#include "llvm/CodeGen/InterleavedLoadCombine.h"
27#include "llvm/CodeGen/Passes.h"
28#include "llvm/CodeGen/TargetLowering.h"
29#include "llvm/CodeGen/TargetPassConfig.h"
30#include "llvm/CodeGen/TargetSubtargetInfo.h"
31#include "llvm/IR/DataLayout.h"
32#include "llvm/IR/Dominators.h"
33#include "llvm/IR/Function.h"
34#include "llvm/IR/IRBuilder.h"
35#include "llvm/IR/Instructions.h"
36#include "llvm/InitializePasses.h"
37#include "llvm/Pass.h"
38#include "llvm/Support/Debug.h"
39#include "llvm/Support/ErrorHandling.h"
40#include "llvm/Support/raw_ostream.h"
41#include "llvm/Target/TargetMachine.h"
42
43#include <algorithm>
44#include <cassert>
45#include <list>
46
47using namespace llvm;
48
49#define DEBUG_TYPE "interleaved-load-combine"
50
51namespace {
52
53/// Statistic counter
54STATISTIC(NumInterleavedLoadCombine, "Number of combined loads");
55
56/// Option to disable the pass
57static cl::opt<bool> DisableInterleavedLoadCombine(
58 "disable-" DEBUG_TYPE, cl::init(false), cl::Hidden,
59 cl::desc("Disable combining of interleaved loads"));
60
61struct VectorInfo;
62
63struct InterleavedLoadCombineImpl {
64public:
65 InterleavedLoadCombineImpl(Function &F, DominatorTree &DT, MemorySSA &MSSA,
66 const TargetTransformInfo &TTI,
67 const TargetMachine &TM)
68 : F(F), DT(DT), MSSA(MSSA),
69 TLI(*TM.getSubtargetImpl(F)->getTargetLowering()), TTI(TTI) {}
70
71 /// Scan the function for interleaved load candidates and execute the
72 /// replacement if applicable.
73 bool run();
74
75private:
76 /// Function this pass is working on
77 Function &F;
78
79 /// Dominator Tree Analysis
80 DominatorTree &DT;
81
82 /// Memory Alias Analyses
83 MemorySSA &MSSA;
84
85 /// Target Lowering Information
86 const TargetLowering &TLI;
87
88 /// Target Transform Information
89 const TargetTransformInfo &TTI;
90
91 /// Find the instruction in the set LIs that dominates all others; return nullptr
92 /// if there is none.
93 LoadInst *findFirstLoad(const std::set<LoadInst *> &LIs);
94
95 /// Replace interleaved load candidates. It does additional
96 /// analyses if this makes sense. Returns true on success and false
97 /// if nothing has been changed.
98 bool combine(std::list<VectorInfo> &InterleavedLoad,
99 OptimizationRemarkEmitter &ORE);
100
101 /// Given a set of VectorInfo containing candidates for a given interleave
102 /// factor, find a set that represents a 'factor' interleaved load.
103 bool findPattern(std::list<VectorInfo> &Candidates,
104 std::list<VectorInfo> &InterleavedLoad, unsigned Factor,
105 const DataLayout &DL);
106}; // InterleavedLoadCombine
107
108/// First Order Polynomial on an n-Bit Integer Value
109///
110/// Polynomial(Value) = Value * B + A + E*2^(n-e)
111///
112/// A and B are the coefficients. E*2^(n-e) is an error within 'e' most
113/// significant bits. It is introduced if an exact computation cannot be proven
114 /// (e.g. division by 2).
115///
116 /// As part of this optimization multiple loads will be combined. It is necessary
117/// to prove that loads are within some relative offset to each other. This
118/// class is used to prove relative offsets of values loaded from memory.
119///
120/// Representing an integer in this form is sound since addition in two's
121/// complement is associative (trivial) and multiplication distributes over the
122/// addition (see Proof(1) in Polynomial::mul). Further, both operations
123/// commute.
124//
125// Example:
126 // define void @fn(i64 %IDX, <4 x float>* %PTR) {
127// %Pa1 = add i64 %IDX, 2
128// %Pa2 = lshr i64 %Pa1, 1
129// %Pa3 = getelementptr inbounds <4 x float>, <4 x float>* %PTR, i64 %Pa2
130// %Va = load <4 x float>, <4 x float>* %Pa3
131//
132// %Pb1 = add i64 %IDX, 4
133// %Pb2 = lshr i64 %Pb1, 1
134// %Pb3 = getelementptr inbounds <4 x float>, <4 x float>* %PTR, i64 %Pb2
135// %Vb = load <4 x float>, <4 x float>* %Pb3
136// ... }
137//
138// The goal is to prove that two loads load consecutive addresses.
139//
140// In this case the polynomials are constructed by the following
141// steps.
142//
143// The number tag #e specifies the error bits.
144//
145 // Pa_0 = %IDX                 #0
146 // Pa_1 = %IDX + 2             #0 | add 2
147 // Pa_2 = %IDX/2 + 1           #1 | lshr 1
148 // Pa_3 = %IDX/2 + 1           #1 | GEP, step signext to i64
149 // Pa_4 = (%IDX/2)*16 + 16     #0 | GEP, multiply index by sizeof(<4 x float>) = 16
150 // Pa_5 = (%IDX/2)*16 + 16     #0 | GEP, add offset of leading components
151 //
152 // Pb_0 = %IDX                 #0
153 // Pb_1 = %IDX + 4             #0 | add 4
154 // Pb_2 = %IDX/2 + 2           #1 | lshr 1
155 // Pb_3 = %IDX/2 + 2           #1 | GEP, step signext to i64
156 // Pb_4 = (%IDX/2)*16 + 32     #0 | GEP, multiply index by sizeof(<4 x float>) = 16
157 // Pb_5 = (%IDX/2)*16 + 32     #0 | GEP, add offset of leading components
158 //
159 // Pb_5 - Pa_5 = 16            #0 | subtract to get the offset
160//
161// Remark: %PTR is not maintained within this class. So in this instance the
162// offset of 16 can only be assumed if the pointers are equal.
163//
164class Polynomial {
165 /// Operations on B
166 enum BOps {
167 LShr,
168 Mul,
169 SExt,
170 Trunc,
171 };
172
173 /// Number of Error Bits e
174 unsigned ErrorMSBs = (unsigned)-1;
175
176 /// Value
177 Value *V = nullptr;
178
179 /// Coefficient B
181
182 /// Coefficient A
183 APInt A;
184
185public:
186 Polynomial(Value *V) : V(V) {
187 IntegerType *Ty = dyn_cast<IntegerType>(V->getType());
188 if (Ty) {
189 ErrorMSBs = 0;
190 this->V = V;
191 A = APInt(Ty->getBitWidth(), 0);
192 }
193 }
194
195 Polynomial(const APInt &A, unsigned ErrorMSBs = 0)
196 : ErrorMSBs(ErrorMSBs), A(A) {}
197
198 Polynomial(unsigned BitWidth, uint64_t A, unsigned ErrorMSBs = 0)
199 : ErrorMSBs(ErrorMSBs), A(BitWidth, A) {}
200
201 Polynomial() = default;
202
203 /// Increment and clamp the number of undefined bits.
204 void incErrorMSBs(unsigned amt) {
205 if (ErrorMSBs == (unsigned)-1)
206 return;
207
208 ErrorMSBs += amt;
209 if (ErrorMSBs > A.getBitWidth())
210 ErrorMSBs = A.getBitWidth();
211 }
212
213 /// Decrement and clamp the number of undefined bits.
214 void decErrorMSBs(unsigned amt) {
215 if (ErrorMSBs == (unsigned)-1)
216 return;
217
218 if (ErrorMSBs > amt)
219 ErrorMSBs -= amt;
220 else
221 ErrorMSBs = 0;
222 }
223
224 /// Apply an add on the polynomial
225 Polynomial &add(const APInt &C) {
226 // Note: Addition is associative in two's complement even in case of
227 // signed overflow.
228 //
229 // Error bits can only propagate into higher significant bits. As these are
230 // already regarded as undefined, there is no change.
231 //
232 // Theorem: Adding a constant to a polynomial does not change the error
233 // term.
234 //
235 // Proof:
236 //
237 // Since the addition is associative and commutes:
238 //
239 // (B + A + E*2^(n-e)) + C = B + (A + C) + E*2^(n-e)
240 // [qed]
241
242 if (C.getBitWidth() != A.getBitWidth()) {
243 ErrorMSBs = (unsigned)-1;
244 return *this;
245 }
246
247 A += C;
248 return *this;
249 }
250
251 /// Apply a multiplication onto the polynomial.
252 Polynomial &mul(const APInt &C) {
253 // Note: Multiplication distributes over the addition
254 //
255 // Theorem: Multiplication distributes over the addition
256 //
257 // Proof(1):
258 //
259 // (B+A)*C =
260 // = (B + A) + (B + A) + .. {C Times}
261 // addition is associative and commutes, hence
262 // = B + B + .. {C Times} .. + A + A + .. {C times}
263 // = B*C + A*C
264 // (see (function add) for signed values and overflows)
265 // [qed]
266 //
267 // Theorem: If C has c trailing zeros, error bits in A or B are shifted out
268 // to the left.
269 //
270 // Proof(2):
271 //
272 // Let B' and A' be the n-Bit inputs with some unknown errors EA,
273 // EB at e leading bits. B' and A' can be written down as:
274 //
275 // B' = B + 2^(n-e)*EB
276 // A' = A + 2^(n-e)*EA
277 //
278 // Let C' be an input with c trailing zero bits. C' can be written as
279 //
280 // C' = C*2^c
281 //
282 // Therefore we can compute the result by using distributivity and
283 // commutativity.
284 //
285 // (B'*C' + A'*C') = [B + 2^(n-e)*EB] * C' + [A + 2^(n-e)*EA] * C' =
286 // = [B + 2^(n-e)*EB + A + 2^(n-e)*EA] * C' =
287 // = (B'+A') * C' =
288 // = [B + 2^(n-e)*EB + A + 2^(n-e)*EA] * C' =
289 // = [B + A + 2^(n-e)*EB + 2^(n-e)*EA] * C' =
290 // = (B + A) * C' + [2^(n-e)*EB + 2^(n-e)*EA] * C' =
291 // = (B + A) * C' + [2^(n-e)*EB + 2^(n-e)*EA] * C*2^c =
292 // = (B + A) * C' + C*(EB + EA)*2^(n-e)*2^c =
293 //
294 // Let EC be the final error with EC = C*(EB + EA)
295 //
296 // = (B + A)*C' + EC*2^(n-e)*2^c =
297 // = (B + A)*C' + EC*2^(n-(e-c))
298 //
299 // Since EC is multiplied by 2^(n-(e-c)) the resulting error contains c
300 // fewer error bits than the input. c bits are shifted out to the left.
301 // [qed]
302
303 if (C.getBitWidth() != A.getBitWidth()) {
304 ErrorMSBs = (unsigned)-1;
305 return *this;
306 }
307
308 // Multiplying by one is a no-op.
309 if (C.isOne()) {
310 return *this;
311 }
312
313 // Multiplying by zero removes the coefficient B and defines all bits.
314 if (C.isZero()) {
315 ErrorMSBs = 0;
316 deleteB();
317 }
318
319 // See Proof(2): Trailing zero bits indicate a left shift. This removes
320 // leading bits from the result even if they are undefined.
321 decErrorMSBs(C.countr_zero());
322
323 A *= C;
324 pushBOperation(Mul, C);
325 return *this;
326 }
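// A concrete instance of Proof(2), for illustration only: with n = 8, e = 3
// error MSBs (error term E*2^5) and C = 4 (c = 2 trailing zeros), the
// multiplication shifts the error term up to E*2^7, so two of the three
// unknown MSBs fall off the top and only e - c = 1 error bit remains. This is
// exactly what decErrorMSBs(C.countr_zero()) records above.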
327
328 /// Apply a logical shift right on the polynomial
329 Polynomial &lshr(const APInt &C) {
330 // Theorem(1): (B + A + E*2^(n-e)) >> 1 => (B >> 1) + (A >> 1) + E'*2^(n-e')
331 // where
332 // e' = e + 1,
333 // E is a e-bit number,
334 // E' is a e'-bit number,
335 // holds under the following precondition:
336 // pre(1): A % 2 = 0
337 // pre(2): e < n, (see Theorem(2) for the trivial case with e=n)
338 // where >> expresses a logical shift to the right, with adding zeros.
339 //
340 // We need to show that for every E there is an E'
341 //
342 // B = b_h * 2^(n-1) + b_m * 2 + b_l
343 // A = a_h * 2^(n-1) + a_m * 2 (pre(1))
344 //
345 // where a_h, b_h, b_l are single bits, and a_m, b_m are (n-2) bit numbers
346 //
347 // Let X = (B + A + E*2^(n-e)) >> 1
348 // Let Y = (B >> 1) + (A >> 1) + E*2^(n-e) >> 1
349 //
350 // X = [B + A + E*2^(n-e)] >> 1 =
351 // = [ b_h * 2^(n-1) + b_m * 2 + b_l +
352 // + a_h * 2^(n-1) + a_m * 2 +
353 // + E * 2^(n-e) ] >> 1 =
354 //
355 // The sum is built by putting the overflow of [a_m + b_m] into the term
356 // 2^(n-1). As there are no more bits beyond 2^(n-1) the overflow within
357 // this bit is discarded. This is expressed by % 2.
358 //
359 // The bit in position 0 cannot overflow into the term (b_m + a_m).
360 //
361 // = [ ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-1) +
362 // + ((b_m + a_m) % 2^(n-2)) * 2 +
363 // + b_l + E * 2^(n-e) ] >> 1 =
364 //
365 // The shift is computed by dividing the terms by 2 and by cutting off
366 // b_l.
367 //
368 // = ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-2) +
369 // + ((b_m + a_m) % 2^(n-2)) +
370 // + E * 2^(n-(e+1)) =
371 //
372 // by the definition in the Theorem e+1 = e'
373 //
374 // = ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-2) +
375 // + ((b_m + a_m) % 2^(n-2)) +
376 // + E * 2^(n-e') =
377 //
378 // Compute Y by applying distributivity first
379 //
380 // Y = (B >> 1) + (A >> 1) + E*2^(n-e') =
381 // = (b_h * 2^(n-1) + b_m * 2 + b_l) >> 1 +
382 // + (a_h * 2^(n-1) + a_m * 2) >> 1 +
383 // + E * 2^(n-e) >> 1 =
384 //
385 // Again, the shift is computed by dividing the terms by 2 and by cutting
386 // off b_l.
387 //
388 // = b_h * 2^(n-2) + b_m +
389 // + a_h * 2^(n-2) + a_m +
390 // + E * 2^(n-(e+1)) =
391 //
392 // Again, the sum is built by putting the overflow of [a_m + b_m] into
393 // the term 2^(n-1). But this time there is room for a second bit in the
394 // term 2^(n-2); we add this bit to a new term and denote it o_h in a
395 // second step.
396 //
397 // = ([b_h + a_h + (b_m + a_m) >> (n-2)] >> 1) * 2^(n-1) +
398 // + ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-2) +
399 // + ((b_m + a_m) % 2^(n-2)) +
400 // + E * 2^(n-(e+1)) =
401 //
402 // Let o_h = [b_h + a_h + (b_m + a_m) >> (n-2)] >> 1
403 // Further replace e+1 by e'.
404 //
405 // = o_h * 2^(n-1) +
406 // + ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-2) +
407 // + ((b_m + a_m) % 2^(n-2)) +
408 // + E * 2^(n-e') =
409 //
410 // Move o_h into the error term and construct E'. To ensure that there is
411 // no 2^x with negative x, this step requires pre(2) (e < n).
412 //
413 // = ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-2) +
414 // + ((b_m + a_m) % 2^(n-2)) +
415 // + o_h * 2^(e'-1) * 2^(n-e') + | pre(2), move 2^(e'-1)
416 // | out of the old exponent
417 // + E * 2^(n-e') =
418 // = ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-2) +
419 // + ((b_m + a_m) % 2^(n-2)) +
420 // + [o_h * 2^(e'-1) + E] * 2^(n-e') + | move 2^(e'-1) out of
421 // | the old exponent
422 //
423 // Let E' = o_h * 2^(e'-1) + E
424 //
425 // = ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-2) +
426 // + ((b_m + a_m) % 2^(n-2)) +
427 // + E' * 2^(n-e')
428 //
429 // Because X and Y are distinct only in their error terms and E' can be
430 // constructed as shown the theorem holds.
431 // [qed]
432 //
433 // For completeness, in the case e = n it is also required to show that
434 // distributivity can be applied.
435 //
436 // In this case Theorem(1) transforms to (the pre-condition on A can also be
437 // dropped)
438 //
439 // Theorem(2): (B + A + E) >> 1 => (B >> 1) + (A >> 1) + E'
440 // where
441 // A, B, E, E' are two's complement numbers with the same bit
442 // width
443 //
444 // Let A + B + E = X
445 // Let (B >> 1) + (A >> 1) = Y
446 //
447 // Therefore we need to show that for every X and Y there is an E' which
448 // makes the equation
449 //
450 // X = Y + E'
451 //
452 // hold. This is trivially the case for E' = X - Y.
453 //
454 // [qed]
455 //
456 // Remark: Distributing lshr with an arbitrary number n can be expressed as
457 // ((((B + A) lshr 1) lshr 1) ... ) {n times}.
458 // This construction induces n additional error bits at the left.
459
460 if (C.getBitWidth() != A.getBitWidth()) {
461 ErrorMSBs = (unsigned)-1;
462 return *this;
463 }
464
465 if (C.isZero())
466 return *this;
467
468 // Test if the result will be zero
469 unsigned shiftAmt = C.getZExtValue();
470 if (shiftAmt >= C.getBitWidth())
471 return mul(APInt(C.getBitWidth(), 0));
472
473 // The proof that shiftAmt LSBs are zero for at least one summand is only
474 // possible for the constant number.
475 //
476 // If this can be proven add shiftAmt to the error counter
477 // `ErrorMSBs`. Otherwise set all bits as undefined.
478 if (A.countr_zero() < shiftAmt)
479 ErrorMSBs = A.getBitWidth();
480 else
481 incErrorMSBs(shiftAmt);
482
483 // Apply the operation.
484 pushBOperation(LShr, C);
485 A = A.lshr(shiftAmt);
486
487 return *this;
488 }
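// Two small worked cases for the bookkeeping above (illustrative values only):
//  * A = 6 and shiftAmt = 1: A has one trailing zero, so the constant part
//    shifts exactly; A becomes 3 and ErrorMSBs grows by one, accounting for
//    the unknown low bit of the B part (Theorem(1), e' = e + 1).
//  * A = 5 and shiftAmt = 1: bit 0 of A is set, pre(1) of Theorem(1) cannot
//    be established, and the polynomial is marked fully undefined
//    (ErrorMSBs = A.getBitWidth()).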
489
490 /// Apply a sign-extend or truncate operation on the polynomial.
491 Polynomial &sextOrTrunc(unsigned n) {
492 if (n < A.getBitWidth()) {
493 // Truncate: Clearly undefined Bits on the MSB side are removed
494 // if there are any.
495 decErrorMSBs(A.getBitWidth() - n);
496 A = A.trunc(n);
497 pushBOperation(Trunc, APInt(sizeof(n) * 8, n));
498 }
499 if (n > A.getBitWidth()) {
500 // Extend: Clearly extending first and adding later is different
501 // to adding first and extending later in all extended bits.
502 incErrorMSBs(n - A.getBitWidth());
503 A = A.sext(n);
504 pushBOperation(SExt, APInt(sizeof(n) * 8, n));
505 }
506
507 return *this;
508 }
509
510 /// Test if there is a coefficient B.
511 bool isFirstOrder() const { return V != nullptr; }
512
513 /// Test whether the coefficients B of two Polynomials are equal.
514 bool isCompatibleTo(const Polynomial &o) const {
515 // The polynomials use different bit widths.
516 if (A.getBitWidth() != o.A.getBitWidth())
517 return false;
518
519 // If neither Polynomial has the coefficient B, they are trivially compatible.
520 if (!isFirstOrder() && !o.isFirstOrder())
521 return true;
522
523 // The index variable is different.
524 if (V != o.V)
525 return false;
526
527 // Check the operations.
528 if (B.size() != o.B.size())
529 return false;
530
531 auto *ob = o.B.begin();
532 for (const auto &b : B) {
533 if (b != *ob)
534 return false;
535 ob++;
536 }
537
538 return true;
539 }
540
541 /// Subtract two polynomials, return an undefined polynomial if
542 /// subtraction is not possible.
543 Polynomial operator-(const Polynomial &o) const {
544 // Return an undefined polynomial if incompatible.
545 if (!isCompatibleTo(o))
546 return Polynomial();
547
548 // If the polynomials are compatible (meaning they have the same
549 // coefficient on B), B is eliminated. Thus a polynomial solely
550 // containing A is returned
551 return Polynomial(A - o.A, std::max(ErrorMSBs, o.ErrorMSBs));
552 }
553
554 /// Subtract a constant from a polynomial.
555 Polynomial operator-(uint64_t C) const {
556 Polynomial Result(*this);
557 Result.A -= C;
558 return Result;
559 }
560
561 /// Add a constant to a polynomial.
562 Polynomial operator+(uint64_t C) const {
563 Polynomial Result(*this);
564 Result.A += C;
565 return Result;
566 }
567
568 /// Returns true if it can be proven that two Polynomials are equal.
569 bool isProvenEqualTo(const Polynomial &o) {
570 // Subtract both polynomials and test if it is fully defined and zero.
571 Polynomial r = *this - o;
572 return (r.ErrorMSBs == 0) && (!r.isFirstOrder()) && (r.A.isZero());
573 }
574
575 /// Print the polynomial into a stream.
576 void print(raw_ostream &OS) const {
577 OS << "[{#ErrBits:" << ErrorMSBs << "} ";
578
579 if (V) {
580 for (auto b : B)
581 OS << "(";
582 OS << "(" << *V << ") ";
583
584 for (auto b : B) {
585 switch (b.first) {
586 case LShr:
587 OS << "LShr ";
588 break;
589 case Mul:
590 OS << "Mul ";
591 break;
592 case SExt:
593 OS << "SExt ";
594 break;
595 case Trunc:
596 OS << "Trunc ";
597 break;
598 }
599
600 OS << b.second << ") ";
601 }
602 }
603
604 OS << "+ " << A << "]";
605 }
606
607private:
608 void deleteB() {
609 V = nullptr;
610 B.clear();
611 }
612
613 void pushBOperation(const BOps Op, const APInt &C) {
614 if (isFirstOrder()) {
615 B.push_back(std::make_pair(Op, C));
616 return;
617 }
618 }
619};
620
621#ifndef NDEBUG
622static raw_ostream &operator<<(raw_ostream &OS, const Polynomial &S) {
623 S.print(OS);
624 return OS;
625}
626#endif
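// A minimal usage sketch for the class above (not part of the pass; `IdxVal`
// is a placeholder for the i64 value %IDX from the file-header example):
//
//   Polynomial Pa(IdxVal), Pb(IdxVal);
//   Pa.add(APInt(64, 2)).lshr(APInt(64, 1)).mul(APInt(64, 16)); // ((%IDX+2)>>1)*16
//   Pb.add(APInt(64, 4)).lshr(APInt(64, 1)).mul(APInt(64, 16)); // ((%IDX+4)>>1)*16
//   bool Consecutive = Pb.isProvenEqualTo(Pa + 16);             // true
//
// Both polynomials carry the same B operations (LShr 1, Mul 16) on the same
// value, so operator- cancels B; the error bit introduced by the lshr is
// shifted out again by the multiplication with 16, leaving a fully defined
// constant difference of 16 bytes.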
627
628 /// VectorInfo stores the following abstract information for each vector
629/// element:
630///
631/// 1) The memory address loaded into the element as Polynomial
632 /// 2) a set of load instructions necessary to construct the vector,
633/// 3) a set of all other instructions that are necessary to create the vector and
634/// 4) a pointer value that can be used as relative base for all elements.
635struct VectorInfo {
636private:
637 VectorInfo(const VectorInfo &c) : VTy(c.VTy) {
639 "Copying VectorInfo is neither implemented nor necessary,");
640 }
641
642public:
643 /// Information of a Vector Element
644 struct ElementInfo {
645 /// Offset Polynomial.
646 Polynomial Ofs;
647
648 /// The Load Instruction used to Load the entry. LI is null if the pointer
649 /// of the load instruction does not point to the entry.
650 LoadInst *LI;
651
652 ElementInfo(Polynomial Offset = Polynomial(), LoadInst *LI = nullptr)
653 : Ofs(Offset), LI(LI) {}
654 };
655
656 /// Basic-block the load instructions are within
657 BasicBlock *BB = nullptr;
658
659 /// Pointer value of all participating load instructions
660 Value *PV = nullptr;
661
662 /// Participating load instructions
663 std::set<LoadInst *> LIs;
664
665 /// Participating instructions
666 std::set<Instruction *> Is;
667
668 /// Final shuffle-vector instruction
669 ShuffleVectorInst *SVI = nullptr;
670
671 /// Information of the offset for each vector element
672 ElementInfo *EI;
673
674 /// Vector Type
675 FixedVectorType *const VTy;
676
677 VectorInfo(FixedVectorType *VTy) : VTy(VTy) {
678 EI = new ElementInfo[VTy->getNumElements()];
679 }
680
681 VectorInfo &operator=(const VectorInfo &other) = delete;
682
683 virtual ~VectorInfo() { delete[] EI; }
684
685 unsigned getDimension() const { return VTy->getNumElements(); }
686
687 /// Test if the VectorInfo can be part of an interleaved load with the
688 /// specified factor.
689 ///
690 /// \param Factor of the interleave
691 /// \param DL Targets Datalayout
692 ///
693 /// \returns true if this is possible and false if not
694 bool isInterleaved(unsigned Factor, const DataLayout &DL) const {
695 unsigned Size = DL.getTypeAllocSize(VTy->getElementType());
696 for (unsigned i = 1; i < getDimension(); i++) {
697 if (!EI[i].Ofs.isProvenEqualTo(EI[0].Ofs + i * Factor * Size)) {
698 return false;
699 }
700 }
701 return true;
702 }
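// For example (illustrative numbers): a <4 x float> candidate has Size = 4,
// so with Factor = 2 the element offsets must be provably Base, Base + 8,
// Base + 16 and Base + 24, i.e. the vector reads every second float of a
// two-way interleaved layout.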
703
704 /// Recursively computes the vector information stored in V.
705 ///
706 /// This function delegates the work to specialized implementations
707 ///
708 /// \param V Value to operate on
709 /// \param Result Result of the computation
710 ///
711 /// \returns false if no sensible information can be gathered.
712 static bool compute(Value *V, VectorInfo &Result, const DataLayout &DL) {
713 ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(V);
714 if (SVI)
715 return computeFromSVI(SVI, Result, DL);
716 LoadInst *LI = dyn_cast<LoadInst>(V);
717 if (LI)
718 return computeFromLI(LI, Result, DL);
719 BitCastInst *BCI = dyn_cast<BitCastInst>(V);
720 if (BCI)
721 return computeFromBCI(BCI, Result, DL);
722 return false;
723 }
724
725 /// BitCastInst specialization to compute the vector information.
726 ///
727 /// \param BCI BitCastInst to operate on
728 /// \param Result Result of the computation
729 ///
730 /// \returns false if no sensible information can be gathered.
731 static bool computeFromBCI(BitCastInst *BCI, VectorInfo &Result,
732 const DataLayout &DL) {
733 Instruction *Op = dyn_cast<Instruction>(BCI->getOperand(0));
734
735 if (!Op)
736 return false;
737
738 FixedVectorType *VTy = dyn_cast<FixedVectorType>(Op->getType());
739 if (!VTy)
740 return false;
741
742 // We can only cast from large to smaller vectors
743 if (Result.VTy->getNumElements() % VTy->getNumElements())
744 return false;
745
746 unsigned Factor = Result.VTy->getNumElements() / VTy->getNumElements();
747 unsigned NewSize = DL.getTypeAllocSize(Result.VTy->getElementType());
748 unsigned OldSize = DL.getTypeAllocSize(VTy->getElementType());
749
750 if (NewSize * Factor != OldSize)
751 return false;
752
753 VectorInfo Old(VTy);
754 if (!compute(Op, Old, DL))
755 return false;
756
757 for (unsigned i = 0; i < Result.VTy->getNumElements(); i += Factor) {
758 for (unsigned j = 0; j < Factor; j++) {
759 Result.EI[i + j] =
760 ElementInfo(Old.EI[i / Factor].Ofs + j * NewSize,
761 j == 0 ? Old.EI[i / Factor].LI : nullptr);
762 }
763 }
764
765 Result.BB = Old.BB;
766 Result.PV = Old.PV;
767 Result.LIs.insert(Old.LIs.begin(), Old.LIs.end());
768 Result.Is.insert(Old.Is.begin(), Old.Is.end());
769 Result.Is.insert(BCI);
770 Result.SVI = nullptr;
771
772 return true;
773 }
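// Example (hedged, not from a test): a bitcast of a loaded <2 x i64> to
// <4 x i32> gives Factor = 4 / 2 = 2, OldSize = 8 and NewSize = 4, so the
// size check above passes. Every old element offset Ofs yields the two
// entries Ofs and Ofs + 4, and only the first entry of each pair keeps the
// originating LoadInst.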
774
775 /// ShuffleVectorInst specialization to compute vector information.
776 ///
777 /// \param SVI ShuffleVectorInst to operate on
778 /// \param Result Result of the computation
779 ///
780 /// Compute the left and the right side vector information and merge them by
781 /// applying the shuffle operation. This function also ensures that the left
782 /// and right side have compatible loads. This means that all loads are
783 /// within the same basic block and are based on the same pointer.
784 ///
785 /// \returns false if no sensible information can be gathered.
786 static bool computeFromSVI(ShuffleVectorInst *SVI, VectorInfo &Result,
787 const DataLayout &DL) {
788 FixedVectorType *ArgTy =
789 cast<FixedVectorType>(SVI->getOperand(0)->getType());
790
791 // Compute the left hand vector information.
792 VectorInfo LHS(ArgTy);
793 if (!compute(SVI->getOperand(0), LHS, DL))
794 LHS.BB = nullptr;
795
796 // Compute the right hand vector information.
797 VectorInfo RHS(ArgTy);
798 if (!compute(SVI->getOperand(1), RHS, DL))
799 RHS.BB = nullptr;
800
801 // Neither operand produced sensible results?
802 if (!LHS.BB && !RHS.BB)
803 return false;
804 // Only RHS produced sensible results?
805 else if (!LHS.BB) {
806 Result.BB = RHS.BB;
807 Result.PV = RHS.PV;
808 }
809 // Only LHS produced sensible results?
810 else if (!RHS.BB) {
811 Result.BB = LHS.BB;
812 Result.PV = LHS.PV;
813 }
814 // Both operands produced sensible results?
815 else if ((LHS.BB == RHS.BB) && (LHS.PV == RHS.PV)) {
816 Result.BB = LHS.BB;
817 Result.PV = LHS.PV;
818 }
819 // Both operands produced sensible results but they are incompatible.
820 else {
821 return false;
822 }
823
824 // Merge and apply the operation on the offset information.
825 if (LHS.BB) {
826 Result.LIs.insert(LHS.LIs.begin(), LHS.LIs.end());
827 Result.Is.insert(LHS.Is.begin(), LHS.Is.end());
828 }
829 if (RHS.BB) {
830 Result.LIs.insert(RHS.LIs.begin(), RHS.LIs.end());
831 Result.Is.insert(RHS.Is.begin(), RHS.Is.end());
832 }
833 Result.Is.insert(SVI);
834 Result.SVI = SVI;
835
836 int j = 0;
837 for (int i : SVI->getShuffleMask()) {
838 assert((i < 2 * (signed)ArgTy->getNumElements()) &&
839 "Invalid ShuffleVectorInst (index out of bounds)");
840
841 if (i < 0)
842 Result.EI[j] = ElementInfo();
843 else if (i < (signed)ArgTy->getNumElements()) {
844 if (LHS.BB)
845 Result.EI[j] = LHS.EI[i];
846 else
847 Result.EI[j] = ElementInfo();
848 } else {
849 if (RHS.BB)
850 Result.EI[j] = RHS.EI[i - ArgTy->getNumElements()];
851 else
852 Result.EI[j] = ElementInfo();
853 }
854 j++;
855 }
856
857 return true;
858 }
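// Mask handling by example (illustrative values): with 4-element operands and
// a shuffle mask of <0, 5, 2, 7>, result elements 0 and 2 copy LHS.EI[0] and
// LHS.EI[2], elements 1 and 3 copy RHS.EI[1] and RHS.EI[3], and a negative
// (undef) mask entry would receive a default ElementInfo with no load
// attached.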
859
860 /// LoadInst specialization to compute vector information.
861 ///
862 /// This function also acts as abort condition to the recursion.
863 ///
864 /// \param LI LoadInst to operate on
865 /// \param Result Result of the computation
866 ///
867 /// \returns false if no sensible information can be gathered.
868 static bool computeFromLI(LoadInst *LI, VectorInfo &Result,
869 const DataLayout &DL) {
870 Value *BasePtr;
871 Polynomial Offset;
872
873 if (LI->isVolatile())
874 return false;
875
876 if (LI->isAtomic())
877 return false;
878
879 if (!DL.typeSizeEqualsStoreSize(Result.VTy->getElementType()))
880 return false;
881
882 // Get the base polynomial
883 computePolynomialFromPointer(*LI->getPointerOperand(), Offset, BasePtr, DL);
884
885 Result.BB = LI->getParent();
886 Result.PV = BasePtr;
887 Result.LIs.insert(LI);
888 Result.Is.insert(LI);
889
890 for (unsigned i = 0; i < Result.getDimension(); i++) {
891 Value *Idx[2] = {
892 ConstantInt::get(Type::getInt32Ty(LI->getContext()), 0),
893 ConstantInt::get(Type::getInt32Ty(LI->getContext()), i),
894 };
895 int64_t Ofs = DL.getIndexedOffsetInType(Result.VTy, Idx);
896 Result.EI[i] = ElementInfo(Offset + Ofs, i == 0 ? LI : nullptr);
897 }
898
899 return true;
900 }
901
902 /// Recursively compute polynomial of a value.
903 ///
904 /// \param BO Input binary operation
905 /// \param Result Result polynomial
906 static void computePolynomialBinOp(BinaryOperator &BO, Polynomial &Result) {
907 Value *LHS = BO.getOperand(0);
908 Value *RHS = BO.getOperand(1);
909
910 // Find the RHS Constant if any
911 ConstantInt *C = dyn_cast<ConstantInt>(RHS);
912 if ((!C) && BO.isCommutative()) {
913 C = dyn_cast<ConstantInt>(LHS);
914 if (C)
915 std::swap(LHS, RHS);
916 }
917
918 switch (BO.getOpcode()) {
919 case Instruction::Add:
920 if (!C)
921 break;
922
923 computePolynomial(*LHS, Result);
924 Result.add(C->getValue());
925 return;
926
927 case Instruction::LShr:
928 if (!C)
929 break;
930
931 computePolynomial(*LHS, Result);
932 Result.lshr(C->getValue());
933 return;
934
935 default:
936 break;
937 }
938
939 Result = Polynomial(&BO);
940 }
941
942 /// Recursively compute polynomial of a value
943 ///
944 /// \param V input value
945 /// \param Result result polynomial
946 static void computePolynomial(Value &V, Polynomial &Result) {
947 if (auto *BO = dyn_cast<BinaryOperator>(&V))
948 computePolynomialBinOp(*BO, Result);
949 else
950 Result = Polynomial(&V);
951 }
952
953 /// Compute the Polynomial representation of a Pointer type.
954 ///
955 /// \param Ptr input pointer value
956 /// \param Result result polynomial
957 /// \param BasePtr pointer the polynomial is based on
958 /// \param DL Datalayout of the target machine
959 static void computePolynomialFromPointer(Value &Ptr, Polynomial &Result,
960 Value *&BasePtr,
961 const DataLayout &DL) {
962 // Not a pointer type? Return an undefined polynomial
963 PointerType *PtrTy = dyn_cast<PointerType>(Ptr.getType());
964 if (!PtrTy) {
965 Result = Polynomial();
966 BasePtr = nullptr;
967 return;
968 }
969 unsigned PointerBits =
970 DL.getIndexSizeInBits(PtrTy->getPointerAddressSpace());
971
972 /// Skip pointer casts. Return Zero polynomial otherwise
973 if (isa<CastInst>(&Ptr)) {
974 CastInst &CI = *cast<CastInst>(&Ptr);
975 switch (CI.getOpcode()) {
976 case Instruction::BitCast:
977 computePolynomialFromPointer(*CI.getOperand(0), Result, BasePtr, DL);
978 break;
979 default:
980 BasePtr = &Ptr;
981 Result = Polynomial(PointerBits, 0);
982 break;
983 }
984 }
985 /// Resolve GetElementPtrInst.
986 else if (isa<GetElementPtrInst>(&Ptr)) {
987 GetElementPtrInst &GEP = *cast<GetElementPtrInst>(&Ptr);
988
989 APInt BaseOffset(PointerBits, 0);
990
991 // Check if we can compute the Offset with accumulateConstantOffset
992 if (GEP.accumulateConstantOffset(DL, BaseOffset)) {
993 Result = Polynomial(BaseOffset);
994 BasePtr = GEP.getPointerOperand();
995 return;
996 } else {
997 // Otherwise we allow that the last index operand of the GEP is
998 // non-constant.
999 unsigned idxOperand, e;
1000 SmallVector<Value *, 4> Indices;
1001 for (idxOperand = 1, e = GEP.getNumOperands(); idxOperand < e;
1002 idxOperand++) {
1003 ConstantInt *IDX = dyn_cast<ConstantInt>(GEP.getOperand(idxOperand));
1004 if (!IDX)
1005 break;
1006 Indices.push_back(IDX);
1007 }
1008
1009 // It must also be the last operand.
1010 if (idxOperand + 1 != e) {
1011 Result = Polynomial();
1012 BasePtr = nullptr;
1013 return;
1014 }
1015
1016 // Compute the polynomial of the index operand.
1017 computePolynomial(*GEP.getOperand(idxOperand), Result);
1018
1019 // Compute base offset from zero based index, excluding the last
1020 // variable operand.
1021 BaseOffset =
1022 DL.getIndexedOffsetInType(GEP.getSourceElementType(), Indices);
1023
1024 // Apply the operations of GEP to the polynomial.
1025 unsigned ResultSize = DL.getTypeAllocSize(GEP.getResultElementType());
1026 Result.sextOrTrunc(PointerBits);
1027 Result.mul(APInt(PointerBits, ResultSize));
1028 Result.add(BaseOffset);
1029 BasePtr = GEP.getPointerOperand();
1030 }
1031 }
1032 // All other instructions are handled by using the value as base pointer and
1033 // a zero polynomial.
1034 else {
1035 BasePtr = &Ptr;
1036 Result = Polynomial(DL.getIndexSizeInBits(PtrTy->getPointerAddressSpace()), 0);
1037 }
1038 }
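// For the GEP from the file-header example,
//   %Pa3 = getelementptr inbounds <4 x float>, <4 x float>* %PTR, i64 %Pa2
// this resolves roughly as follows (sketch): the only index is the trailing
// non-constant one, so Indices stays empty and BaseOffset is 0; the index
// polynomial of %Pa2 is sign-extended or truncated to the pointer index width,
// multiplied by the alloc size of <4 x float> (16 bytes) and added to
// BaseOffset, while %PTR becomes BasePtr.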
1039
1040#ifndef NDEBUG
1041 void print(raw_ostream &OS) const {
1042 if (PV)
1043 OS << *PV;
1044 else
1045 OS << "(none)";
1046 OS << " + ";
1047 for (unsigned i = 0; i < getDimension(); i++)
1048 OS << ((i == 0) ? "[" : ", ") << EI[i].Ofs;
1049 OS << "]";
1050 }
1051#endif
1052};
1053
1054} // anonymous namespace
1055
1056bool InterleavedLoadCombineImpl::findPattern(
1057 std::list<VectorInfo> &Candidates, std::list<VectorInfo> &InterleavedLoad,
1058 unsigned Factor, const DataLayout &DL) {
1059 for (auto C0 = Candidates.begin(), E0 = Candidates.end(); C0 != E0; ++C0) {
1060 unsigned i;
1061 // Try to find an interleaved load using the current candidate C0 as the first line
1062 unsigned Size = DL.getTypeAllocSize(C0->VTy->getElementType());
1063
1064 // List containing iterators pointing to the VectorInfos of the candidates
1065 std::vector<std::list<VectorInfo>::iterator> Res(Factor, Candidates.end());
1066
1067 for (auto C = Candidates.begin(), E = Candidates.end(); C != E; C++) {
1068 if (C->VTy != C0->VTy)
1069 continue;
1070 if (C->BB != C0->BB)
1071 continue;
1072 if (C->PV != C0->PV)
1073 continue;
1074
1075 // Check whether the current candidate matches any of the Factor - 1 remaining lines
1076 for (i = 1; i < Factor; i++) {
1077 if (C->EI[0].Ofs.isProvenEqualTo(C0->EI[0].Ofs + i * Size)) {
1078 Res[i] = C;
1079 }
1080 }
1081
1082 for (i = 1; i < Factor; i++) {
1083 if (Res[i] == Candidates.end())
1084 break;
1085 }
1086 if (i == Factor) {
1087 Res[0] = C0;
1088 break;
1089 }
1090 }
1091
1092 if (Res[0] != Candidates.end()) {
1093 // Move the result into the output
1094 for (unsigned i = 0; i < Factor; i++) {
1095 InterleavedLoad.splice(InterleavedLoad.end(), Candidates, Res[i]);
1096 }
1097
1098 return true;
1099 }
1100 }
1101 return false;
1102}
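// For Factor = 2 this amounts to (sketch): starting from candidate C0, find
// another candidate with the same vector type, basic block and base pointer
// whose first element offset is provably C0's first offset plus the element
// size. If one is found, C0 becomes line 0, the match becomes line 1, and both
// VectorInfos are spliced into InterleavedLoad.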
1103
1104LoadInst *
1105InterleavedLoadCombineImpl::findFirstLoad(const std::set<LoadInst *> &LIs) {
1106 assert(!LIs.empty() && "No load instructions given.");
1107
1108 // All LIs are within the same BB. Select the first for a reference.
1109 BasicBlock *BB = (*LIs.begin())->getParent();
1110 BasicBlock::iterator FLI = llvm::find_if(
1111 *BB, [&LIs](Instruction &I) -> bool { return is_contained(LIs, &I); });
1112 assert(FLI != BB->end());
1113
1114 return cast<LoadInst>(FLI);
1115}
1116
1117bool InterleavedLoadCombineImpl::combine(std::list<VectorInfo> &InterleavedLoad,
1118 OptimizationRemarkEmitter &ORE) {
1119 LLVM_DEBUG(dbgs() << "Checking interleaved load\n");
1120
1121 // The insertion point is the LoadInst which loads the first values. The
1122 // following tests are used to prove that the combined load can be inserted
1123 // just before InsertionPoint.
1124 LoadInst *InsertionPoint = InterleavedLoad.front().EI[0].LI;
1125
1126 // Test if the offset is computed
1127 if (!InsertionPoint)
1128 return false;
1129
1130 std::set<LoadInst *> LIs;
1131 std::set<Instruction *> Is;
1132 std::set<Instruction *> SVIs;
1133
1134 InstructionCost InterleavedCost;
1135 InstructionCost InstructionCost = 0;
1136 const TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency;
1137
1138 // Get the interleave factor
1139 unsigned Factor = InterleavedLoad.size();
1140
1141 // Merge all input sets used in analysis
1142 for (auto &VI : InterleavedLoad) {
1143 // Generate a set of all load instructions to be combined
1144 LIs.insert(VI.LIs.begin(), VI.LIs.end());
1145
1146 // Generate a set of all instructions taking part in the
1147 // interleaved load. This list excludes the instructions necessary for the
1148 // polynomial construction.
1149 Is.insert(VI.Is.begin(), VI.Is.end());
1150
1151 // Generate the set of the final ShuffleVectorInst.
1152 SVIs.insert(VI.SVI);
1153 }
1154
1155 // There is nothing to combine.
1156 if (LIs.size() < 2)
1157 return false;
1158
1159 // Test if all participating instructions will be dead after the
1160 // transformation. If intermediate results are used, no performance gain can
1161 // be expected. Also sum the cost of the instructions being left dead.
1162 for (const auto &I : Is) {
1163 // Compute the old cost
1165 InstructionCost += TTI.getInstructionCost(I, CostKind);
1166 // The final SVIs are allowed not to be dead, all uses will be replaced
1167 if (SVIs.find(I) != SVIs.end())
1168 continue;
1169
1170 // If there are users outside the set to be eliminated, we abort the
1171 // transformation. No gain can be expected.
1172 for (auto *U : I->users()) {
1173 if (Is.find(dyn_cast<Instruction>(U)) == Is.end())
1174 return false;
1175 }
1176 }
1177
1178 // We need to have a valid cost in order to proceed.
1179 if (!InstructionCost.isValid())
1180 return false;
1181
1182 // We know that all LoadInst are within the same BB. This guarantees that
1183 // either everything or nothing is loaded.
1184 LoadInst *First = findFirstLoad(LIs);
1185
1186 // To be sure that the loads can be combined, iterate over all loads and test
1187 // that the corresponding defining access dominates the first LI. This guarantees
1188 // that there are no aliasing stores in between the loads.
1189 auto FMA = MSSA.getMemoryAccess(First);
1190 for (auto *LI : LIs) {
1191 auto MADef = MSSA.getMemoryAccess(LI)->getDefiningAccess();
1192 if (!MSSA.dominates(MADef, FMA))
1193 return false;
1194 }
1195 assert(!LIs.empty() && "There are no LoadInst to combine");
1196
1197 // It is necessary that the insertion point dominates all final ShuffleVectorInst.
1198 for (auto &VI : InterleavedLoad) {
1199 if (!DT.dominates(InsertionPoint, VI.SVI))
1200 return false;
1201 }
1202
1203 // All checks are done. Add instructions detectable by InterleavedAccessPass.
1204 // The old instructions are left dead.
1205 IRBuilder<> Builder(InsertionPoint);
1206 Type *ETy = InterleavedLoad.front().SVI->getType()->getElementType();
1207 unsigned ElementsPerSVI =
1208 cast<FixedVectorType>(InterleavedLoad.front().SVI->getType())
1209 ->getNumElements();
1210 FixedVectorType *ILTy = FixedVectorType::get(ETy, Factor * ElementsPerSVI);
1211
1212 auto Indices = llvm::to_vector<4>(llvm::seq<unsigned>(0, Factor));
1213 InterleavedCost = TTI.getInterleavedMemoryOpCost(
1214 Instruction::Load, ILTy, Factor, Indices, InsertionPoint->getAlign(),
1215 InsertionPoint->getPointerAddressSpace(), CostKind);
1216
1217 if (InterleavedCost >= InstructionCost) {
1218 return false;
1219 }
1220
1221 // Create the wide load and update the MemorySSA.
1222 auto Ptr = InsertionPoint->getPointerOperand();
1223 auto LI = Builder.CreateAlignedLoad(ILTy, Ptr, InsertionPoint->getAlign(),
1224 "interleaved.wide.load");
1225 auto MSSAU = MemorySSAUpdater(&MSSA);
1226 MemoryUse *MSSALoad = cast<MemoryUse>(MSSAU.createMemoryAccessBefore(
1227 LI, nullptr, MSSA.getMemoryAccess(InsertionPoint)));
1228 MSSAU.insertUse(MSSALoad, /*RenameUses=*/ true);
1229
1230 // Create the final SVIs and replace all uses.
1231 int i = 0;
1232 for (auto &VI : InterleavedLoad) {
1233 SmallVector<int, 4> Mask;
1234 for (unsigned j = 0; j < ElementsPerSVI; j++)
1235 Mask.push_back(i + j * Factor);
1236
1237 Builder.SetInsertPoint(VI.SVI);
1238 auto SVI = Builder.CreateShuffleVector(LI, Mask, "interleaved.shuffle");
1239 VI.SVI->replaceAllUsesWith(SVI);
1240 i++;
1241 }
1242
1243 NumInterleavedLoadCombine++;
1244 ORE.emit([&]() {
1245 return OptimizationRemark(DEBUG_TYPE, "Combined Interleaved Load", LI)
1246 << "Load interleaved combined with factor "
1247 << ore::NV("Factor", Factor);
1248 });
1249
1250 return true;
1251}
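// For the factor-2 <4 x float> pattern sketched in the file header, the code
// above would emit IR roughly like the following (a hedged sketch; the value
// names come from the string literals above, while the alignment and the "1"
// suffix of the second shuffle are illustrative):
//
//   %interleaved.wide.load = load <8 x float>, ptr %p0, align 16
//   %interleaved.shuffle   = shufflevector <8 x float> %interleaved.wide.load,
//                            <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
//   %interleaved.shuffle1  = shufflevector <8 x float> %interleaved.wide.load,
//                            <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
//
// which InterleavedAccessPass can subsequently lower to a target interleaved
// load (e.g. ld2 on AArch64). The masks follow from Mask[j] = i + j * Factor.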
1252
1253bool InterleavedLoadCombineImpl::run() {
1254 OptimizationRemarkEmitter ORE(&F);
1255 bool changed = false;
1256 unsigned MaxFactor = TLI.getMaxSupportedInterleaveFactor();
1257
1258 auto &DL = F.getDataLayout();
1259
1260 // Start with the highest factor to avoid combining and recombining.
1261 for (unsigned Factor = MaxFactor; Factor >= 2; Factor--) {
1262 std::list<VectorInfo> Candidates;
1263
1264 for (BasicBlock &BB : F) {
1265 for (Instruction &I : BB) {
1266 if (auto SVI = dyn_cast<ShuffleVectorInst>(&I)) {
1267 // We don't support scalable vectors in this pass.
1268 if (isa<ScalableVectorType>(SVI->getType()))
1269 continue;
1270
1271 Candidates.emplace_back(cast<FixedVectorType>(SVI->getType()));
1272
1273 if (!VectorInfo::computeFromSVI(SVI, Candidates.back(), DL)) {
1274 Candidates.pop_back();
1275 continue;
1276 }
1277
1278 if (!Candidates.back().isInterleaved(Factor, DL)) {
1279 Candidates.pop_back();
1280 }
1281 }
1282 }
1283 }
1284
1285 std::list<VectorInfo> InterleavedLoad;
1286 while (findPattern(Candidates, InterleavedLoad, Factor, DL)) {
1287 if (combine(InterleavedLoad, ORE)) {
1288 changed = true;
1289 } else {
1290 // Remove the first element of the Interleaved Load but put the others
1291 // back on the list and continue searching
1292 Candidates.splice(Candidates.begin(), InterleavedLoad,
1293 std::next(InterleavedLoad.begin()),
1294 InterleavedLoad.end());
1295 }
1296 InterleavedLoad.clear();
1297 }
1298 }
1299
1300 return changed;
1301}
1302
1303namespace {
1304/// This pass combines interleaved loads into a pattern detectable by
1305/// InterleavedAccessPass.
1306struct InterleavedLoadCombine : public FunctionPass {
1307 static char ID;
1308
1309 InterleavedLoadCombine() : FunctionPass(ID) {
1310 initializeInterleavedLoadCombinePass(*PassRegistry::getPassRegistry());
1311 }
1312
1313 StringRef getPassName() const override {
1314 return "Interleaved Load Combine Pass";
1315 }
1316
1317 bool runOnFunction(Function &F) override {
1318 if (DisableInterleavedLoadCombine)
1319 return false;
1320
1321 auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
1322 if (!TPC)
1323 return false;
1324
1325 LLVM_DEBUG(dbgs() << "*** " << getPassName() << ": " << F.getName()
1326 << "\n");
1327
1328 return InterleavedLoadCombineImpl(
1329 F, getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
1330 getAnalysis<MemorySSAWrapperPass>().getMSSA(),
1331 getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F),
1332 TPC->getTM<TargetMachine>())
1333 .run();
1334 }
1335
1336 void getAnalysisUsage(AnalysisUsage &AU) const override {
1337 AU.addRequired<MemorySSAWrapperPass>();
1338 AU.addRequired<DominatorTreeWrapperPass>();
1339 AU.addRequired<TargetTransformInfoWrapperPass>();
1340 FunctionPass::getAnalysisUsage(AU);
1341 }
1342
1343private:
1344};
1345} // anonymous namespace
1346
1347PreservedAnalyses
1348 InterleavedLoadCombinePass::run(Function &F, FunctionAnalysisManager &FAM) {
1349
1350 auto &DT = FAM.getResult<DominatorTreeAnalysis>(F);
1351 auto &MemSSA = FAM.getResult<MemorySSAAnalysis>(F).getMSSA();
1352 auto &TTI = FAM.getResult<TargetIRAnalysis>(F);
1353 bool Changed = InterleavedLoadCombineImpl(F, DT, MemSSA, TTI, *TM).run();
1354 return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
1355 }
1356
1357char InterleavedLoadCombine::ID = 0;
1358
1359 INITIALIZE_PASS_BEGIN(
1360 InterleavedLoadCombine, DEBUG_TYPE,
1361 "Combine interleaved loads into wide loads and shufflevector instructions",
1362 false, false)
1363 INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
1364 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
1365 INITIALIZE_PASS_END(
1366 InterleavedLoadCombine, DEBUG_TYPE,
1367 "Combine interleaved loads into wide loads and shufflevector instructions",
1368 false, false)
1369
1372 FunctionPass *llvm::createInterleavedLoadCombinePass() {
1373 auto P = new InterleavedLoadCombine();
1374 return P;
1375}