#define DEBUG_TYPE "interleaved-load-combine"
STATISTIC(NumInterleavedLoadCombine, "Number of combined loads");
/// Option to disable the pass.
static cl::opt<bool> DisableInterleavedLoadCombine(
    "disable-" DEBUG_TYPE, cl::init(false), cl::Hidden,
    cl::desc("Disable combining of interleaved loads"));
struct InterleavedLoadCombineImpl {
public:
  InterleavedLoadCombineImpl(Function &F, DominatorTree &DT, MemorySSA &MSSA,
                             const TargetTransformInfo &TTI,
                             const TargetMachine &TM)
      : F(F), DT(DT), MSSA(MSSA),
        TLI(*TM.getSubtargetImpl(F)->getTargetLowering()), TTI(TTI) {}

  /// Scan the function for candidates and combine them where possible.
  bool run();

private:
  /// Find the load instruction in LIs that comes first in its basic block;
  /// all loads in LIs are expected to be in the same block.
  LoadInst *findFirstLoad(const std::set<LoadInst *> &LIs);

  /// Replace the candidates of one interleaved load. Returns true if the
  /// replacement was performed.
  bool combine(std::list<VectorInfo> &InterleavedLoad,
               OptimizationRemarkEmitter &ORE);

  /// Given a list of candidates for a given interleave factor, move a subset
  /// that forms a Factor-way interleaved load into InterleavedLoad.
  bool findPattern(std::list<VectorInfo> &Candidates,
                   std::list<VectorInfo> &InterleavedLoad, unsigned Factor,
                   const DataLayout &DL);

  /// Analyses and target information used by the pass.
  Function &F;
  DominatorTree &DT;
  MemorySSA &MSSA;
  const TargetLowering &TLI;
  const TargetTransformInfo &TTI;
};
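// Pointer arithmetic feeding the loads is summarized by the Polynomial class
// below. A Polynomial models a byte offset of the form B(V) + A, where V is
// at most one LLVM value, B is a recorded chain of operations (shifts,
// multiplications, extensions) applied to it, and A is a constant APInt.
// ErrorMSBs counts how many most-significant bits of the result are not
// known to be exact; only offsets whose difference is fully defined and zero
// are treated as provably equal.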
class Polynomial {
  /// Operations that can be applied to the variable part B.
  enum BOps { LShr, Mul, SExt, Trunc };

  /// Number of undefined most-significant bits of the result.
  unsigned ErrorMSBs = (unsigned)-1;

  /// The (optional) variable the polynomial depends on.
  Value *V = nullptr;

  /// The chain of operations applied to V.
  SmallVector<std::pair<BOps, APInt>, 4> B;

  /// The constant part of the polynomial.
  APInt A;

public:
  Polynomial(Value *V) : V(V) {
    IntegerType *Ty = dyn_cast<IntegerType>(V->getType());
    if (Ty) {
      ErrorMSBs = 0;
      A = APInt(Ty->getBitWidth(), 0);
    }
  }

  Polynomial(const APInt &A, unsigned ErrorMSBs = 0)
      : ErrorMSBs(ErrorMSBs), A(A) {}

  Polynomial(unsigned BitWidth, uint64_t A, unsigned ErrorMSBs = 0)
      : ErrorMSBs(ErrorMSBs), A(BitWidth, A) {}

  Polynomial() = default;
  /// Increment the number of undefined most-significant bits, clamping at the
  /// bit width.
  void incErrorMSBs(unsigned amt) {
    if (ErrorMSBs == (unsigned)-1)
      return;

    ErrorMSBs += amt;
    if (ErrorMSBs > A.getBitWidth())
      ErrorMSBs = A.getBitWidth();
  }
  /// Decrement the number of undefined most-significant bits, clamping at
  /// zero.
  void decErrorMSBs(unsigned amt) {
    if (ErrorMSBs == (unsigned)-1)
      return;

    if (ErrorMSBs > amt)
      ErrorMSBs -= amt;
    else
      ErrorMSBs = 0;
  }
  /// Apply an addition to the polynomial.
  Polynomial &add(const APInt &C) {
    // Adding a constant only changes A: the low-order bits of the sum depend
    // only on low-order bits of the operands, so the error-bit count is
    // unchanged.
    if (C.getBitWidth() != A.getBitWidth()) {
      ErrorMSBs = (unsigned)-1;
      return *this;
    }

    A += C;
    return *this;
  }
  /// Apply a multiplication to the polynomial.
  Polynomial &mul(const APInt &C) {
    if (C.getBitWidth() != A.getBitWidth()) {
      ErrorMSBs = (unsigned)-1;
      return *this;
    }

    // Trailing zero bits of the factor act as a left shift, which pushes
    // undefined most-significant bits out of the value.
    decErrorMSBs(C.countr_zero());

    A *= C;
    pushBOperation(Mul, C);
    return *this;
  }
  /// Apply a logical shift right to the polynomial.
  Polynomial &lshr(const APInt &C) {
    if (C.getBitWidth() != A.getBitWidth()) {
      ErrorMSBs = (unsigned)-1;
      return *this;
    }

    // Shifting by the full bit width (or more) yields zero.
    unsigned shiftAmt = C.getZExtValue();
    if (shiftAmt >= C.getBitWidth())
      return mul(APInt(C.getBitWidth(), 0));

    // The shift only distributes over B(V) + A if the low shiftAmt bits of A
    // are zero; otherwise the whole result becomes undefined. Even then, the
    // top shiftAmt bits of the result are unknown.
    if (A.countr_zero() < shiftAmt)
      ErrorMSBs = A.getBitWidth();
    else
      incErrorMSBs(shiftAmt);

    pushBOperation(LShr, C);
    A = A.lshr(shiftAmt);

    return *this;
  }
  /// Apply a sign extension or truncation to the polynomial.
  Polynomial &sextOrTrunc(unsigned n) {
    if (n < A.getBitWidth()) {
      // Truncation removes the most significant (and thus undefined) bits.
      decErrorMSBs(A.getBitWidth() - n);
      A = A.trunc(n);
      pushBOperation(Trunc, APInt(sizeof(n) * 8, n));
    }
    if (n > A.getBitWidth()) {
      // The newly introduced bits are undefined.
      incErrorMSBs(n - A.getBitWidth());
      A = A.sext(n);
      pushBOperation(SExt, APInt(sizeof(n) * 8, n));
    }

    return *this;
  }
  /// Test whether the polynomial has a variable part B(V).
  bool isFirstOrder() const { return V != nullptr; }
  /// Test whether the variable parts of two polynomials are comparable.
  bool isCompatibleTo(const Polynomial &o) const {
    // The polynomials use different bit widths.
    if (A.getBitWidth() != o.A.getBitWidth())
      return false;

    // If neither polynomial has a variable part, they are trivially
    // compatible.
    if (!isFirstOrder() && !o.isFirstOrder())
      return true;

    // The index variable is different.
    if (V != o.V)
      return false;

    // The chains of operations applied to V must be identical.
    if (B.size() != o.B.size())
      return false;

    auto *ob = o.B.begin();
    for (const auto &b : B) {
      if (b != *ob)
        return false;
      ob++;
    }

    return true;
  }
  /// Subtract two polynomials; return an undefined polynomial if this is not
  /// possible.
  Polynomial operator-(const Polynomial &o) const {
    if (!isCompatibleTo(o))
      return Polynomial();

    // With equal variable parts, B(V) cancels out and only the difference of
    // the constant parts remains.
    return Polynomial(A - o.A, std::max(ErrorMSBs, o.ErrorMSBs));
  }

  /// Subtract a constant from the polynomial.
  Polynomial operator-(uint64_t C) const {
    Polynomial Result(*this);
    Result.A -= C;
    return Result;
  }

  /// Add a constant to the polynomial.
  Polynomial operator+(uint64_t C) const {
    Polynomial Result(*this);
    Result.A += C;
    return Result;
  }
  /// Returns true if it can be proven that two polynomials are equal.
  bool isProvenEqualTo(const Polynomial &o) {
    // Subtract both polynomials and check that the difference is fully
    // defined and zero.
    Polynomial r = *this - o;
    return (r.ErrorMSBs == 0) && (!r.isFirstOrder()) && (r.A.isZero());
  }
  /// Print the polynomial into a stream.
  void print(raw_ostream &OS) const {
    OS << "[{#ErrBits:" << ErrorMSBs << "} ";

    if (V) {
      OS << "(" << *V << ") ";
      for (const auto &b : B) {
        // ... the operation kind (LShr, Mul, SExt, Trunc) is printed here ...
        OS << b.second << ") ";
      }
    }

    OS << "+ " << A << "]";
  }
private:
  /// Record an operation applied to the variable part B.
  void pushBOperation(const BOps Op, const APInt &C) {
    if (isFirstOrder()) {
      B.push_back(std::make_pair(Op, C));
      return;
    }
  }
};
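// A VectorInfo records, for one vector value, where each of its elements
// comes from: the offset Polynomial of the element relative to a common base
// pointer and, where applicable, the load instruction that produced it. The
// computeFrom* helpers below build this information bottom-up through loads,
// bitcasts and shufflevectors.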
struct VectorInfo {
private:
  VectorInfo(const VectorInfo &c) : VTy(c.VTy) {
    llvm_unreachable(
        "Copying VectorInfo is neither implemented nor necessary,");
  }

public:
  /// Information about a single vector element.
  struct ElementInfo {
    /// Offset polynomial of the element.
    Polynomial Ofs;

    /// The load instruction providing this element, or nullptr if the load's
    /// pointer does not point directly at it.
    LoadInst *LI;

    ElementInfo(Polynomial Offset = Polynomial(), LoadInst *LI = nullptr)
        : Ofs(Offset), LI(LI) {}
  };
  /// Participating load instructions.
  std::set<LoadInst *> LIs;

  /// Participating instructions.
  std::set<Instruction *> Is;

  /// Basic block containing the participating loads.
  BasicBlock *BB = nullptr;

  /// Common pointer value of all participating loads.
  Value *PV = nullptr;

  /// Final shufflevector instruction.
  ShuffleVectorInst *SVI = nullptr;

  /// Offset information for each vector element.
  ElementInfo *EI;

  /// Vector type.
  FixedVectorType *const VTy;

  VectorInfo(FixedVectorType *VTy) : VTy(VTy) {
    EI = new ElementInfo[VTy->getNumElements()];
  }

  VectorInfo &operator=(const VectorInfo &other) = delete;

  virtual ~VectorInfo() { delete[] EI; }

  unsigned getDimension() const { return VTy->getNumElements(); }
  /// Test whether the element offsets form a Factor-way interleaved access.
  bool isInterleaved(unsigned Factor, const DataLayout &DL) const {
    unsigned Size = DL.getTypeAllocSize(VTy->getElementType());
    for (unsigned i = 1; i < getDimension(); i++) {
      if (!EI[i].Ofs.isProvenEqualTo(EI[0].Ofs + i * Factor * Size)) {
        return false;
      }
    }
    return true;
  }
  /// Recursively compute the vector information of a value by dispatching on
  /// its defining instruction.
  static bool compute(Value *V, VectorInfo &Result, const DataLayout &DL) {
    if (auto *SVI = dyn_cast<ShuffleVectorInst>(V))
      return computeFromSVI(SVI, Result, DL);
    if (auto *LI = dyn_cast<LoadInst>(V))
      return computeFromLI(LI, Result, DL);
    if (auto *BCI = dyn_cast<BitCastInst>(V))
      return computeFromBCI(BCI, Result, DL);
    return false;
  }
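  // A bitcast from a vector with wider elements splits every old element into
  // Factor new elements. The offsets of the new elements are the old
  // element's offset plus multiples of the new element size; only the first
  // new element keeps the originating load instruction.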
  static bool computeFromBCI(BitCastInst *BCI, VectorInfo &Result,
                             const DataLayout &DL) {
    Instruction *Op = dyn_cast<Instruction>(BCI->getOperand(0));
    if (!Op)
      return false;

    FixedVectorType *VTy = dyn_cast<FixedVectorType>(Op->getType());
    if (!VTy)
      return false;

    // The cast must split every old element into a whole number of new,
    // smaller elements.
    unsigned Factor = Result.VTy->getNumElements() / VTy->getNumElements();
    unsigned NewSize = DL.getTypeAllocSize(Result.VTy->getElementType());
    unsigned OldSize = DL.getTypeAllocSize(VTy->getElementType());

    if (NewSize * Factor != OldSize)
      return false;

    VectorInfo Old(VTy);
    if (!compute(Op, Old, DL))
      return false;

    for (unsigned i = 0; i < Result.VTy->getNumElements(); i += Factor) {
      for (unsigned j = 0; j < Factor; j++) {
        Result.EI[i + j] =
            ElementInfo(Old.EI[i / Factor].Ofs + j * NewSize,
                        j == 0 ? Old.EI[i / Factor].LI : nullptr);
      }
    }

    Result.BB = Old.BB;
    Result.PV = Old.PV;
    Result.LIs.insert(Old.LIs.begin(), Old.LIs.end());
    Result.Is.insert(Old.Is.begin(), Old.Is.end());
    Result.Is.insert(BCI);
    Result.SVI = nullptr;

    return true;
  }
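  // For a shufflevector, the element information of both input vectors is
  // computed first; the shuffle mask then selects, for every result element,
  // the ElementInfo of the corresponding input element. Undefined mask
  // entries yield empty ElementInfo entries.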
  static bool computeFromSVI(ShuffleVectorInst *SVI, VectorInfo &Result,
                             const DataLayout &DL) {
    FixedVectorType *ArgTy =
        cast<FixedVectorType>(SVI->getOperand(0)->getType());

    // Compute the left-hand vector information.
    VectorInfo LHS(ArgTy);
    if (!compute(SVI->getOperand(0), LHS, DL))
      LHS.BB = nullptr;

    // Compute the right-hand vector information.
    VectorInfo RHS(ArgTy);
    if (!compute(SVI->getOperand(1), RHS, DL))
      RHS.BB = nullptr;

    // ... checks that at least one operand is usable and that both belong to
    // the same basic block are elided here ...

    if (LHS.BB) {
      Result.LIs.insert(LHS.LIs.begin(), LHS.LIs.end());
      Result.Is.insert(LHS.Is.begin(), LHS.Is.end());
    }
    if (RHS.BB) {
      Result.LIs.insert(RHS.LIs.begin(), RHS.LIs.end());
      Result.Is.insert(RHS.Is.begin(), RHS.Is.end());
    }
    Result.Is.insert(SVI);
    Result.SVI = SVI;

    // Translate every shuffle-mask index into element information.
    int j = 0;
    for (int i : SVI->getShuffleMask()) {
      assert((i < 2 * (int)ArgTy->getNumElements()) &&
             "Invalid ShuffleVectorInst (index out of bounds)");

      if (i < 0) {
        // Undefined mask element.
        Result.EI[j] = ElementInfo();
      } else if (i < (int)ArgTy->getNumElements()) {
        // Element taken from the left-hand operand.
        Result.EI[j] = LHS.BB ? LHS.EI[i] : ElementInfo();
      } else {
        // Element taken from the right-hand operand.
        Result.EI[j] =
            RHS.BB ? RHS.EI[i - ArgTy->getNumElements()] : ElementInfo();
      }
      j++;
    }

    return true;
  }
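  // For a plain vector load, the address is decomposed into a base pointer
  // and an offset Polynomial. Element i is then located at that offset plus
  // i times the element allocation size; only element 0 carries the load
  // instruction itself.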
  static bool computeFromLI(LoadInst *LI, VectorInfo &Result,
                            const DataLayout &DL) {
    Value *BasePtr;
    Polynomial Offset;

    // Volatile and atomic loads must not be touched.
    if (LI->isVolatile() || LI->isAtomic())
      return false;

    if (!DL.typeSizeEqualsStoreSize(Result.VTy->getElementType()))
      return false;

    // Compute the base pointer and the offset polynomial of the load address.
    computePolynomialFromPointer(*LI->getPointerOperand(), Offset, BasePtr,
                                 DL);

    Result.BB = LI->getParent();
    Result.PV = BasePtr;
    Result.LIs.insert(LI);
    Result.Is.insert(LI);

    for (unsigned i = 0; i < Result.getDimension(); i++) {
      Value *Idx[2] = {ConstantInt::get(Type::getInt32Ty(LI->getContext()), 0),
                       ConstantInt::get(Type::getInt32Ty(LI->getContext()), i)};
      int64_t Ofs = DL.getIndexedOffsetInType(Result.VTy, Idx);
      Result.EI[i] = ElementInfo(Offset + Ofs, i == 0 ? LI : nullptr);
    }

    return true;
  }
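  // The following helpers trace index computations through the IR. Additions
  // and logical right shifts by constants are folded into the Polynomial;
  // every other operation turns the value into an opaque first-order
  // variable.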
  static void computePolynomialBinOp(BinaryOperator &BO, Polynomial &Result) {
    Value *LHS = BO.getOperand(0);
    Value *RHS = BO.getOperand(1);

    // The constant operand is expected on the right-hand side. (The original
    // code also swaps the operands of commutative operations when the
    // constant appears on the left; that handling is elided here.)
    ConstantInt *C = dyn_cast<ConstantInt>(RHS);

    switch (BO.getOpcode()) {
    case Instruction::Add:
      if (!C)
        break;
      computePolynomial(*LHS, Result);
      Result.add(C->getValue());
      return;

    case Instruction::LShr:
      if (!C)
        break;
      computePolynomial(*LHS, Result);
      Result.lshr(C->getValue());
      return;

    default:
      break;
    }

    // Everything else becomes an opaque first-order polynomial.
    Result = Polynomial(&BO);
  }
  static void computePolynomial(Value &V, Polynomial &Result) {
    if (auto *BO = dyn_cast<BinaryOperator>(&V))
      computePolynomialBinOp(*BO, Result);
    else
      Result = Polynomial(&V);
  }
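  // Pointer operands are handled separately: bitcasts are looked through,
  // GEPs with leading constant indices and at most one trailing variable
  // index are turned into "base pointer + offset polynomial", and anything
  // else becomes a base pointer with a zero offset.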
  static void computePolynomialFromPointer(Value &Ptr, Polynomial &Result,
                                           Value *&BasePtr,
                                           const DataLayout &DL) {
    // Not a pointer type? Return an undefined polynomial.
    PointerType *PtrTy = dyn_cast<PointerType>(Ptr.getType());
    if (!PtrTy) {
      Result = Polynomial();
      BasePtr = nullptr;
      return;
    }
    unsigned PointerBits =
        DL.getIndexSizeInBits(PtrTy->getPointerAddressSpace());

    // Look through pointer casts.
    if (isa<CastInst>(&Ptr)) {
      CastInst &CI = *cast<CastInst>(&Ptr);
      switch (CI.getOpcode()) {
      case Instruction::BitCast:
        computePolynomialFromPointer(*CI.getOperand(0), Result, BasePtr, DL);
        break;
      default:
        BasePtr = &Ptr;
        Result = Polynomial(PointerBits, 0);
        break;
      }
    } else if (isa<GetElementPtrInst>(&Ptr)) {
      GetElementPtrInst &GEP = *cast<GetElementPtrInst>(&Ptr);
      APInt BaseOffset(PointerBits, 0);

      // A GEP with only constant indices contributes a constant offset.
      if (GEP.accumulateConstantOffset(DL, BaseOffset)) {
        Result = Polynomial(BaseOffset);
        BasePtr = GEP.getPointerOperand();
        return;
      }

      // Otherwise collect the leading constant indices; only the last index
      // may be variable.
      SmallVector<Value *, 4> Indices;
      unsigned idxOperand, e;
      for (idxOperand = 1, e = GEP.getNumOperands(); idxOperand < e;
           idxOperand++) {
        ConstantInt *IDX = dyn_cast<ConstantInt>(GEP.getOperand(idxOperand));
        if (!IDX)
          break;
        Indices.push_back(IDX);
      }

      if (idxOperand + 1 != e) {
        Result = Polynomial();
        BasePtr = nullptr;
        return;
      }

      // Compute the polynomial of the variable index, ...
      computePolynomial(*GEP.getOperand(idxOperand), Result);

      // ... scale it by the size of the indexed element type, and add the
      // constant offset contributed by the leading indices.
      BaseOffset =
          DL.getIndexedOffsetInType(GEP.getSourceElementType(), Indices);
      unsigned ResultSize = DL.getTypeAllocSize(GEP.getResultElementType());
      Result.sextOrTrunc(PointerBits);
      Result.mul(APInt(PointerBits, ResultSize));
      Result.add(BaseOffset);
      BasePtr = GEP.getPointerOperand();
    } else {
      // Any other pointer value is itself the base pointer with offset zero.
      BasePtr = &Ptr;
      Result =
          Polynomial(DL.getIndexSizeInBits(PtrTy->getPointerAddressSpace()), 0);
    }
  }
  /// Print the element offsets into a stream.
  void print(raw_ostream &OS) const {
    for (unsigned i = 0; i < getDimension(); i++)
      OS << ((i == 0) ? "[" : ", ") << EI[i].Ofs;
    OS << "]";
  }
};
bool InterleavedLoadCombineImpl::findPattern(
    std::list<VectorInfo> &Candidates, std::list<VectorInfo> &InterleavedLoad,
    unsigned Factor, const DataLayout &DL) {
  for (auto C0 = Candidates.begin(), E0 = Candidates.end(); C0 != E0; ++C0) {
    unsigned i;
    unsigned Size = DL.getTypeAllocSize(C0->VTy->getElementType());

    // Iterators pointing to the candidates forming the interleaved load.
    std::vector<std::list<VectorInfo>::iterator> Res(Factor, Candidates.end());

    for (auto C = Candidates.begin(), E = Candidates.end(); C != E; C++) {
      if (C->VTy != C0->VTy)
        continue;
      if (C->BB != C0->BB)
        continue;
      if (C->PV != C0->PV)
        continue;

      // Check whether C is one of the Factor - 1 partner lines of C0.
      for (i = 1; i < Factor; i++) {
        if (C->EI[0].Ofs.isProvenEqualTo(C0->EI[0].Ofs + i * Size)) {
          Res[i] = C;
        }
      }

      // Stop as soon as all partners have been found.
      for (i = 1; i < Factor; i++) {
        if (Res[i] == Candidates.end())
          break;
      }

      if (i == Factor) {
        Res[0] = C0;
        break;
      }
    }

    if (Res[0] != Candidates.end()) {
      // Move the matching candidates into the output list.
      for (unsigned i = 0; i < Factor; i++) {
        InterleavedLoad.splice(InterleavedLoad.end(), Candidates, Res[i]);
      }

      return true;
    }
  }
  return false;
}
LoadInst *
InterleavedLoadCombineImpl::findFirstLoad(const std::set<LoadInst *> &LIs) {
  assert(!LIs.empty() && "No load instructions given.");

  // All loads are in the same basic block; scan it for the first instruction
  // contained in LIs.
  BasicBlock *BB = (*LIs.begin())->getParent();
  BasicBlock::iterator FLI = llvm::find_if(
      *BB, [&LIs](Instruction &I) -> bool { return is_contained(LIs, &I); });
  assert(FLI != BB->end());

  return cast<LoadInst>(&*FLI);
}
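// combine() performs the actual rewrite for one matched group. It first
// checks that the transformation is legal: all intermediate instructions
// between the loads and the final shufflevectors must become dead, MemorySSA
// must show that no interfering memory operation can change the loaded
// values, and the new wide load must dominate the shuffles it feeds. It then
// consults the cost model and, if profitable, emits one wide load plus one
// strided shufflevector per original lane.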
bool InterleavedLoadCombineImpl::combine(std::list<VectorInfo> &InterleavedLoad,
                                         OptimizationRemarkEmitter &ORE) {
  // The insertion point is the load providing the first element of the
  // interleaved access.
  LoadInst *InsertionPoint = InterleavedLoad.front().EI[0].LI;
  if (!InsertionPoint)
    return false;

  std::set<LoadInst *> LIs;
  std::set<Instruction *> Is;
  std::set<Instruction *> SVIs;

  unsigned Factor = InterleavedLoad.size();

  // Collect the participating loads, all other participating instructions,
  // and the final shufflevector of every lane.
  for (auto &VI : InterleavedLoad) {
    LIs.insert(VI.LIs.begin(), VI.LIs.end());
    Is.insert(VI.Is.begin(), VI.Is.end());
    SVIs.insert(VI.SVI);
  }

  // Every intermediate instruction must become dead after the rewrite: apart
  // from the final shufflevectors, which get replaced, no participating
  // instruction may have users outside the collected set.
  for (const auto &I : Is) {
    if (SVIs.find(I) != SVIs.end())
      continue;

    for (auto *U : I->users()) {
      if (Is.find(dyn_cast<Instruction>(U)) == Is.end())
        return false;
    }
  }

  // Use MemorySSA to make sure no memory-modifying instruction between the
  // first load and the other loads can change the loaded values.
  LoadInst *First = findFirstLoad(LIs);
  for (auto *LI : LIs) {
    if (!MSSA.dominates(MSSA.getMemoryAccess(LI)->getDefiningAccess(),
                        MSSA.getMemoryAccess(First)))
      return false;
  }
  assert(!LIs.empty() && "There are no LoadInst to combine");

  // The wide load is created at the insertion point, so it has to dominate
  // every final shufflevector that will be rewritten to use it.
  for (auto &VI : InterleavedLoad) {
    if (!DT.dominates(InsertionPoint, VI.SVI))
      return false;
  }

  // Ask the cost model whether the wide load plus the extraction shuffles
  // are at least as cheap as the original code.
  Type *ETy = InterleavedLoad.front().SVI->getType()->getElementType();
  unsigned ElementsPerSVI =
      InterleavedLoad.front().SVI->getType()->getNumElements();
  FixedVectorType *ILTy = FixedVectorType::get(ETy, Factor * ElementsPerSVI);

  auto Indices = llvm::to_vector<4>(llvm::seq<unsigned>(0, Factor));
  const TargetTransformInfo::TargetCostKind CostKind =
      TargetTransformInfo::TCK_SizeAndLatency;
  InstructionCost InterleavedCost = TTI.getInterleavedMemoryOpCost(
      Instruction::Load, ILTy, Factor, Indices, InsertionPoint->getAlign(),
      InsertionPoint->getPointerAddressSpace(), CostKind);
  // ... bail out if InterleavedCost is not smaller than the cost of the
  // original loads and shuffles ...

  // Create the wide load and register it with MemorySSA.
  IRBuilder<> Builder(InsertionPoint);
  Value *Ptr = InsertionPoint->getPointerOperand();
  auto LI = Builder.CreateAlignedLoad(ILTy, Ptr, InsertionPoint->getAlign(),
                                      "interleaved.wide.load");
  auto MSSAU = MemorySSAUpdater(&MSSA);
  MemoryUse *MSSALoad = cast<MemoryUse>(MSSAU.createMemoryAccessBefore(
      LI, nullptr, MSSA.getMemoryAccess(InsertionPoint)));
  MSSAU.insertUse(MSSALoad, /*RenameUses=*/true);

  // Replace every final shufflevector with a strided shuffle of the wide
  // load.
  unsigned i = 0;
  for (auto &VI : InterleavedLoad) {
    SmallVector<int, 4> Mask;
    for (unsigned j = 0; j < ElementsPerSVI; j++)
      Mask.push_back(i + j * Factor);

    Builder.SetInsertPoint(VI.SVI);
    auto SVI = Builder.CreateShuffleVector(LI, Mask, "interleaved.shuffle");
    VI.SVI->replaceAllUsesWith(SVI);
    i++;
  }

  NumInterleavedLoadCombine++;
  ORE.emit([&]() {
    return OptimizationRemark(DEBUG_TYPE, "Combined Interleaved Load", LI)
           << "Load interleaved combined with factor "
           << ore::NV("Factor", Factor);
  });

  return true;
}
bool InterleavedLoadCombineImpl::run() {
  OptimizationRemarkEmitter ORE(&F);
  bool changed = false;

  unsigned MaxFactor = TLI.getMaxSupportedInterleaveFactor();

  auto &DL = F.getDataLayout();

  // Start with the highest factor to avoid combining and re-combining the
  // same candidates several times.
  for (unsigned Factor = MaxFactor; Factor >= 2; Factor--) {
    std::list<VectorInfo> Candidates;

    for (BasicBlock &BB : F) {
      for (Instruction &I : BB) {
        if (auto *SVI = dyn_cast<ShuffleVectorInst>(&I)) {
          // Scalable vectors are not handled by this pass.
          if (!isa<FixedVectorType>(SVI->getType()))
            continue;

          Candidates.emplace_back(cast<FixedVectorType>(SVI->getType()));

          if (!VectorInfo::computeFromSVI(SVI, Candidates.back(), DL)) {
            Candidates.pop_back();
            continue;
          }

          if (!Candidates.back().isInterleaved(Factor, DL)) {
            Candidates.pop_back();
          }
        }
      }
    }

    std::list<VectorInfo> InterleavedLoad;
    while (findPattern(Candidates, InterleavedLoad, Factor, DL)) {
      if (combine(InterleavedLoad, ORE)) {
        changed = true;
      } else {
        // Drop the first candidate of the failed group but put the others
        // back on the list and continue searching.
        Candidates.splice(Candidates.begin(), InterleavedLoad,
                          std::next(InterleavedLoad.begin()),
                          InterleavedLoad.end());
      }
      InterleavedLoad.clear();
    }
  }

  return changed;
}
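// The remainder of the file wires the implementation into the pass managers:
// a legacy FunctionPass wrapper that pulls its analyses from the pass
// manager, a new-pass-manager entry point, and the pass registration
// boilerplate.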
struct InterleavedLoadCombine : public FunctionPass {
  static char ID;

  InterleavedLoadCombine() : FunctionPass(ID) {
    initializeInterleavedLoadCombinePass(*PassRegistry::getPassRegistry());
  }

  StringRef getPassName() const override {
    return "Interleaved Load Combine Pass";
  }

  bool runOnFunction(Function &F) override {
    if (DisableInterleavedLoadCombine)
      return false;

    auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
    if (!TPC)
      return false;

    return InterleavedLoadCombineImpl(
               F, getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
               getAnalysis<MemorySSAWrapperPass>().getMSSA(),
               getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F),
               TPC->getTM<TargetMachine>())
        .run();
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<MemorySSAWrapperPass>();
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addRequired<TargetTransformInfoWrapperPass>();
    FunctionPass::getAnalysisUsage(AU);
  }
};
PreservedAnalyses
InterleavedLoadCombinePass::run(Function &F, FunctionAnalysisManager &FAM) {
  auto &DT = FAM.getResult<DominatorTreeAnalysis>(F);
  auto &MemSSA = FAM.getResult<MemorySSAAnalysis>(F).getMSSA();
  auto &TTI = FAM.getResult<TargetIRAnalysis>(F);
  bool Changed = InterleavedLoadCombineImpl(F, DT, MemSSA, TTI, *TM).run();
  return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
}
char InterleavedLoadCombine::ID = 0;

INITIALIZE_PASS_BEGIN(
    InterleavedLoadCombine, DEBUG_TYPE,
    "Combine interleaved loads into wide loads and shufflevector instructions",
    false, false)
INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_END(
    InterleavedLoadCombine, DEBUG_TYPE,
    "Combine interleaved loads into wide loads and shufflevector instructions",
    false, false)

FunctionPass *llvm::createInterleavedLoadCombinePass() {
  auto P = new InterleavedLoadCombine();
  return P;
}