class X86InterleavedAccessGroup {
  /// Reference to the wide-load instruction of the interleaved access group.
  Instruction *const Inst;

  /// Reference to the shuffle(s), consumer(s) of the (load) 'Inst'.
  ArrayRef<ShuffleVectorInst *> Shuffles;

  /// Reference to the starting index of each user-shuffle.
  ArrayRef<unsigned> Indices;

  /// The interleaving stride factor.
  const unsigned Factor;

  /// Reference to the underlying target.
  const X86Subtarget &Subtarget;

  const DataLayout &DL;
  IRBuilder<> &Builder;

  /// Breaks down a wide vector \p Inst into \p NumSubVectors sub-vectors of
  /// type \p T, returned in \p DecomposedVectors.
  void decompose(Instruction *Inst, unsigned NumSubVectors, FixedVectorType *T,
                 SmallVectorImpl<Instruction *> &DecomposedVectors);

  /// Performs 4x4 matrix transposition on \p InputVectors and returns the
  /// transposed vectors in \p TransposedMatrix.
  void transpose_4x4(ArrayRef<Instruction *> InputVectors,
                     SmallVectorImpl<Value *> &TransposedMatrix);
  void interleave8bitStride4(ArrayRef<Instruction *> InputVectors,
                             SmallVectorImpl<Value *> &TransposedMatrix,
                             unsigned NumSubVecElems);
  void interleave8bitStride4VF8(ArrayRef<Instruction *> InputVectors,
                                SmallVectorImpl<Value *> &TransposedMatrix);
  void interleave8bitStride3(ArrayRef<Instruction *> InputVectors,
                             SmallVectorImpl<Value *> &TransposedMatrix,
                             unsigned NumSubVecElems);
  void deinterleave8bitStride3(ArrayRef<Instruction *> InputVectors,
                               SmallVectorImpl<Value *> &TransposedMatrix,
                               unsigned NumSubVecElems);

public:
  /// Forms an interleaved access group from the wide instruction \p I, its
  /// user shuffles \p Shuffs with starting indices \p Ind, the interleaving
  /// stride factor \p F, and the target information \p STarget.
  explicit X86InterleavedAccessGroup(Instruction *I,
                                     ArrayRef<ShuffleVectorInst *> Shuffs,
                                     ArrayRef<unsigned> Ind, const unsigned F,
                                     const X86Subtarget &STarget,
                                     IRBuilder<> &B)
      : Inst(I), Shuffles(Shuffs), Indices(Ind), Factor(F), Subtarget(STarget),
        DL(Inst->getDataLayout()), Builder(B) {}

  /// Returns true if this interleaved access group can be lowered into
  /// x86-specific instructions/intrinsics, false otherwise.
  bool isSupported() const;

  /// Lowers this interleaved access group into X86-specific
  /// instructions/intrinsics.
  bool lowerIntoOptimizedSequence();
};
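
// For illustration: a factor-4 interleaved load group this class can lower
// looks like (names hypothetical):
//   %wide.vec = load <16 x i64>, ptr %ptr
//   %v0 = shufflevector <16 x i64> %wide.vec, poison, <0, 4, 8, 12>
//   %v1 = shufflevector <16 x i64> %wide.vec, poison, <1, 5, 9, 13>
//   %v2 = shufflevector <16 x i64> %wide.vec, poison, <2, 6, 10, 14>
//   %v3 = shufflevector <16 x i64> %wide.vec, poison, <3, 7, 11, 15>
// Each %vN is one of 'Shuffles' and N is its entry in 'Indices'.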
bool X86InterleavedAccessGroup::isSupported() const {
  VectorType *ShuffleVecTy = Shuffles[0]->getType();
  Type *ShuffleEltTy = ShuffleVecTy->getElementType();
  unsigned ShuffleElemSize = DL.getTypeSizeInBits(ShuffleEltTy);
  unsigned WideInstSize;

  // Currently, lowering is supported only for factors 3 and 4 on AVX.
  if (!Subtarget.hasAVX() || (Factor != 4 && Factor != 3))
    return false;

  if (isa<LoadInst>(Inst))
    WideInstSize = DL.getTypeSizeInBits(Inst->getType());
  else
    WideInstSize = DL.getTypeSizeInBits(Shuffles[0]->getType());

  // Stride 4: 64-bit elements in a 1024-bit wide group (loads and stores).
  if (ShuffleElemSize == 64 && WideInstSize == 1024 && Factor == 4)
    return true;

  // Stride 4: 8-bit elements, stores only.
  if (ShuffleElemSize == 8 && isa<StoreInst>(Inst) && Factor == 4 &&
      (WideInstSize == 256 || WideInstSize == 512 || WideInstSize == 1024 ||
       WideInstSize == 2048))
    return true;

  // Stride 3: 8-bit elements, loads.
  if (ShuffleElemSize == 8 && Factor == 3 &&
      (WideInstSize == 384 || WideInstSize == 768 || WideInstSize == 1536))
    return true;

  return false;
}
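
// decompose() below splits the group's wide value into its sub-vectors.
// For a store group the wide shufflevector is sliced with sequential masks
// (via createSequentialMask); for a load group the wide load is re-emitted
// as several narrower aligned loads.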
void X86InterleavedAccessGroup::decompose(
    Instruction *VecInst, unsigned NumSubVectors, FixedVectorType *SubVecTy,
    SmallVectorImpl<Instruction *> &DecomposedVectors) {
  assert((isa<LoadInst>(VecInst) || isa<ShuffleVectorInst>(VecInst)) &&
         "Expected Load or Shuffle");

  Type *VecWidth = VecInst->getType();
  assert(VecWidth->isVectorTy() &&
         DL.getTypeSizeInBits(VecWidth) >=
             DL.getTypeSizeInBits(SubVecTy) * NumSubVectors &&
         "Invalid Inst-size!!!");

  // A wide shufflevector is peeled into N (= NumSubVectors) sequential
  // sub-shuffles of type T (= SubVecTy).
  if (auto *SVI = dyn_cast<ShuffleVectorInst>(VecInst)) {
    Value *Op0 = SVI->getOperand(0);
    Value *Op1 = SVI->getOperand(1);

    for (unsigned i = 0; i < NumSubVectors; ++i)
      DecomposedVectors.push_back(
          cast<ShuffleVectorInst>(Builder.CreateShuffleVector(
              Op0, Op1,
              createSequentialMask(Indices[i], SubVecTy->getNumElements(),
                                   0))));
    return;
  }

  // Decompose the wide load into N smaller loads.
  LoadInst *LI = cast<LoadInst>(VecInst);
  Type *VecBaseTy;
  unsigned int NumLoads = NumSubVectors;
  unsigned VecLength = DL.getTypeSizeInBits(VecWidth);
  Value *VecBasePtr = LI->getPointerOperand();
  if (VecLength == 768 || VecLength == 1536) {
    // Stride-3 groups of 768/1536 bits are loaded in 128-bit chunks.
    VecBaseTy = FixedVectorType::get(Type::getInt8Ty(LI->getContext()), 16);
    NumLoads = NumSubVectors * (VecLength / 384);
  } else {
    VecBaseTy = SubVecTy;
  }
  assert(VecBaseTy->getPrimitiveSizeInBits().isKnownMultipleOf(8) &&
         "VecBaseTy's size must be a multiple of 8");
  const Align FirstAlignment = LI->getAlign();
  const Align SubsequentAlignment = commonAlignment(
      FirstAlignment, VecBaseTy->getPrimitiveSizeInBits().getFixedValue() / 8);
  Align Alignment = FirstAlignment;
  for (unsigned i = 0; i < NumLoads; i++) {
    Value *NewBasePtr =
        Builder.CreateGEP(VecBaseTy, VecBasePtr, Builder.getInt32(i));
    Instruction *NewLoad =
        Builder.CreateAlignedLoad(VecBaseTy, NewBasePtr, Alignment);
    DecomposedVectors.push_back(NewLoad);
    Alignment = SubsequentAlignment;
  }
}
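
// Worked example for the load path above: a 1536-bit stride-3 group has
// VecLength == 1536, so NumLoads = 3 * (1536 / 384) = 12, i.e. twelve
// <16 x i8> (128-bit) loads at consecutive offsets; the first load keeps the
// original alignment and the rest use the common alignment of base and offset.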
static constexpr int Concat[] = {
    0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
    32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
    48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63};
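
// Concat is the identity mask 0..63: shuffling two vectors with a sequential
// mask of twice their length simply concatenates them, e.g. two <32 x i8>
// inputs become one <64 x i8> result (prefixes of the mask concatenate
// shorter vectors the same way).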
258 "This function doesn't accept width smaller then 256");
// reorderSubVector returns the data to its original, pre-concatSubVector
// order: for VecElems == 16 each input maps straight through; wider inputs
// are re-blended across 128-bit lanes.
static void reorderSubVector(MVT VT,
                             SmallVectorImpl<Value *> &TransposedMatrix,
                             ArrayRef<Value *> Vec, ArrayRef<int> VPShuf,
                             unsigned VecElems, unsigned Stride,
                             IRBuilder<> &Builder) {
  if (VecElems == 16) {
    for (unsigned i = 0; i < Stride; i++)
      TransposedMatrix[i] = Builder.CreateShuffleVector(Vec[i], VPShuf);
    return;
  }

  SmallVector<int, 32> OptimizeShuf;
  Value *Temp[8];

  for (unsigned i = 0; i < (VecElems / 16) * Stride; i += 2) {
    genShuffleBland(VT, VPShuf, OptimizeShuf, (i / Stride) * 16,
                    (i + 1) / Stride * 16);
    Temp[i / 2] = Builder.CreateShuffleVector(
        Vec[i % Stride], Vec[(i + 1) % Stride], OptimizeShuf);
    OptimizeShuf.clear();
  }

  if (VecElems == 32) {
    std::copy(Temp, Temp + Stride, TransposedMatrix.begin());
    return;
  }

  for (unsigned i = 0; i < Stride; i++)
    TransposedMatrix[i] =
        Builder.CreateShuffleVector(Temp[2 * i], Temp[2 * i + 1], Concat);
}
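
// The next routine interleaves four byte vectors of eight elements each,
// e.g. four color planes c0..c7, m0..m7, y0..y7, k0..k7, producing
//   TransposedMatrix[0] = c0 m0 y0 k0 c1 m1 y1 k1 ... c3 m3 y3 k3
//   TransposedMatrix[1] = c4 m4 y4 k4 ...             c7 m7 y7 k7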
void X86InterleavedAccessGroup::interleave8bitStride4VF8(
    ArrayRef<Instruction *> Matrix,
    SmallVectorImpl<Value *> &TransposedMatrix) {
  MVT VT = MVT::v8i16;
  TransposedMatrix.resize(2);
  SmallVector<int, 16> MaskLow;
  SmallVector<int, 32> MaskLowTemp1, MaskLowWord;
  SmallVector<int, 32> MaskHighTemp1, MaskHighWord;

  // Build the byte-interleave mask 0, 8, 1, 9, ..., 7, 15.
  for (unsigned i = 0; i < 8; ++i) {
    MaskLow.push_back(i);
    MaskLow.push_back(i + 8);
  }

  createUnpackShuffleMask(VT, MaskLowTemp1, true, false);
  createUnpackShuffleMask(VT, MaskHighTemp1, false, false);
  narrowShuffleMaskElts(2, MaskLowTemp1, MaskLowWord);
  narrowShuffleMaskElts(2, MaskHighTemp1, MaskHighWord);

  // IntrVec1Low = c0 m0 c1 m1 ... c7 m7; IntrVec2Low = y0 k0 y1 k1 ... y7 k7
  Value *IntrVec1Low =
      Builder.CreateShuffleVector(Matrix[0], Matrix[1], MaskLow);
  Value *IntrVec2Low =
      Builder.CreateShuffleVector(Matrix[2], Matrix[3], MaskLow);

  TransposedMatrix[0] =
      Builder.CreateShuffleVector(IntrVec1Low, IntrVec2Low, MaskLowWord);
  TransposedMatrix[1] =
      Builder.CreateShuffleVector(IntrVec1Low, IntrVec2Low, MaskHighWord);
}
void X86InterleavedAccessGroup::interleave8bitStride4(
    ArrayRef<Instruction *> Matrix, SmallVectorImpl<Value *> &TransposedMatrix,
    unsigned NumOfElm) {
  MVT VT = MVT::getVectorVT(MVT::i8, NumOfElm);
  TransposedMatrix.resize(4);

  // vpunpcklbw/vpunpckhbw-style byte masks and vpunpckldw/vpunpckhdw-style
  // word masks are built with createUnpackShuffleMask and
  // narrowShuffleMaskElts, and the four inputs are pairwise interleaved into
  // IntrVec[0..3]. (Mask construction abridged in this listing.)
  // ...

  Value *VecOut[4];
  for (int i = 0; i < 4; i++)
    VecOut[i] = Builder.CreateShuffleVector(IntrVec[i / 2], IntrVec[i / 2 + 2],
                                            LowHighMask[i % 2]);

  // For a single 128-bit vector the unpack results are already in order.
  if (VT == MVT::v16i8) {
    std::copy(VecOut, VecOut + 4, TransposedMatrix.begin());
    return;
  }

  // Wider types are reassembled across 128-bit lanes via reorderSubVector.
  // ...
}
// createShuffleStride builds a mask that gathers every 'Stride'-th element,
// restarting within each 128-bit lane.
static void createShuffleStride(MVT VT, int Stride,
                                SmallVectorImpl<int> &Mask) {
  int VectorSize = VT.getSizeInBits();
  int VF = VT.getVectorNumElements();
  int LaneCount = std::max(VectorSize / 128, 1);
  for (int Lane = 0; Lane < LaneCount; Lane++)
    for (int i = 0, LaneSize = VF / LaneCount; i != LaneSize; ++i)
      Mask.push_back((i * Stride) % LaneSize + LaneSize * Lane);
}
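
// Worked example: for v16i8 (one 128-bit lane) and Stride = 3, LaneSize is 16
// and the mask is (i * 3) % 16 for i = 0..15:
//   {0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13}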
// setGroupSize records in 'SizeInfo' the size (number of elements) of each of
// the three groups inside a stride-3 shuffle mask within one 128-bit lane.
static void setGroupSize(MVT VT, SmallVectorImpl<int> &SizeInfo) {
  int VectorSize = VT.getSizeInBits();
  int VF = VT.getVectorNumElements() / std::max(VectorSize / 128, 1);
  for (int i = 0, FirstGroupElement = 0; i < 3; i++) {
    int GroupSize = std::ceil((VF - FirstGroupElement) / 3.0);
    SizeInfo.push_back(GroupSize);
    FirstGroupElement = ((GroupSize)*3 + FirstGroupElement) % VF;
  }
}
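
// Worked example for VF = 16: GroupSize = ceil(16/3) = 6, then
// FirstGroupElement = 18 % 16 = 2 gives GroupSize = ceil(14/3) = 5, then
// FirstGroupElement = 17 % 16 = 1 gives GroupSize = ceil(15/3) = 5; so
// SizeInfo = {6, 5, 5}, matching the group boundaries in the stride-3 mask
// {0,3,6,9,12,15 | 2,5,8,11,14 | 1,4,7,10,13} produced above.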
// DecodePALIGNRMask builds the shuffle mask of a vpalignr instruction, which
// operates per 128-bit lane. 'Imm' sets the offset amount; 'AlignDirection'
// selects left (true) or right (false) alignment, and 'Unary' reads the
// out-of-lane elements from the first source instead of the second.
static void DecodePALIGNRMask(MVT VT, unsigned Imm,
                              SmallVectorImpl<int> &ShuffleMask,
                              bool AlignDirection = true, bool Unary = false) {
  unsigned NumElts = VT.getVectorNumElements();
  unsigned NumLanes = std::max((int)VT.getSizeInBits() / 128, 1);
  unsigned NumLaneElts = NumElts / NumLanes;

  Imm = AlignDirection ? Imm : (NumLaneElts - Imm);
  unsigned Offset = Imm * (VT.getScalarSizeInBits() / 8);

  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
    for (unsigned i = 0; i != NumLaneElts; ++i) {
      unsigned Base = i + Offset;
      // If i+Offset falls outside this lane, the element actually comes from
      // the other source (or wraps within the same source if Unary).
      if (Base >= NumLaneElts)
        Base = Unary ? Base % NumLaneElts : Base + NumElts - NumLaneElts;
      ShuffleMask.push_back(Base + l);
    }
  }
}
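
// Worked example: for v16i8 (a single lane, so NumElts == NumLaneElts == 16)
// and Imm = 5, the mask is {5, 6, ..., 15, 16, 17, 18, 19, 20}: the first 11
// elements come from the first source and the rest spill into the second,
// exactly the byte rotation palignr performs.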
// concatSubVector concatenates the 128-bit loads produced by decompose() into
// the three working vectors, doubling the width at each round, so that
// lane-wise instructions such as vpalignr see the expected layout.
static void concatSubVector(Value **Vec, ArrayRef<Instruction *> InVec,
                            unsigned VecElems, IRBuilder<> &Builder) {
  if (VecElems == 16) {
    for (int i = 0; i < 3; i++)
      Vec[i] = InVec[i];
    return;
  }

  for (unsigned j = 0; j < VecElems / 32; j++)
    for (int i = 0; i < 3; i++)
      Vec[i + j * 3] = Builder.CreateShuffleVector(
          InVec[j * 6 + i], InVec[j * 6 + i + 3], ArrayRef(Concat, 32));

  if (VecElems == 32)
    return;

  for (int i = 0; i < 3; i++)
    Vec[i] = Builder.CreateShuffleVector(Vec[i], Vec[i + 3], Concat);
}
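
// Worked example: for VecElems = 64 there are twelve 128-bit input loads; the
// first round (j = 0..1) pairs them into six 256-bit vectors, and the final
// round pairs those into the three 512-bit vectors Vec[0..2].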
void X86InterleavedAccessGroup::deinterleave8bitStride3(
    ArrayRef<Instruction *> InVec, SmallVectorImpl<Value *> &TransposedMatrix,
    unsigned VecElems) {
  // Example: starting from
  //   Matrix[0] = a0 b0 c0 a1 b1 c1 a2 b2
  //   Matrix[1] = c2 a3 b3 c3 a4 b4 c4 a5
  //   Matrix[2] = b5 c5 a6 b6 c6 a7 b7 c7
  // the result separates the three interleaved sequences a, b and c.
  TransposedMatrix.resize(3);
  SmallVector<int, 32> VPShuf;
  SmallVector<int, 32> VPAlign[2];
  SmallVector<int, 32> VPAlign2;
  SmallVector<int, 32> VPAlign3;
  SmallVector<int, 3> GroupSize;
  Value *Vec[6], *TempVector[3];

  MVT VT = MVT::getVT(Shuffles[0]->getType());

  createShuffleStride(VT, 3, VPShuf);
  setGroupSize(VT, GroupSize);

  for (int i = 0; i < 2; i++)
    DecodePALIGNRMask(VT, GroupSize[2 - i], VPAlign[i], false);

  DecodePALIGNRMask(VT, GroupSize[2] + GroupSize[1], VPAlign2, true, true);
  DecodePALIGNRMask(VT, GroupSize[1], VPAlign3, true, true);

  concatSubVector(Vec, InVec, VecElems, Builder);

  // Group the elements of each vector by stride (a..a b..b c..c).
  for (int i = 0; i < 3; i++)
    Vec[i] = Builder.CreateShuffleVector(Vec[i], VPShuf);

  // Two palignr-style rounds rotate the groups into place.
  for (int i = 0; i < 3; i++)
    TempVector[i] =
        Builder.CreateShuffleVector(Vec[(i + 2) % 3], Vec[i], VPAlign[0]);

  for (int i = 0; i < 3; i++)
    Vec[i] = Builder.CreateShuffleVector(TempVector[(i + 1) % 3], TempVector[i],
                                         VPAlign[1]);

  // A final per-vector realignment fixes the middle and last rows.
  Value *TempVec = Builder.CreateShuffleVector(Vec[1], VPAlign3);
  TransposedMatrix[0] = Builder.CreateShuffleVector(Vec[0], VPAlign2);
  TransposedMatrix[1] = VecElems == 8 ? Vec[2] : TempVec;
  TransposedMatrix[2] = VecElems == 8 ? TempVec : Vec[2];
}
// group2Shuffle reorders the stride mask back into contiguous order, e.g. for
// VF = 16: {0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13} =>
//          {0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5}.
static void group2Shuffle(MVT VT, SmallVectorImpl<int> &Mask,
                          SmallVectorImpl<int> &Output) {
  int IndexGroup[3] = {0, 0, 0};
  int Index = 0;
  int VectorWidth = VT.getSizeInBits();
  int VF = VT.getVectorNumElements();
  // Find the start index of each group.
  int Lane = (VectorWidth / 128 > 0) ? VectorWidth / 128 : 1;
  for (int i = 0; i < 3; i++) {
    IndexGroup[(Index * 3) % (VF / Lane)] = Index;
    Index += Mask[i];
  }

  for (int i = 0; i < VF / Lane; i++) {
    Output.push_back(IndexGroup[i % 3]);
    IndexGroup[i % 3]++;
  }
}
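
// Tracing the example above with group sizes {6, 5, 5}: the first loop sets
// IndexGroup[0] = 0, IndexGroup[2] = 6 and IndexGroup[1] = 11, and emitting
// IndexGroup[i % 3]++ for i = 0..15 yields
// {0, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4, 15, 10, 5}.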
void X86InterleavedAccessGroup::interleave8bitStride3(
    ArrayRef<Instruction *> InVec, SmallVectorImpl<Value *> &TransposedMatrix,
    unsigned VecElems) {
  // Mirror image of deinterleave8bitStride3: starting from
  //   Matrix[0] = a0 a1 ... a7, Matrix[1] = b0 b1 ... b7,
  //   Matrix[2] = c0 c1 ... c7
  // the result is a0 b0 c0 a1 b1 c1 ... spread across the three outputs.
  TransposedMatrix.resize(3);
  SmallVector<int, 3> GroupSize;
  SmallVector<int, 32> VPShuf;
  SmallVector<int, 32> VPAlign[3];
  Value *Vec[3], *TempVector[3];

  MVT VT = MVT::getVectorVT(MVT::i8, VecElems);

  setGroupSize(VT, GroupSize);

  for (int i = 0; i < 3; i++)
    DecodePALIGNRMask(VT, GroupSize[i], VPAlign[i]);

  // Pre-rotate each input with its palignr-style mask.
  for (int i = 0; i < 3; i++)
    Vec[i] = Builder.CreateShuffleVector(InVec[i], VPAlign[i]);

  // Two merge rounds blend neighbouring vectors back together.
  for (int i = 0; i < 3; i++)
    TempVector[i] =
        Builder.CreateShuffleVector(Vec[(i + 2) % 3], Vec[i], VPAlign[2]);

  for (int i = 0; i < 3; i++)
    Vec[i] = Builder.CreateShuffleVector(TempVector[(i + 1) % 3], TempVector[i],
                                         VPAlign[1]);

  // Regroup the stride mask and restore the expected vector order.
  unsigned NumOfElm = VT.getVectorNumElements();
  group2Shuffle(VT, GroupSize, VPShuf);
  reorderSubVector(VT, TransposedMatrix, Vec, VPShuf, NumOfElm, 3, Builder);
}
void X86InterleavedAccessGroup::transpose_4x4(
    ArrayRef<Instruction *> Matrix,
    SmallVectorImpl<Value *> &TransposedMatrix) {
  assert(Matrix.size() == 4 && "Invalid matrix size");
  TransposedMatrix.resize(4);

  // dst = src1[0,1], src2[0,1]
  static constexpr int IntMask1[] = {0, 1, 4, 5};
  ArrayRef<int> Mask = ArrayRef(IntMask1, 4);
  Value *IntrVec1 = Builder.CreateShuffleVector(Matrix[0], Matrix[2], Mask);
  Value *IntrVec2 = Builder.CreateShuffleVector(Matrix[1], Matrix[3], Mask);

  // dst = src1[2,3], src2[2,3]
  static constexpr int IntMask2[] = {2, 3, 6, 7};
  Mask = ArrayRef(IntMask2, 4);
  Value *IntrVec3 = Builder.CreateShuffleVector(Matrix[0], Matrix[2], Mask);
  Value *IntrVec4 = Builder.CreateShuffleVector(Matrix[1], Matrix[3], Mask);

  // dst = src1[0], src2[0], src1[2], src2[2]
  static constexpr int IntMask3[] = {0, 4, 2, 6};
  Mask = ArrayRef(IntMask3, 4);
  TransposedMatrix[0] = Builder.CreateShuffleVector(IntrVec1, IntrVec2, Mask);
  TransposedMatrix[2] = Builder.CreateShuffleVector(IntrVec3, IntrVec4, Mask);

  // dst = src1[1], src2[1], src1[3], src2[3]
  static constexpr int IntMask4[] = {1, 5, 3, 7};
  Mask = ArrayRef(IntMask4, 4);
  TransposedMatrix[1] = Builder.CreateShuffleVector(IntrVec1, IntrVec2, Mask);
  TransposedMatrix[3] = Builder.CreateShuffleVector(IntrVec3, IntrVec4, Mask);
}
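
// Worked example: with rows P = p1 p2 p3 p4, Q = q1 q2 q3 q4, R = r1 r2 r3 r4
// and S = s1 s2 s3 s4, the two shuffle rounds above produce the columns
//   Out[0] = p1 q1 r1 s1,  Out[1] = p2 q2 r2 s2,
//   Out[2] = p3 q3 r3 s3,  Out[3] = p4 q4 r4 s4.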
bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() {
  SmallVector<Instruction *, 4> DecomposedVectors;
  SmallVector<Value *, 4> TransposedVectors;
  auto *ShuffleTy = cast<FixedVectorType>(Shuffles[0]->getType());

  if (isa<LoadInst>(Inst)) {
    auto *ShuffleEltTy = cast<FixedVectorType>(Inst->getType());
    unsigned NumSubVecElems = ShuffleEltTy->getNumElements() / Factor;
    switch (NumSubVecElems) {
    default:
      return false;
    case 4:
    case 8:
    case 16:
    case 32:
    case 64:
      if (ShuffleTy->getNumElements() != NumSubVecElems)
        return false;
      break;
    }

    // Try to generate target-sized register(/instruction)s.
    decompose(Inst, Factor, ShuffleTy, DecomposedVectors);

    // Perform matrix-transposition to compute the interleaved results.
    if (NumSubVecElems == 4)
      transpose_4x4(DecomposedVectors, TransposedVectors);
    else
      deinterleave8bitStride3(DecomposedVectors, TransposedVectors,
                              NumSubVecElems);

    // Replace the unoptimized interleaved vectors with the transposed ones.
    for (unsigned i = 0, e = Shuffles.size(); i < e; ++i)
      Shuffles[i]->replaceAllUsesWith(TransposedVectors[Indices[i]]);

    return true;
  }

  Type *ShuffleEltTy = ShuffleTy->getElementType();
  unsigned NumSubVecElems = ShuffleTy->getNumElements() / Factor;

  // Lower the interleaved stores:
  // 1. Decompose the wide shuffle into individual shuffle vectors.
  decompose(Shuffles[0], Factor,
            FixedVectorType::get(ShuffleEltTy, NumSubVecElems),
            DecomposedVectors);

  // 2. Transpose the interleaved vectors into vectors of contiguous elements.
  switch (NumSubVecElems) {
  case 4:
    transpose_4x4(DecomposedVectors, TransposedVectors);
    break;
  case 8:
    interleave8bitStride4VF8(DecomposedVectors, TransposedVectors);
    break;
  case 16:
  case 32:
  case 64:
    if (Factor == 4)
      interleave8bitStride4(DecomposedVectors, TransposedVectors,
                            NumSubVecElems);
    if (Factor == 3)
      interleave8bitStride3(DecomposedVectors, TransposedVectors,
                            NumSubVecElems);
    break;
  default:
    return false;
  }

  // 3. Concatenate the contiguous vectors back into the wide vector.
  Value *WideVec = concatenateVectors(Builder, TransposedVectors);

  // 4. Generate a store instruction for the wide vector.
  StoreInst *SI = cast<StoreInst>(Inst);
  Builder.CreateAlignedStore(WideVec, SI->getPointerOperand(), SI->getAlign());

  return true;
}
807 "Invalid interleave factor");
808 assert(!Shuffles.
empty() &&
"Empty shufflevector input");
810 "Unmatched number of shufflevectors and indices");
815 assert(!Mask && GapMask.
popcount() == Factor &&
"Unexpected mask on a load");
819 X86InterleavedAccessGroup Grp(LI, Shuffles, Indices, Factor, Subtarget,
822 return Grp.isSupported() && Grp.lowerIntoOptimizedSequence();
bool X86TargetLowering::lowerInterleavedStore(Instruction *Store,
                                              Value *LaneMask,
                                              ShuffleVectorInst *SVI,
                                              unsigned Factor,
                                              const APInt &GapMask) const {
  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
         "Invalid interleave factor");
  assert(cast<FixedVectorType>(SVI->getType())->getNumElements() % Factor ==
             0 &&
         "Invalid interleaved store");

  auto *SI = dyn_cast<StoreInst>(Store);
  if (!SI)
    return false;
  assert(!LaneMask && GapMask.popcount() == Factor &&
         "Unexpected mask on store");

  // Holds the indices of SVI that correspond to the starting index of each
  // interleaved shuffle.
  auto Mask = SVI->getShuffleMask();
  SmallVector<unsigned, 4> Indices;
  for (unsigned i = 0; i < Factor; i++)
    Indices.push_back(Mask[i]);

  ArrayRef<ShuffleVectorInst *> Shuffles = ArrayRef(SVI);

  // Create an interleaved access group and lower it if supported.
  IRBuilder<> Builder(SI);
  X86InterleavedAccessGroup Grp(SI, Shuffles, Indices, Factor, Subtarget,
                                Builder);
  return Grp.isSupported() && Grp.lowerIntoOptimizedSequence();
}
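
// For illustration: the store-side pattern this lowers is a single wide
// shufflevector that interleaves Factor vectors, feeding one wide store
// (names hypothetical):
//   %interleaved = shufflevector <8 x i64> %v01, <8 x i64> %v23,
//                  <0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15>
//   store <16 x i64> %interleaved, ptr %ptr
// The mask's first Factor elements (0, 4, 8, 12) are the starting indices
// recorded in 'Indices' above.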