1//===---------- X86.cpp - Emit LLVM Code for builtins ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This contains code to emit Builtin calls as LLVM code.
10//
11//===----------------------------------------------------------------------===//
12
13#include "CGBuiltin.h"
14#include "clang/Basic/TargetBuiltins.h"
15#include "llvm/IR/InlineAsm.h"
16#include "llvm/IR/IntrinsicsX86.h"
17#include "llvm/TargetParser/X86TargetParser.h"
18
19using namespace clang;
20using namespace CodeGen;
21using namespace llvm;
22
23static std::optional<CodeGenFunction::MSVCIntrin>
24translateX86ToMsvcIntrin(unsigned BuiltinID) {
25 using MSVCIntrin = CodeGenFunction::MSVCIntrin;
26 switch (BuiltinID) {
27 default:
28 return std::nullopt;
29 case clang::X86::BI_BitScanForward:
30 case clang::X86::BI_BitScanForward64:
31 return MSVCIntrin::_BitScanForward;
32 case clang::X86::BI_BitScanReverse:
33 case clang::X86::BI_BitScanReverse64:
34 return MSVCIntrin::_BitScanReverse;
35 case clang::X86::BI_InterlockedAnd64:
36 return MSVCIntrin::_InterlockedAnd;
37 case clang::X86::BI_InterlockedCompareExchange128:
38 return MSVCIntrin::_InterlockedCompareExchange128;
39 case clang::X86::BI_InterlockedExchange64:
40 return MSVCIntrin::_InterlockedExchange;
41 case clang::X86::BI_InterlockedExchangeAdd64:
42 return MSVCIntrin::_InterlockedExchangeAdd;
43 case clang::X86::BI_InterlockedExchangeSub64:
44 return MSVCIntrin::_InterlockedExchangeSub;
45 case clang::X86::BI_InterlockedOr64:
46 return MSVCIntrin::_InterlockedOr;
47 case clang::X86::BI_InterlockedXor64:
48 return MSVCIntrin::_InterlockedXor;
49 case clang::X86::BI_InterlockedDecrement64:
50 return MSVCIntrin::_InterlockedDecrement;
51 case clang::X86::BI_InterlockedIncrement64:
52 return MSVCIntrin::_InterlockedIncrement;
53 }
54 llvm_unreachable("must return from switch");
55}
56
57// Convert the mask from an integer type to a vector of i1.
58static Value *getMaskVecValue(CodeGenFunction &CGF, Value *Mask,
59 unsigned NumElts) {
60
61 auto *MaskTy = llvm::FixedVectorType::get(
62 CGF.Builder.getInt1Ty(),
63 cast<IntegerType>(Mask->getType())->getBitWidth());
64 Value *MaskVec = CGF.Builder.CreateBitCast(Mask, MaskTy);
65
66 // If we have less than 8 elements, then the starting mask was an i8 and
67 // we need to extract down to the right number of elements.
68 if (NumElts < 8) {
69 int Indices[4];
70 for (unsigned i = 0; i != NumElts; ++i)
71 Indices[i] = i;
72 MaskVec = CGF.Builder.CreateShuffleVector(
73 MaskVec, MaskVec, ArrayRef(Indices, NumElts), "extract");
74 }
75 return MaskVec;
76}
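// As a rough illustration (value names invented, not verbatim output): for an
// i8 mask %m and NumElts == 4 this yields approximately
//   %vec = bitcast i8 %m to <8 x i1>
//   %ext = shufflevector <8 x i1> %vec, <8 x i1> %vec,
//                        <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// while for NumElts >= 8 the bitcast alone already has the right element count.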
77
78static Value *EmitX86MaskedStore(CodeGenFunction &CGF, ArrayRef<Value *> Ops,
79 Align Alignment) {
80 Value *Ptr = Ops[0];
81
82 Value *MaskVec = getMaskVecValue(
83 CGF, Ops[2],
84 cast<llvm::FixedVectorType>(Ops[1]->getType())->getNumElements());
85
86 return CGF.Builder.CreateMaskedStore(Ops[1], Ptr, Alignment, MaskVec);
87}
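// For illustration (assuming a 512-bit float store such as
// __builtin_ia32_storeups512_mask; value names invented), the masked store
// above becomes roughly
//   call void @llvm.masked.store.v16f32.p0(<16 x float> %v, ptr %p, i32 1,
//                                          <16 x i1> %mask)
// with the alignment operand taken from the Alignment parameter.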
88
89static Value *EmitX86MaskedLoad(CodeGenFunction &CGF, ArrayRef<Value *> Ops,
90 Align Alignment) {
91 llvm::Type *Ty = Ops[1]->getType();
92 Value *Ptr = Ops[0];
93
94 Value *MaskVec = getMaskVecValue(
95 CGF, Ops[2], cast<llvm::FixedVectorType>(Ty)->getNumElements());
96
97 return CGF.Builder.CreateMaskedLoad(Ty, Ptr, Alignment, MaskVec, Ops[1]);
98}
99
100static Value *EmitX86ExpandLoad(CodeGenFunction &CGF,
101 ArrayRef<Value *> Ops) {
102 auto *ResultTy = cast<llvm::VectorType>(Ops[1]->getType());
103 Value *Ptr = Ops[0];
104
105 Value *MaskVec = getMaskVecValue(
106 CGF, Ops[2], cast<FixedVectorType>(ResultTy)->getNumElements());
107
108 llvm::Function *F = CGF.CGM.getIntrinsic(Intrinsic::masked_expandload,
109 ResultTy);
110 return CGF.Builder.CreateCall(F, { Ptr, MaskVec, Ops[1] });
111}
112
113static Value *EmitX86CompressExpand(CodeGenFunction &CGF,
114 ArrayRef<Value *> Ops,
115 bool IsCompress) {
116 auto *ResultTy = cast<llvm::FixedVectorType>(Ops[1]->getType());
117
118 Value *MaskVec = getMaskVecValue(CGF, Ops[2], ResultTy->getNumElements());
119
120 Intrinsic::ID IID = IsCompress ? Intrinsic::x86_avx512_mask_compress
121 : Intrinsic::x86_avx512_mask_expand;
122 llvm::Function *F = CGF.CGM.getIntrinsic(IID, ResultTy);
123 return CGF.Builder.CreateCall(F, { Ops[0], Ops[1], MaskVec });
124}
125
126static Value *EmitX86CompressStore(CodeGenFunction &CGF,
127 ArrayRef<Value *> Ops) {
128 auto *ResultTy = cast<llvm::FixedVectorType>(Ops[1]->getType());
129 Value *Ptr = Ops[0];
130
131 Value *MaskVec = getMaskVecValue(CGF, Ops[2], ResultTy->getNumElements());
132
133 llvm::Function *F = CGF.CGM.getIntrinsic(Intrinsic::masked_compressstore,
134 ResultTy);
135 return CGF.Builder.CreateCall(F, { Ops[1], Ptr, MaskVec });
136}
137
138static Value *EmitX86MaskLogic(CodeGenFunction &CGF, Instruction::BinaryOps Opc,
139 ArrayRef<Value *> Ops,
140 bool InvertLHS = false) {
141 unsigned NumElts = Ops[0]->getType()->getIntegerBitWidth();
142 Value *LHS = getMaskVecValue(CGF, Ops[0], NumElts);
143 Value *RHS = getMaskVecValue(CGF, Ops[1], NumElts);
144
145 if (InvertLHS)
146 LHS = CGF.Builder.CreateNot(LHS);
147
148 return CGF.Builder.CreateBitCast(CGF.Builder.CreateBinOp(Opc, LHS, RHS),
149 Ops[0]->getType());
150}
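// Sketch of one use (a kandn-style 16-bit mask builtin, i.e. Opc == And with
// InvertLHS == true; value names invented):
//   %n = xor <16 x i1> %lhs, splat (i1 true)   ; the CreateNot
//   %a = and <16 x i1> %n, %rhs
//   %r = bitcast <16 x i1> %a to i16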
151
152static Value *EmitX86FunnelShift(CodeGenFunction &CGF, Value *Op0, Value *Op1,
153 Value *Amt, bool IsRight) {
154 llvm::Type *Ty = Op0->getType();
155
156 // Amount may be a scalar immediate, in which case create a splat vector.
157 // Funnel shift amounts are treated as modulo and types are all power-of-2, so
158 // we only care about the lowest log2 bits anyway.
159 if (Amt->getType() != Ty) {
160 unsigned NumElts = cast<llvm::FixedVectorType>(Ty)->getNumElements();
161 Amt = CGF.Builder.CreateIntCast(Amt, Ty->getScalarType(), false);
162 Amt = CGF.Builder.CreateVectorSplat(NumElts, Amt);
163 }
164
165 unsigned IID = IsRight ? Intrinsic::fshr : Intrinsic::fshl;
166 Function *F = CGF.CGM.getIntrinsic(IID, Ty);
167 return CGF.Builder.CreateCall(F, {Op0, Op1, Amt});
168}
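// Sketch: a rotate builtin routed here passes Op0 == Op1, so e.g. a left
// rotate of a <4 x i32> by the scalar immediate 5 (names invented) becomes
// approximately
//   call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %x,
//                                   <4 x i32> splat (i32 5))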
169
170static Value *EmitX86vpcom(CodeGenFunction &CGF, ArrayRef<Value *> Ops,
171 bool IsSigned) {
172 Value *Op0 = Ops[0];
173 Value *Op1 = Ops[1];
174 llvm::Type *Ty = Op0->getType();
175 uint64_t Imm = cast<llvm::ConstantInt>(Ops[2])->getZExtValue() & 0x7;
176
177 CmpInst::Predicate Pred;
178 switch (Imm) {
179 case 0x0:
180 Pred = IsSigned ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT;
181 break;
182 case 0x1:
183 Pred = IsSigned ? ICmpInst::ICMP_SLE : ICmpInst::ICMP_ULE;
184 break;
185 case 0x2:
186 Pred = IsSigned ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT;
187 break;
188 case 0x3:
189 Pred = IsSigned ? ICmpInst::ICMP_SGE : ICmpInst::ICMP_UGE;
190 break;
191 case 0x4:
192 Pred = ICmpInst::ICMP_EQ;
193 break;
194 case 0x5:
195 Pred = ICmpInst::ICMP_NE;
196 break;
197 case 0x6:
198 return llvm::Constant::getNullValue(Ty); // FALSE
199 case 0x7:
200 return llvm::Constant::getAllOnesValue(Ty); // TRUE
201 default:
202 llvm_unreachable("Unexpected XOP vpcom/vpcomu predicate");
203 }
204
205 Value *Cmp = CGF.Builder.CreateICmp(Pred, Op0, Op1);
206 Value *Res = CGF.Builder.CreateSExt(Cmp, Ty);
207 return Res;
208}
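// Sketch (names invented): immediate 0x2 with IsSigned == true ("greater
// than") on <4 x i32> operands becomes roughly
//   %c = icmp sgt <4 x i32> %a, %b
//   %r = sext <4 x i1> %c to <4 x i32>
// and immediates 0x6 / 0x7 fold directly to all-zeros / all-ones constants.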
209
210static Value *EmitX86Select(CodeGenFunction &CGF,
211 Value *Mask, Value *Op0, Value *Op1) {
212
213 // If the mask is all ones, just return the first argument.
214 if (const auto *C = dyn_cast<Constant>(Mask))
215 if (C->isAllOnesValue())
216 return Op0;
217
218 Mask = getMaskVecValue(
219 CGF, Mask, cast<llvm::FixedVectorType>(Op0->getType())->getNumElements());
220
221 return CGF.Builder.CreateSelect(Mask, Op0, Op1);
222}
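// Sketch (names invented): with an i8 mask over an <8 x double> result this is
// roughly
//   %mv = bitcast i8 %m to <8 x i1>
//   %r  = select <8 x i1> %mv, <8 x double> %res, <8 x double> %passthru
// and a compile-time all-ones mask skips the select entirely.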
223
224static Value *EmitX86ScalarSelect(CodeGenFunction &CGF,
225 Value *Mask, Value *Op0, Value *Op1) {
226 // If the mask is all ones, just return the first argument.
227 if (const auto *C = dyn_cast<Constant>(Mask))
228 if (C->isAllOnesValue())
229 return Op0;
230
231 auto *MaskTy = llvm::FixedVectorType::get(
232 CGF.Builder.getInt1Ty(), Mask->getType()->getIntegerBitWidth());
233 Mask = CGF.Builder.CreateBitCast(Mask, MaskTy);
234 Mask = CGF.Builder.CreateExtractElement(Mask, (uint64_t)0);
235 return CGF.Builder.CreateSelect(Mask, Op0, Op1);
236}
237
238static Value *EmitX86MaskedCompareResult(CodeGenFunction &CGF, Value *Cmp,
239 unsigned NumElts, Value *MaskIn) {
240 if (MaskIn) {
241 const auto *C = dyn_cast<Constant>(MaskIn);
242 if (!C || !C->isAllOnesValue())
243 Cmp = CGF.Builder.CreateAnd(Cmp, getMaskVecValue(CGF, MaskIn, NumElts));
244 }
245
246 if (NumElts < 8) {
247 int Indices[8];
248 for (unsigned i = 0; i != NumElts; ++i)
249 Indices[i] = i;
250 for (unsigned i = NumElts; i != 8; ++i)
251 Indices[i] = i % NumElts + NumElts;
252 Cmp = CGF.Builder.CreateShuffleVector(
253 Cmp, llvm::Constant::getNullValue(Cmp->getType()), Indices);
254 }
255
256 return CGF.Builder.CreateBitCast(Cmp,
257 IntegerType::get(CGF.getLLVMContext(),
258 std::max(NumElts, 8U)));
259}
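// Sketch (names invented): for NumElts == 4 the <4 x i1> compare is padded
// with zero elements before the final bitcast, roughly
//   %pad = shufflevector <4 x i1> %cmp, <4 x i1> zeroinitializer,
//                        <8 x i32> <i32 0, i32 1, i32 2, i32 3,
//                                   i32 4, i32 5, i32 6, i32 7>
//   %res = bitcast <8 x i1> %pad to i8
// so the unused high bits of the returned mask read as zero.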
260
261static Value *EmitX86MaskedCompare(CodeGenFunction &CGF, unsigned CC,
262 bool Signed, ArrayRef<Value *> Ops) {
263 assert((Ops.size() == 2 || Ops.size() == 4) &&
264 "Unexpected number of arguments");
265 unsigned NumElts =
266 cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements();
267 Value *Cmp;
268
269 if (CC == 3) {
270 Cmp = Constant::getNullValue(
271 llvm::FixedVectorType::get(CGF.Builder.getInt1Ty(), NumElts));
272 } else if (CC == 7) {
273 Cmp = Constant::getAllOnesValue(
274 llvm::FixedVectorType::get(CGF.Builder.getInt1Ty(), NumElts));
275 } else {
276 ICmpInst::Predicate Pred;
277 switch (CC) {
278 default: llvm_unreachable("Unknown condition code");
279 case 0: Pred = ICmpInst::ICMP_EQ; break;
280 case 1: Pred = Signed ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT; break;
281 case 2: Pred = Signed ? ICmpInst::ICMP_SLE : ICmpInst::ICMP_ULE; break;
282 case 4: Pred = ICmpInst::ICMP_NE; break;
283 case 5: Pred = Signed ? ICmpInst::ICMP_SGE : ICmpInst::ICMP_UGE; break;
284 case 6: Pred = Signed ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT; break;
285 }
286 Cmp = CGF.Builder.CreateICmp(Pred, Ops[0], Ops[1]);
287 }
288
289 Value *MaskIn = nullptr;
290 if (Ops.size() == 4)
291 MaskIn = Ops[3];
292
293 return EmitX86MaskedCompareResult(CGF, Cmp, NumElts, MaskIn);
294}
295
296static Value *EmitX86ConvertToMask(CodeGenFunction &CGF, Value *In) {
297 Value *Zero = Constant::getNullValue(In->getType());
298 return EmitX86MaskedCompare(CGF, 1, true, { In, Zero });
299}
300
301static Value *EmitX86ConvertIntToFp(CodeGenFunction &CGF, const CallExpr *E,
302 ArrayRef<Value *> Ops, bool IsSigned) {
303 unsigned Rnd = cast<llvm::ConstantInt>(Ops[3])->getZExtValue();
304 llvm::Type *Ty = Ops[1]->getType();
305
306 Value *Res;
307 if (Rnd != 4) {
308 Intrinsic::ID IID = IsSigned ? Intrinsic::x86_avx512_sitofp_round
309 : Intrinsic::x86_avx512_uitofp_round;
310 Function *F = CGF.CGM.getIntrinsic(IID, { Ty, Ops[0]->getType() });
311 Res = CGF.Builder.CreateCall(F, { Ops[0], Ops[3] });
312 } else {
313 CodeGenFunction::CGFPOptionsRAII FPOptsRAII(CGF, E);
314 Res = IsSigned ? CGF.Builder.CreateSIToFP(Ops[0], Ty)
315 : CGF.Builder.CreateUIToFP(Ops[0], Ty);
316 }
317
318 return EmitX86Select(CGF, Ops[2], Res, Ops[1]);
319}
320
321// Lowers X86 FMA intrinsics to IR.
322static Value *EmitX86FMAExpr(CodeGenFunction &CGF, const CallExpr *E,
323 ArrayRef<Value *> Ops, unsigned BuiltinID,
324 bool IsAddSub) {
325
326 bool Subtract = false;
327 Intrinsic::ID IID = Intrinsic::not_intrinsic;
328 switch (BuiltinID) {
329 default: break;
330 case clang::X86::BI__builtin_ia32_vfmsubph512_mask3:
331 Subtract = true;
332 [[fallthrough]];
333 case clang::X86::BI__builtin_ia32_vfmaddph512_mask:
334 case clang::X86::BI__builtin_ia32_vfmaddph512_maskz:
335 case clang::X86::BI__builtin_ia32_vfmaddph512_mask3:
336 IID = Intrinsic::x86_avx512fp16_vfmadd_ph_512;
337 break;
338 case clang::X86::BI__builtin_ia32_vfmsubaddph512_mask3:
339 Subtract = true;
340 [[fallthrough]];
341 case clang::X86::BI__builtin_ia32_vfmaddsubph512_mask:
342 case clang::X86::BI__builtin_ia32_vfmaddsubph512_maskz:
343 case clang::X86::BI__builtin_ia32_vfmaddsubph512_mask3:
344 IID = Intrinsic::x86_avx512fp16_vfmaddsub_ph_512;
345 break;
346 case clang::X86::BI__builtin_ia32_vfmsubps512_mask3:
347 Subtract = true;
348 [[fallthrough]];
349 case clang::X86::BI__builtin_ia32_vfmaddps512_mask:
350 case clang::X86::BI__builtin_ia32_vfmaddps512_maskz:
351 case clang::X86::BI__builtin_ia32_vfmaddps512_mask3:
352 IID = Intrinsic::x86_avx512_vfmadd_ps_512; break;
353 case clang::X86::BI__builtin_ia32_vfmsubpd512_mask3:
354 Subtract = true;
355 [[fallthrough]];
356 case clang::X86::BI__builtin_ia32_vfmaddpd512_mask:
357 case clang::X86::BI__builtin_ia32_vfmaddpd512_maskz:
358 case clang::X86::BI__builtin_ia32_vfmaddpd512_mask3:
359 IID = Intrinsic::x86_avx512_vfmadd_pd_512; break;
360 case clang::X86::BI__builtin_ia32_vfmsubaddps512_mask3:
361 Subtract = true;
362 [[fallthrough]];
363 case clang::X86::BI__builtin_ia32_vfmaddsubps512_mask:
364 case clang::X86::BI__builtin_ia32_vfmaddsubps512_maskz:
365 case clang::X86::BI__builtin_ia32_vfmaddsubps512_mask3:
366 IID = Intrinsic::x86_avx512_vfmaddsub_ps_512;
367 break;
368 case clang::X86::BI__builtin_ia32_vfmsubaddpd512_mask3:
369 Subtract = true;
370 [[fallthrough]];
371 case clang::X86::BI__builtin_ia32_vfmaddsubpd512_mask:
372 case clang::X86::BI__builtin_ia32_vfmaddsubpd512_maskz:
373 case clang::X86::BI__builtin_ia32_vfmaddsubpd512_mask3:
374 IID = Intrinsic::x86_avx512_vfmaddsub_pd_512;
375 break;
376 }
377
378 Value *A = Ops[0];
379 Value *B = Ops[1];
380 Value *C = Ops[2];
381
382 if (Subtract)
383 C = CGF.Builder.CreateFNeg(C);
384
385 Value *Res;
386
387 // Only handle in case of _MM_FROUND_CUR_DIRECTION/4 (no rounding).
388 if (IID != Intrinsic::not_intrinsic &&
389 (cast<llvm::ConstantInt>(Ops.back())->getZExtValue() != (uint64_t)4 ||
390 IsAddSub)) {
391 Function *Intr = CGF.CGM.getIntrinsic(IID);
392 Res = CGF.Builder.CreateCall(Intr, {A, B, C, Ops.back() });
393 } else {
394 llvm::Type *Ty = A->getType();
395 Function *FMA;
396 if (CGF.Builder.getIsFPConstrained()) {
397 CodeGenFunction::CGFPOptionsRAII FPOptsRAII(CGF, E);
398 FMA = CGF.CGM.getIntrinsic(Intrinsic::experimental_constrained_fma, Ty);
399 Res = CGF.Builder.CreateConstrainedFPCall(FMA, {A, B, C});
400 } else {
401 FMA = CGF.CGM.getIntrinsic(Intrinsic::fma, Ty);
402 Res = CGF.Builder.CreateCall(FMA, {A, B, C});
403 }
404 }
405
406 // Handle any required masking.
407 Value *MaskFalseVal = nullptr;
408 switch (BuiltinID) {
409 case clang::X86::BI__builtin_ia32_vfmaddph512_mask:
410 case clang::X86::BI__builtin_ia32_vfmaddps512_mask:
411 case clang::X86::BI__builtin_ia32_vfmaddpd512_mask:
412 case clang::X86::BI__builtin_ia32_vfmaddsubph512_mask:
413 case clang::X86::BI__builtin_ia32_vfmaddsubps512_mask:
414 case clang::X86::BI__builtin_ia32_vfmaddsubpd512_mask:
415 MaskFalseVal = Ops[0];
416 break;
417 case clang::X86::BI__builtin_ia32_vfmaddph512_maskz:
418 case clang::X86::BI__builtin_ia32_vfmaddps512_maskz:
419 case clang::X86::BI__builtin_ia32_vfmaddpd512_maskz:
420 case clang::X86::BI__builtin_ia32_vfmaddsubph512_maskz:
421 case clang::X86::BI__builtin_ia32_vfmaddsubps512_maskz:
422 case clang::X86::BI__builtin_ia32_vfmaddsubpd512_maskz:
423 MaskFalseVal = Constant::getNullValue(Ops[0]->getType());
424 break;
425 case clang::X86::BI__builtin_ia32_vfmsubph512_mask3:
426 case clang::X86::BI__builtin_ia32_vfmaddph512_mask3:
427 case clang::X86::BI__builtin_ia32_vfmsubps512_mask3:
428 case clang::X86::BI__builtin_ia32_vfmaddps512_mask3:
429 case clang::X86::BI__builtin_ia32_vfmsubpd512_mask3:
430 case clang::X86::BI__builtin_ia32_vfmaddpd512_mask3:
431 case clang::X86::BI__builtin_ia32_vfmsubaddph512_mask3:
432 case clang::X86::BI__builtin_ia32_vfmaddsubph512_mask3:
433 case clang::X86::BI__builtin_ia32_vfmsubaddps512_mask3:
434 case clang::X86::BI__builtin_ia32_vfmaddsubps512_mask3:
435 case clang::X86::BI__builtin_ia32_vfmsubaddpd512_mask3:
436 case clang::X86::BI__builtin_ia32_vfmaddsubpd512_mask3:
437 MaskFalseVal = Ops[2];
438 break;
439 }
440
441 if (MaskFalseVal)
442 return EmitX86Select(CGF, Ops[3], Res, MaskFalseVal);
443
444 return Res;
445}
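// Sketch (names invented): __builtin_ia32_vfmaddps512_mask(a, b, c, m, 4),
// i.e. the default _MM_FROUND_CUR_DIRECTION rounding, becomes roughly
//   %r = call <16 x float> @llvm.fma.v16f32(<16 x float> %a, <16 x float> %b,
//                                           <16 x float> %c)
//   %s = select <16 x i1> %m, <16 x float> %r, <16 x float> %a
// while any other rounding immediate keeps the target-specific
// @llvm.x86.avx512.vfmadd.ps.512 call.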
446
447static Value *EmitScalarFMAExpr(CodeGenFunction &CGF, const CallExpr *E,
448 MutableArrayRef<Value *> Ops, Value *Upper,
449 bool ZeroMask = false, unsigned PTIdx = 0,
450 bool NegAcc = false) {
451 unsigned Rnd = 4;
452 if (Ops.size() > 4)
453 Rnd = cast<llvm::ConstantInt>(Ops[4])->getZExtValue();
454
455 if (NegAcc)
456 Ops[2] = CGF.Builder.CreateFNeg(Ops[2]);
457
458 Ops[0] = CGF.Builder.CreateExtractElement(Ops[0], (uint64_t)0);
459 Ops[1] = CGF.Builder.CreateExtractElement(Ops[1], (uint64_t)0);
460 Ops[2] = CGF.Builder.CreateExtractElement(Ops[2], (uint64_t)0);
461 Value *Res;
462 if (Rnd != 4) {
463 Intrinsic::ID IID;
464
465 switch (Ops[0]->getType()->getPrimitiveSizeInBits()) {
466 case 16:
467 IID = Intrinsic::x86_avx512fp16_vfmadd_f16;
468 break;
469 case 32:
470 IID = Intrinsic::x86_avx512_vfmadd_f32;
471 break;
472 case 64:
473 IID = Intrinsic::x86_avx512_vfmadd_f64;
474 break;
475 default:
476 llvm_unreachable("Unexpected size");
477 }
478 Res = CGF.Builder.CreateCall(CGF.CGM.getIntrinsic(IID),
479 {Ops[0], Ops[1], Ops[2], Ops[4]});
480 } else if (CGF.Builder.getIsFPConstrained()) {
481 CodeGenFunction::CGFPOptionsRAII FPOptsRAII(CGF, E);
482 Function *FMA = CGF.CGM.getIntrinsic(
483 Intrinsic::experimental_constrained_fma, Ops[0]->getType());
484 Res = CGF.Builder.CreateConstrainedFPCall(FMA, Ops.slice(0, 3));
485 } else {
486 Function *FMA = CGF.CGM.getIntrinsic(Intrinsic::fma, Ops[0]->getType());
487 Res = CGF.Builder.CreateCall(FMA, Ops.slice(0, 3));
488 }
489 // If we have more than 3 arguments, we need to do masking.
490 if (Ops.size() > 3) {
491 Value *PassThru = ZeroMask ? Constant::getNullValue(Res->getType())
492 : Ops[PTIdx];
493
494 // If we negated the accumulator and it's the PassThru value, we need to
495 // bypass the negate. Conveniently Upper should be the same thing in this
496 // case.
497 if (NegAcc && PTIdx == 2)
498 PassThru = CGF.Builder.CreateExtractElement(Upper, (uint64_t)0);
499
500 Res = EmitX86ScalarSelect(CGF, Ops[3], Res, PassThru);
501 }
502 return CGF.Builder.CreateInsertElement(Upper, Res, (uint64_t)0);
503}
504
505static Value *EmitX86Muldq(CodeGenFunction &CGF, bool IsSigned,
506 ArrayRef<Value *> Ops) {
507 llvm::Type *Ty = Ops[0]->getType();
508 // Arguments have a vXi32 type so cast to vXi64.
509 Ty = llvm::FixedVectorType::get(CGF.Int64Ty,
510 Ty->getPrimitiveSizeInBits() / 64);
511 Value *LHS = CGF.Builder.CreateBitCast(Ops[0], Ty);
512 Value *RHS = CGF.Builder.CreateBitCast(Ops[1], Ty);
513
514 if (IsSigned) {
515 // Shift left then arithmetic shift right.
516 Constant *ShiftAmt = ConstantInt::get(Ty, 32);
517 LHS = CGF.Builder.CreateShl(LHS, ShiftAmt);
518 LHS = CGF.Builder.CreateAShr(LHS, ShiftAmt);
519 RHS = CGF.Builder.CreateShl(RHS, ShiftAmt);
520 RHS = CGF.Builder.CreateAShr(RHS, ShiftAmt);
521 } else {
522 // Clear the upper bits.
523 Constant *Mask = ConstantInt::get(Ty, 0xffffffff);
524 LHS = CGF.Builder.CreateAnd(LHS, Mask);
525 RHS = CGF.Builder.CreateAnd(RHS, Mask);
526 }
527
528 return CGF.Builder.CreateMul(LHS, RHS);
529}
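// Sketch of the signed (pmuldq-style) form on inputs reinterpreted as
// <2 x i64> (names invented):
//   %sa = shl <2 x i64> %a, splat (i64 32)
//   %la = ashr <2 x i64> %sa, splat (i64 32)    ; sign-extend the low 32 bits
//   ... likewise for %b ...
//   %p  = mul <2 x i64> %la, %lb
// The unsigned form masks each lane with 0xffffffff instead of shifting.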
530
531// Emit a masked pternlog intrinsic. This only exists because the header has to
532// use a macro and we aren't able to pass the input argument to a pternlog
533// builtin and a select builtin without evaluating it twice.
534static Value *EmitX86Ternlog(CodeGenFunction &CGF, bool ZeroMask,
535 ArrayRef<Value *> Ops) {
536 llvm::Type *Ty = Ops[0]->getType();
537
538 unsigned VecWidth = Ty->getPrimitiveSizeInBits();
539 unsigned EltWidth = Ty->getScalarSizeInBits();
540 Intrinsic::ID IID;
541 if (VecWidth == 128 && EltWidth == 32)
542 IID = Intrinsic::x86_avx512_pternlog_d_128;
543 else if (VecWidth == 256 && EltWidth == 32)
544 IID = Intrinsic::x86_avx512_pternlog_d_256;
545 else if (VecWidth == 512 && EltWidth == 32)
546 IID = Intrinsic::x86_avx512_pternlog_d_512;
547 else if (VecWidth == 128 && EltWidth == 64)
548 IID = Intrinsic::x86_avx512_pternlog_q_128;
549 else if (VecWidth == 256 && EltWidth == 64)
550 IID = Intrinsic::x86_avx512_pternlog_q_256;
551 else if (VecWidth == 512 && EltWidth == 64)
552 IID = Intrinsic::x86_avx512_pternlog_q_512;
553 else
554 llvm_unreachable("Unexpected intrinsic");
555
556 Value *Ternlog = CGF.Builder.CreateCall(CGF.CGM.getIntrinsic(IID),
557 Ops.drop_back());
558 Value *PassThru = ZeroMask ? ConstantAggregateZero::get(Ty) : Ops[0];
559 return EmitX86Select(CGF, Ops[4], Ternlog, PassThru);
560}
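// Sketch (names invented) for the 512-bit, 32-bit-element form:
//   %t = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %a,
//                         <16 x i32> %b, <16 x i32> %c, i32 imm)
//   %r = select <16 x i1> %mask, <16 x i32> %t, <16 x i32> %passthru
// where the pass-through is %a for merge masking and zero when ZeroMask is set.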
561
562static Value *EmitX86SExtMask(CodeGenFunction &CGF, Value *Op,
563 llvm::Type *DstTy) {
564 unsigned NumberOfElements =
565 cast<llvm::FixedVectorType>(DstTy)->getNumElements();
566 Value *Mask = getMaskVecValue(CGF, Op, NumberOfElements);
567 return CGF.Builder.CreateSExt(Mask, DstTy, "vpmovm2");
568}
569
570Value *CodeGenFunction::EmitX86CpuIs(const CallExpr *E) {
571 const Expr *CPUExpr = E->getArg(0)->IgnoreParenCasts();
572 StringRef CPUStr = cast<clang::StringLiteral>(CPUExpr)->getString();
573 return EmitX86CpuIs(CPUStr);
574}
575
576// Convert F16 halves to floats.
577static Value *EmitX86CvtF16ToFloatExpr(CodeGenFunction &CGF,
578 ArrayRef<Value *> Ops,
579 llvm::Type *DstTy) {
580 assert((Ops.size() == 1 || Ops.size() == 3 || Ops.size() == 4) &&
581 "Unknown cvtph2ps intrinsic");
582
583 // If the SAE intrinsic doesn't use default rounding then we can't upgrade.
584 if (Ops.size() == 4 && cast<llvm::ConstantInt>(Ops[3])->getZExtValue() != 4) {
585 Function *F =
586 CGF.CGM.getIntrinsic(Intrinsic::x86_avx512_mask_vcvtph2ps_512);
587 return CGF.Builder.CreateCall(F, {Ops[0], Ops[1], Ops[2], Ops[3]});
588 }
589
590 unsigned NumDstElts = cast<llvm::FixedVectorType>(DstTy)->getNumElements();
591 Value *Src = Ops[0];
592
593 // Extract the subvector.
594 if (NumDstElts !=
595 cast<llvm::FixedVectorType>(Src->getType())->getNumElements()) {
596 assert(NumDstElts == 4 && "Unexpected vector size");
597 Src = CGF.Builder.CreateShuffleVector(Src, {0, 1, 2, 3});
598 }
599
600 // Bitcast from vXi16 to vXf16.
601 auto *HalfTy = llvm::FixedVectorType::get(
602 llvm::Type::getHalfTy(CGF.getLLVMContext()), NumDstElts);
603 Src = CGF.Builder.CreateBitCast(Src, HalfTy);
604
605 // Perform the fp-extension.
606 Value *Res = CGF.Builder.CreateFPExt(Src, DstTy, "cvtph2ps");
607
608 if (Ops.size() >= 3)
609 Res = EmitX86Select(CGF, Ops[2], Res, Ops[1]);
610 return Res;
611}
612
613Value *CodeGenFunction::EmitX86CpuIs(StringRef CPUStr) {
614
615 llvm::Type *Int32Ty = Builder.getInt32Ty();
616
617 // Matching the struct layout from the compiler-rt/libgcc structure that is
618 // filled in:
619 // unsigned int __cpu_vendor;
620 // unsigned int __cpu_type;
621 // unsigned int __cpu_subtype;
622 // unsigned int __cpu_features[1];
623 llvm::Type *STy = llvm::StructType::get(Int32Ty, Int32Ty, Int32Ty,
624 llvm::ArrayType::get(Int32Ty, 1));
625
626 // Grab the global __cpu_model.
627 llvm::Constant *CpuModel = CGM.CreateRuntimeVariable(STy, "__cpu_model");
628 cast<llvm::GlobalValue>(CpuModel)->setDSOLocal(true);
629
630 // Calculate the index needed to access the correct field based on the
631 // range. Also adjust the expected value.
632 auto [Index, Value] = StringSwitch<std::pair<unsigned, unsigned>>(CPUStr)
633#define X86_VENDOR(ENUM, STRING) \
634 .Case(STRING, {0u, static_cast<unsigned>(llvm::X86::ENUM)})
635#define X86_CPU_TYPE_ALIAS(ENUM, ALIAS) \
636 .Case(ALIAS, {1u, static_cast<unsigned>(llvm::X86::ENUM)})
637#define X86_CPU_TYPE(ENUM, STR) \
638 .Case(STR, {1u, static_cast<unsigned>(llvm::X86::ENUM)})
639#define X86_CPU_SUBTYPE_ALIAS(ENUM, ALIAS) \
640 .Case(ALIAS, {2u, static_cast<unsigned>(llvm::X86::ENUM)})
641#define X86_CPU_SUBTYPE(ENUM, STR) \
642 .Case(STR, {2u, static_cast<unsigned>(llvm::X86::ENUM)})
643#include "llvm/TargetParser/X86TargetParser.def"
644 .Default({0, 0});
645 assert(Value != 0 && "Invalid CPUStr passed to CpuIs");
646
647 // Grab the appropriate field from __cpu_model.
648 llvm::Value *Idxs[] = {ConstantInt::get(Int32Ty, 0),
649 ConstantInt::get(Int32Ty, Index)};
650 llvm::Value *CpuValue = Builder.CreateInBoundsGEP(STy, CpuModel, Idxs);
651 CpuValue = Builder.CreateAlignedLoad(Int32Ty, CpuValue,
652 CharUnits::fromQuantity(4));
653
654 // Check the value of the field against the requested value.
655 return Builder.CreateICmpEQ(CpuValue,
656 llvm::ConstantInt::get(Int32Ty, Value));
657}
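// Sketch (names invented): __builtin_cpu_is("intel") selects Index == 0
// (__cpu_vendor) and compares the loaded field against llvm::X86::VENDOR_INTEL,
// roughly
//   %p = getelementptr inbounds { i32, i32, i32, [1 x i32] }, ptr @__cpu_model,
//                               i32 0, i32 0
//   %f = load i32, ptr %p, align 4
//   %c = icmp eq i32 %f, <VENDOR_INTEL>
// CPU type and subtype strings select fields 1 and 2 of the struct instead.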
658
659Value *CodeGenFunction::EmitX86CpuSupports(const CallExpr *E) {
660 const Expr *FeatureExpr = E->getArg(0)->IgnoreParenCasts();
661 StringRef FeatureStr = cast<StringLiteral>(FeatureExpr)->getString();
662 if (!getContext().getTargetInfo().validateCpuSupports(FeatureStr))
663 return Builder.getFalse();
664 return EmitX86CpuSupports(FeatureStr);
665}
666
667Value *CodeGenFunction::EmitX86CpuSupports(ArrayRef<StringRef> FeatureStrs) {
668 return EmitX86CpuSupports(llvm::X86::getCpuSupportsMask(FeatureStrs));
669}
670
671llvm::Value *
672CodeGenFunction::EmitX86CpuSupports(std::array<uint32_t, 4> FeatureMask) {
673 Value *Result = Builder.getTrue();
674 if (FeatureMask[0] != 0) {
675 // Matching the struct layout from the compiler-rt/libgcc structure that is
676 // filled in:
677 // unsigned int __cpu_vendor;
678 // unsigned int __cpu_type;
679 // unsigned int __cpu_subtype;
680 // unsigned int __cpu_features[1];
681 llvm::Type *STy = llvm::StructType::get(Int32Ty, Int32Ty, Int32Ty,
682 llvm::ArrayType::get(Int32Ty, 1));
683
684 // Grab the global __cpu_model.
685 llvm::Constant *CpuModel = CGM.CreateRuntimeVariable(STy, "__cpu_model");
686 cast<llvm::GlobalValue>(CpuModel)->setDSOLocal(true);
687
688 // Grab the first (0th) element from the field __cpu_features off of the
689 // global in the struct STy.
690 Value *Idxs[] = {Builder.getInt32(0), Builder.getInt32(3),
691 Builder.getInt32(0)};
692 Value *CpuFeatures = Builder.CreateInBoundsGEP(STy, CpuModel, Idxs);
693 Value *Features = Builder.CreateAlignedLoad(Int32Ty, CpuFeatures,
694 CharUnits::fromQuantity(4));
695
696 // Check the value of the bit corresponding to the feature requested.
697 Value *Mask = Builder.getInt32(FeatureMask[0]);
698 Value *Bitset = Builder.CreateAnd(Features, Mask);
699 Value *Cmp = Builder.CreateICmpEQ(Bitset, Mask);
700 Result = Builder.CreateAnd(Result, Cmp);
701 }
702
703 llvm::Type *ATy = llvm::ArrayType::get(Int32Ty, 3);
704 llvm::Constant *CpuFeatures2 =
705 CGM.CreateRuntimeVariable(ATy, "__cpu_features2");
706 cast<llvm::GlobalValue>(CpuFeatures2)->setDSOLocal(true);
707 for (int i = 1; i != 4; ++i) {
708 const uint32_t M = FeatureMask[i];
709 if (!M)
710 continue;
711 Value *Idxs[] = {Builder.getInt32(0), Builder.getInt32(i - 1)};
712 Value *Features = Builder.CreateAlignedLoad(
713 Int32Ty, Builder.CreateInBoundsGEP(ATy, CpuFeatures2, Idxs),
714 CharUnits::fromQuantity(4));
715 // Check the value of the bit corresponding to the feature requested.
716 Value *Mask = Builder.getInt32(M);
717 Value *Bitset = Builder.CreateAnd(Features, Mask);
718 Value *Cmp = Builder.CreateICmpEQ(Bitset, Mask);
719 Result = Builder.CreateAnd(Result, Cmp);
720 }
721
722 return Result;
723}
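// Sketch (names invented): a feature whose bit lives in the first mask word is
// tested roughly as
//   %f  = load i32 from __cpu_model.__cpu_features[0]
//   %a  = and i32 %f, <feature mask>
//   %ok = icmp eq i32 %a, <feature mask>
// while bits beyond the first word come from the separate __cpu_features2
// array, one word per iteration of the loop above.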
724
725Value *CodeGenFunction::EmitX86CpuInit() {
726 llvm::FunctionType *FTy = llvm::FunctionType::get(VoidTy,
727 /*Variadic*/ false);
728 llvm::FunctionCallee Func =
729 CGM.CreateRuntimeFunction(FTy, "__cpu_indicator_init");
730 cast<llvm::GlobalValue>(Func.getCallee())->setDSOLocal(true);
731 cast<llvm::GlobalValue>(Func.getCallee())
732 ->setDLLStorageClass(llvm::GlobalValue::DefaultStorageClass);
733 return Builder.CreateCall(Func);
734}
735
736
737Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
738 const CallExpr *E) {
739 if (BuiltinID == Builtin::BI__builtin_cpu_is)
740 return EmitX86CpuIs(E);
741 if (BuiltinID == Builtin::BI__builtin_cpu_supports)
742 return EmitX86CpuSupports(E);
743 if (BuiltinID == Builtin::BI__builtin_cpu_init)
744 return EmitX86CpuInit();
745
746 // Handle MSVC intrinsics before argument evaluation to prevent double
747 // evaluation.
748 if (std::optional<MSVCIntrin> MsvcIntId = translateX86ToMsvcIntrin(BuiltinID))
749 return EmitMSVCBuiltinExpr(*MsvcIntId, E);
750
751 SmallVector<Value*, 4> Ops;
752 bool IsMaskFCmp = false;
753 bool IsConjFMA = false;
754
755 // Find out if any arguments are required to be integer constant expressions.
756 unsigned ICEArguments = 0;
757 ASTContext::GetBuiltinTypeError Error;
758 getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments);
759 assert(Error == ASTContext::GE_None && "Should not codegen an error");
760
761 for (unsigned i = 0, e = E->getNumArgs(); i != e; i++) {
762 Ops.push_back(EmitScalarOrConstFoldImmArg(ICEArguments, i, E));
763 }
764
765 // These exist so that the builtin that takes an immediate can be bounds
766 // checked by clang to avoid passing bad immediates to the backend. Since
767 // AVX has a larger immediate than SSE we would need separate builtins to
768 // do the different bounds checking. Rather than create a clang-specific
769 // SSE-only builtin, this implements eight separate builtins to match the
770 // gcc implementation.
771 auto getCmpIntrinsicCall = [this, &Ops](Intrinsic::ID ID, unsigned Imm) {
772 Ops.push_back(llvm::ConstantInt::get(Int8Ty, Imm));
773 llvm::Function *F = CGM.getIntrinsic(ID);
774 return Builder.CreateCall(F, Ops);
775 };
776
777 // For the vector forms of FP comparisons, translate the builtins directly to
778 // IR.
779 // TODO: The builtins could be removed if the SSE header files used vector
780 // extension comparisons directly (vector ordered/unordered may need
781 // additional support via __builtin_isnan()).
782 auto getVectorFCmpIR = [this, &Ops, E](CmpInst::Predicate Pred,
783 bool IsSignaling) {
784 CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
785 Value *Cmp;
786 if (IsSignaling)
787 Cmp = Builder.CreateFCmpS(Pred, Ops[0], Ops[1]);
788 else
789 Cmp = Builder.CreateFCmp(Pred, Ops[0], Ops[1]);
790 llvm::VectorType *FPVecTy = cast<llvm::VectorType>(Ops[0]->getType());
791 llvm::VectorType *IntVecTy = llvm::VectorType::getInteger(FPVecTy);
792 Value *Sext = Builder.CreateSExt(Cmp, IntVecTy);
793 return Builder.CreateBitCast(Sext, FPVecTy);
794 };
795
796 switch (BuiltinID) {
797 default: return nullptr;
798 case X86::BI_mm_prefetch: {
799 Value *Address = Ops[0];
800 ConstantInt *C = cast<ConstantInt>(Ops[1]);
801 Value *RW = ConstantInt::get(Int32Ty, (C->getZExtValue() >> 2) & 0x1);
802 Value *Locality = ConstantInt::get(Int32Ty, C->getZExtValue() & 0x3);
803 Value *Data = ConstantInt::get(Int32Ty, 1);
804 Function *F = CGM.getIntrinsic(Intrinsic::prefetch, Address->getType());
805 return Builder.CreateCall(F, {Address, RW, Locality, Data});
806 }
807 case X86::BI_m_prefetch:
808 case X86::BI_m_prefetchw: {
809 Value *Address = Ops[0];
810 // The 'w' suffix implies write.
811 Value *RW =
812 ConstantInt::get(Int32Ty, BuiltinID == X86::BI_m_prefetchw ? 1 : 0);
813 Value *Locality = ConstantInt::get(Int32Ty, 0x3);
814 Value *Data = ConstantInt::get(Int32Ty, 1);
815 Function *F = CGM.getIntrinsic(Intrinsic::prefetch, Address->getType());
816 return Builder.CreateCall(F, {Address, RW, Locality, Data});
817 }
818 case X86::BI_mm_clflush: {
819 return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_sse2_clflush),
820 Ops[0]);
821 }
822 case X86::BI_mm_lfence: {
823 return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_sse2_lfence));
824 }
825 case X86::BI_mm_mfence: {
826 return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_sse2_mfence));
827 }
828 case X86::BI_mm_sfence: {
829 return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_sse_sfence));
830 }
831 case X86::BI_mm_pause: {
832 return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_sse2_pause));
833 }
834 case X86::BI__rdtsc: {
835 return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_rdtsc));
836 }
837 case X86::BI__builtin_ia32_rdtscp: {
838 Value *Call = Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_rdtscp));
839 Builder.CreateDefaultAlignedStore(Builder.CreateExtractValue(Call, 1),
840 Ops[0]);
841 return Builder.CreateExtractValue(Call, 0);
842 }
843 case X86::BI__builtin_ia32_lzcnt_u16:
844 case X86::BI__builtin_ia32_lzcnt_u32:
845 case X86::BI__builtin_ia32_lzcnt_u64: {
846 Function *F = CGM.getIntrinsic(Intrinsic::ctlz, Ops[0]->getType());
847 return Builder.CreateCall(F, {Ops[0], Builder.getInt1(false)});
848 }
849 case X86::BI__builtin_ia32_tzcnt_u16:
850 case X86::BI__builtin_ia32_tzcnt_u32:
851 case X86::BI__builtin_ia32_tzcnt_u64: {
852 Function *F = CGM.getIntrinsic(Intrinsic::cttz, Ops[0]->getType());
853 return Builder.CreateCall(F, {Ops[0], Builder.getInt1(false)});
854 }
855 case X86::BI__builtin_ia32_undef128:
856 case X86::BI__builtin_ia32_undef256:
857 case X86::BI__builtin_ia32_undef512:
858 // The x86 definition of "undef" is not the same as the LLVM definition
859 // (PR32176). We leave optimizing away an unnecessary zero constant to the
860 // IR optimizer and backend.
861 // TODO: If we had a "freeze" IR instruction to generate a fixed undef
862 // value, we should use that here instead of a zero.
863 return llvm::Constant::getNullValue(ConvertType(E->getType()));
864 case X86::BI__builtin_ia32_vec_ext_v4hi:
865 case X86::BI__builtin_ia32_vec_ext_v16qi:
866 case X86::BI__builtin_ia32_vec_ext_v8hi:
867 case X86::BI__builtin_ia32_vec_ext_v4si:
868 case X86::BI__builtin_ia32_vec_ext_v4sf:
869 case X86::BI__builtin_ia32_vec_ext_v2di:
870 case X86::BI__builtin_ia32_vec_ext_v32qi:
871 case X86::BI__builtin_ia32_vec_ext_v16hi:
872 case X86::BI__builtin_ia32_vec_ext_v8si:
873 case X86::BI__builtin_ia32_vec_ext_v4di: {
874 unsigned NumElts =
875 cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements();
876 uint64_t Index = cast<ConstantInt>(Ops[1])->getZExtValue();
877 Index &= NumElts - 1;
878 // These builtins exist so we can ensure the index is an ICE and in range.
879 // Otherwise we could just do this in the header file.
880 return Builder.CreateExtractElement(Ops[0], Index);
881 }
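  // e.g. __builtin_ia32_vec_ext_v4si(v, 5) masks the index down to 5 & 3 == 1
  // and emits: extractelement <4 x i32> %v, i64 1 (illustrative, names invented).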
882 case X86::BI__builtin_ia32_vec_set_v4hi:
883 case X86::BI__builtin_ia32_vec_set_v16qi:
884 case X86::BI__builtin_ia32_vec_set_v8hi:
885 case X86::BI__builtin_ia32_vec_set_v4si:
886 case X86::BI__builtin_ia32_vec_set_v2di:
887 case X86::BI__builtin_ia32_vec_set_v32qi:
888 case X86::BI__builtin_ia32_vec_set_v16hi:
889 case X86::BI__builtin_ia32_vec_set_v8si:
890 case X86::BI__builtin_ia32_vec_set_v4di: {
891 unsigned NumElts =
892 cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements();
893 unsigned Index = cast<ConstantInt>(Ops[2])->getZExtValue();
894 Index &= NumElts - 1;
895 // These builtins exist so we can ensure the index is an ICE and in range.
896 // Otherwise we could just do this in the header file.
897 return Builder.CreateInsertElement(Ops[0], Ops[1], Index);
898 }
899 case X86::BI_mm_setcsr:
900 case X86::BI__builtin_ia32_ldmxcsr: {
901 RawAddress Tmp = CreateMemTemp(E->getArg(0)->getType());
902 Builder.CreateStore(Ops[0], Tmp);
903 return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_sse_ldmxcsr),
904 Tmp.getPointer());
905 }
906 case X86::BI_mm_getcsr:
907 case X86::BI__builtin_ia32_stmxcsr: {
908 RawAddress Tmp = CreateMemTemp(E->getType());
909 Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_sse_stmxcsr),
910 Tmp.getPointer());
911 return Builder.CreateLoad(Tmp, "stmxcsr");
912 }
913 case X86::BI__builtin_ia32_xsave:
914 case X86::BI__builtin_ia32_xsave64:
915 case X86::BI__builtin_ia32_xrstor:
916 case X86::BI__builtin_ia32_xrstor64:
917 case X86::BI__builtin_ia32_xsaveopt:
918 case X86::BI__builtin_ia32_xsaveopt64:
919 case X86::BI__builtin_ia32_xrstors:
920 case X86::BI__builtin_ia32_xrstors64:
921 case X86::BI__builtin_ia32_xsavec:
922 case X86::BI__builtin_ia32_xsavec64:
923 case X86::BI__builtin_ia32_xsaves:
924 case X86::BI__builtin_ia32_xsaves64:
925 case X86::BI__builtin_ia32_xsetbv:
926 case X86::BI_xsetbv: {
927 Intrinsic::ID ID;
928#define INTRINSIC_X86_XSAVE_ID(NAME) \
929 case X86::BI__builtin_ia32_##NAME: \
930 ID = Intrinsic::x86_##NAME; \
931 break
932 switch (BuiltinID) {
933 default: llvm_unreachable("Unsupported intrinsic!");
934 INTRINSIC_X86_XSAVE_ID(xsave);
935 INTRINSIC_X86_XSAVE_ID(xsave64);
936 INTRINSIC_X86_XSAVE_ID(xrstor);
937 INTRINSIC_X86_XSAVE_ID(xrstor64);
938 INTRINSIC_X86_XSAVE_ID(xsaveopt);
939 INTRINSIC_X86_XSAVE_ID(xsaveopt64);
940 INTRINSIC_X86_XSAVE_ID(xrstors);
941 INTRINSIC_X86_XSAVE_ID(xrstors64);
942 INTRINSIC_X86_XSAVE_ID(xsavec);
943 INTRINSIC_X86_XSAVE_ID(xsavec64);
944 INTRINSIC_X86_XSAVE_ID(xsaves);
945 INTRINSIC_X86_XSAVE_ID(xsaves64);
946 INTRINSIC_X86_XSAVE_ID(xsetbv);
947 case X86::BI_xsetbv:
948 ID = Intrinsic::x86_xsetbv;
949 break;
950 }
951#undef INTRINSIC_X86_XSAVE_ID
952 Value *Mhi = Builder.CreateTrunc(
953 Builder.CreateLShr(Ops[1], ConstantInt::get(Int64Ty, 32)), Int32Ty);
954 Value *Mlo = Builder.CreateTrunc(Ops[1], Int32Ty);
955 Ops[1] = Mhi;
956 Ops.push_back(Mlo);
957 return Builder.CreateCall(CGM.getIntrinsic(ID), Ops);
958 }
959 case X86::BI__builtin_ia32_xgetbv:
960 case X86::BI_xgetbv:
961 return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_xgetbv), Ops);
962 case X86::BI__builtin_ia32_storedqudi128_mask:
963 case X86::BI__builtin_ia32_storedqusi128_mask:
964 case X86::BI__builtin_ia32_storedquhi128_mask:
965 case X86::BI__builtin_ia32_storedquqi128_mask:
966 case X86::BI__builtin_ia32_storeupd128_mask:
967 case X86::BI__builtin_ia32_storeups128_mask:
968 case X86::BI__builtin_ia32_storedqudi256_mask:
969 case X86::BI__builtin_ia32_storedqusi256_mask:
970 case X86::BI__builtin_ia32_storedquhi256_mask:
971 case X86::BI__builtin_ia32_storedquqi256_mask:
972 case X86::BI__builtin_ia32_storeupd256_mask:
973 case X86::BI__builtin_ia32_storeups256_mask:
974 case X86::BI__builtin_ia32_storedqudi512_mask:
975 case X86::BI__builtin_ia32_storedqusi512_mask:
976 case X86::BI__builtin_ia32_storedquhi512_mask:
977 case X86::BI__builtin_ia32_storedquqi512_mask:
978 case X86::BI__builtin_ia32_storeupd512_mask:
979 case X86::BI__builtin_ia32_storeups512_mask:
980 return EmitX86MaskedStore(*this, Ops, Align(1));
981
982 case X86::BI__builtin_ia32_storesbf16128_mask:
983 case X86::BI__builtin_ia32_storesh128_mask:
984 case X86::BI__builtin_ia32_storess128_mask:
985 case X86::BI__builtin_ia32_storesd128_mask:
986 return EmitX86MaskedStore(*this, Ops, Align(1));
987
988 case X86::BI__builtin_ia32_cvtmask2b128:
989 case X86::BI__builtin_ia32_cvtmask2b256:
990 case X86::BI__builtin_ia32_cvtmask2b512:
991 case X86::BI__builtin_ia32_cvtmask2w128:
992 case X86::BI__builtin_ia32_cvtmask2w256:
993 case X86::BI__builtin_ia32_cvtmask2w512:
994 case X86::BI__builtin_ia32_cvtmask2d128:
995 case X86::BI__builtin_ia32_cvtmask2d256:
996 case X86::BI__builtin_ia32_cvtmask2d512:
997 case X86::BI__builtin_ia32_cvtmask2q128:
998 case X86::BI__builtin_ia32_cvtmask2q256:
999 case X86::BI__builtin_ia32_cvtmask2q512:
1000 return EmitX86SExtMask(*this, Ops[0], ConvertType(E->getType()));
1001
1002 case X86::BI__builtin_ia32_cvtb2mask128:
1003 case X86::BI__builtin_ia32_cvtb2mask256:
1004 case X86::BI__builtin_ia32_cvtb2mask512:
1005 case X86::BI__builtin_ia32_cvtw2mask128:
1006 case X86::BI__builtin_ia32_cvtw2mask256:
1007 case X86::BI__builtin_ia32_cvtw2mask512:
1008 case X86::BI__builtin_ia32_cvtd2mask128:
1009 case X86::BI__builtin_ia32_cvtd2mask256:
1010 case X86::BI__builtin_ia32_cvtd2mask512:
1011 case X86::BI__builtin_ia32_cvtq2mask128:
1012 case X86::BI__builtin_ia32_cvtq2mask256:
1013 case X86::BI__builtin_ia32_cvtq2mask512:
1014 return EmitX86ConvertToMask(*this, Ops[0]);
1015
1016 case X86::BI__builtin_ia32_cvtdq2ps512_mask:
1017 case X86::BI__builtin_ia32_cvtqq2ps512_mask:
1018 case X86::BI__builtin_ia32_cvtqq2pd512_mask:
1019 case X86::BI__builtin_ia32_vcvtw2ph512_mask:
1020 case X86::BI__builtin_ia32_vcvtdq2ph512_mask:
1021 case X86::BI__builtin_ia32_vcvtqq2ph512_mask:
1022 return EmitX86ConvertIntToFp(*this, E, Ops, /*IsSigned*/ true);
1023 case X86::BI__builtin_ia32_cvtudq2ps512_mask:
1024 case X86::BI__builtin_ia32_cvtuqq2ps512_mask:
1025 case X86::BI__builtin_ia32_cvtuqq2pd512_mask:
1026 case X86::BI__builtin_ia32_vcvtuw2ph512_mask:
1027 case X86::BI__builtin_ia32_vcvtudq2ph512_mask:
1028 case X86::BI__builtin_ia32_vcvtuqq2ph512_mask:
1029 return EmitX86ConvertIntToFp(*this, E, Ops, /*IsSigned*/ false);
1030
1031 case X86::BI__builtin_ia32_vfmaddss3:
1032 case X86::BI__builtin_ia32_vfmaddsd3:
1033 case X86::BI__builtin_ia32_vfmaddsh3_mask:
1034 case X86::BI__builtin_ia32_vfmaddss3_mask:
1035 case X86::BI__builtin_ia32_vfmaddsd3_mask:
1036 return EmitScalarFMAExpr(*this, E, Ops, Ops[0]);
1037 case X86::BI__builtin_ia32_vfmaddss:
1038 case X86::BI__builtin_ia32_vfmaddsd:
1039 return EmitScalarFMAExpr(*this, E, Ops,
1040 Constant::getNullValue(Ops[0]->getType()));
1041 case X86::BI__builtin_ia32_vfmaddsh3_maskz:
1042 case X86::BI__builtin_ia32_vfmaddss3_maskz:
1043 case X86::BI__builtin_ia32_vfmaddsd3_maskz:
1044 return EmitScalarFMAExpr(*this, E, Ops, Ops[0], /*ZeroMask*/ true);
1045 case X86::BI__builtin_ia32_vfmaddsh3_mask3:
1046 case X86::BI__builtin_ia32_vfmaddss3_mask3:
1047 case X86::BI__builtin_ia32_vfmaddsd3_mask3:
1048 return EmitScalarFMAExpr(*this, E, Ops, Ops[2], /*ZeroMask*/ false, 2);
1049 case X86::BI__builtin_ia32_vfmsubsh3_mask3:
1050 case X86::BI__builtin_ia32_vfmsubss3_mask3:
1051 case X86::BI__builtin_ia32_vfmsubsd3_mask3:
1052 return EmitScalarFMAExpr(*this, E, Ops, Ops[2], /*ZeroMask*/ false, 2,
1053 /*NegAcc*/ true);
1054 case X86::BI__builtin_ia32_vfmaddph512_mask:
1055 case X86::BI__builtin_ia32_vfmaddph512_maskz:
1056 case X86::BI__builtin_ia32_vfmaddph512_mask3:
1057 case X86::BI__builtin_ia32_vfmaddps512_mask:
1058 case X86::BI__builtin_ia32_vfmaddps512_maskz:
1059 case X86::BI__builtin_ia32_vfmaddps512_mask3:
1060 case X86::BI__builtin_ia32_vfmsubps512_mask3:
1061 case X86::BI__builtin_ia32_vfmaddpd512_mask:
1062 case X86::BI__builtin_ia32_vfmaddpd512_maskz:
1063 case X86::BI__builtin_ia32_vfmaddpd512_mask3:
1064 case X86::BI__builtin_ia32_vfmsubpd512_mask3:
1065 case X86::BI__builtin_ia32_vfmsubph512_mask3:
1066 return EmitX86FMAExpr(*this, E, Ops, BuiltinID, /*IsAddSub*/ false);
1067 case X86::BI__builtin_ia32_vfmaddsubph512_mask:
1068 case X86::BI__builtin_ia32_vfmaddsubph512_maskz:
1069 case X86::BI__builtin_ia32_vfmaddsubph512_mask3:
1070 case X86::BI__builtin_ia32_vfmsubaddph512_mask3:
1071 case X86::BI__builtin_ia32_vfmaddsubps512_mask:
1072 case X86::BI__builtin_ia32_vfmaddsubps512_maskz:
1073 case X86::BI__builtin_ia32_vfmaddsubps512_mask3:
1074 case X86::BI__builtin_ia32_vfmsubaddps512_mask3:
1075 case X86::BI__builtin_ia32_vfmaddsubpd512_mask:
1076 case X86::BI__builtin_ia32_vfmaddsubpd512_maskz:
1077 case X86::BI__builtin_ia32_vfmaddsubpd512_mask3:
1078 case X86::BI__builtin_ia32_vfmsubaddpd512_mask3:
1079 return EmitX86FMAExpr(*this, E, Ops, BuiltinID, /*IsAddSub*/ true);
1080
1081 case X86::BI__builtin_ia32_movdqa32store128_mask:
1082 case X86::BI__builtin_ia32_movdqa64store128_mask:
1083 case X86::BI__builtin_ia32_storeaps128_mask:
1084 case X86::BI__builtin_ia32_storeapd128_mask:
1085 case X86::BI__builtin_ia32_movdqa32store256_mask:
1086 case X86::BI__builtin_ia32_movdqa64store256_mask:
1087 case X86::BI__builtin_ia32_storeaps256_mask:
1088 case X86::BI__builtin_ia32_storeapd256_mask:
1089 case X86::BI__builtin_ia32_movdqa32store512_mask:
1090 case X86::BI__builtin_ia32_movdqa64store512_mask:
1091 case X86::BI__builtin_ia32_storeaps512_mask:
1092 case X86::BI__builtin_ia32_storeapd512_mask:
1093 return EmitX86MaskedStore(
1094 *this, Ops,
1095 getContext().getTypeAlignInChars(E->getArg(1)->getType()).getAsAlign());
1096
1097 case X86::BI__builtin_ia32_loadups128_mask:
1098 case X86::BI__builtin_ia32_loadups256_mask:
1099 case X86::BI__builtin_ia32_loadups512_mask:
1100 case X86::BI__builtin_ia32_loadupd128_mask:
1101 case X86::BI__builtin_ia32_loadupd256_mask:
1102 case X86::BI__builtin_ia32_loadupd512_mask:
1103 case X86::BI__builtin_ia32_loaddquqi128_mask:
1104 case X86::BI__builtin_ia32_loaddquqi256_mask:
1105 case X86::BI__builtin_ia32_loaddquqi512_mask:
1106 case X86::BI__builtin_ia32_loaddquhi128_mask:
1107 case X86::BI__builtin_ia32_loaddquhi256_mask:
1108 case X86::BI__builtin_ia32_loaddquhi512_mask:
1109 case X86::BI__builtin_ia32_loaddqusi128_mask:
1110 case X86::BI__builtin_ia32_loaddqusi256_mask:
1111 case X86::BI__builtin_ia32_loaddqusi512_mask:
1112 case X86::BI__builtin_ia32_loaddqudi128_mask:
1113 case X86::BI__builtin_ia32_loaddqudi256_mask:
1114 case X86::BI__builtin_ia32_loaddqudi512_mask:
1115 return EmitX86MaskedLoad(*this, Ops, Align(1));
1116
1117 case X86::BI__builtin_ia32_loadsbf16128_mask:
1118 case X86::BI__builtin_ia32_loadsh128_mask:
1119 case X86::BI__builtin_ia32_loadss128_mask:
1120 case X86::BI__builtin_ia32_loadsd128_mask:
1121 return EmitX86MaskedLoad(*this, Ops, Align(1));
1122
1123 case X86::BI__builtin_ia32_loadaps128_mask:
1124 case X86::BI__builtin_ia32_loadaps256_mask:
1125 case X86::BI__builtin_ia32_loadaps512_mask:
1126 case X86::BI__builtin_ia32_loadapd128_mask:
1127 case X86::BI__builtin_ia32_loadapd256_mask:
1128 case X86::BI__builtin_ia32_loadapd512_mask:
1129 case X86::BI__builtin_ia32_movdqa32load128_mask:
1130 case X86::BI__builtin_ia32_movdqa32load256_mask:
1131 case X86::BI__builtin_ia32_movdqa32load512_mask:
1132 case X86::BI__builtin_ia32_movdqa64load128_mask:
1133 case X86::BI__builtin_ia32_movdqa64load256_mask:
1134 case X86::BI__builtin_ia32_movdqa64load512_mask:
1135 return EmitX86MaskedLoad(
1136 *this, Ops,
1137 getContext().getTypeAlignInChars(E->getArg(1)->getType()).getAsAlign());
1138
1139 case X86::BI__builtin_ia32_expandloaddf128_mask:
1140 case X86::BI__builtin_ia32_expandloaddf256_mask:
1141 case X86::BI__builtin_ia32_expandloaddf512_mask:
1142 case X86::BI__builtin_ia32_expandloadsf128_mask:
1143 case X86::BI__builtin_ia32_expandloadsf256_mask:
1144 case X86::BI__builtin_ia32_expandloadsf512_mask:
1145 case X86::BI__builtin_ia32_expandloaddi128_mask:
1146 case X86::BI__builtin_ia32_expandloaddi256_mask:
1147 case X86::BI__builtin_ia32_expandloaddi512_mask:
1148 case X86::BI__builtin_ia32_expandloadsi128_mask:
1149 case X86::BI__builtin_ia32_expandloadsi256_mask:
1150 case X86::BI__builtin_ia32_expandloadsi512_mask:
1151 case X86::BI__builtin_ia32_expandloadhi128_mask:
1152 case X86::BI__builtin_ia32_expandloadhi256_mask:
1153 case X86::BI__builtin_ia32_expandloadhi512_mask:
1154 case X86::BI__builtin_ia32_expandloadqi128_mask:
1155 case X86::BI__builtin_ia32_expandloadqi256_mask:
1156 case X86::BI__builtin_ia32_expandloadqi512_mask:
1157 return EmitX86ExpandLoad(*this, Ops);
1158
1159 case X86::BI__builtin_ia32_compressstoredf128_mask:
1160 case X86::BI__builtin_ia32_compressstoredf256_mask:
1161 case X86::BI__builtin_ia32_compressstoredf512_mask:
1162 case X86::BI__builtin_ia32_compressstoresf128_mask:
1163 case X86::BI__builtin_ia32_compressstoresf256_mask:
1164 case X86::BI__builtin_ia32_compressstoresf512_mask:
1165 case X86::BI__builtin_ia32_compressstoredi128_mask:
1166 case X86::BI__builtin_ia32_compressstoredi256_mask:
1167 case X86::BI__builtin_ia32_compressstoredi512_mask:
1168 case X86::BI__builtin_ia32_compressstoresi128_mask:
1169 case X86::BI__builtin_ia32_compressstoresi256_mask:
1170 case X86::BI__builtin_ia32_compressstoresi512_mask:
1171 case X86::BI__builtin_ia32_compressstorehi128_mask:
1172 case X86::BI__builtin_ia32_compressstorehi256_mask:
1173 case X86::BI__builtin_ia32_compressstorehi512_mask:
1174 case X86::BI__builtin_ia32_compressstoreqi128_mask:
1175 case X86::BI__builtin_ia32_compressstoreqi256_mask:
1176 case X86::BI__builtin_ia32_compressstoreqi512_mask:
1177 return EmitX86CompressStore(*this, Ops);
1178
1179 case X86::BI__builtin_ia32_expanddf128_mask:
1180 case X86::BI__builtin_ia32_expanddf256_mask:
1181 case X86::BI__builtin_ia32_expanddf512_mask:
1182 case X86::BI__builtin_ia32_expandsf128_mask:
1183 case X86::BI__builtin_ia32_expandsf256_mask:
1184 case X86::BI__builtin_ia32_expandsf512_mask:
1185 case X86::BI__builtin_ia32_expanddi128_mask:
1186 case X86::BI__builtin_ia32_expanddi256_mask:
1187 case X86::BI__builtin_ia32_expanddi512_mask:
1188 case X86::BI__builtin_ia32_expandsi128_mask:
1189 case X86::BI__builtin_ia32_expandsi256_mask:
1190 case X86::BI__builtin_ia32_expandsi512_mask:
1191 case X86::BI__builtin_ia32_expandhi128_mask:
1192 case X86::BI__builtin_ia32_expandhi256_mask:
1193 case X86::BI__builtin_ia32_expandhi512_mask:
1194 case X86::BI__builtin_ia32_expandqi128_mask:
1195 case X86::BI__builtin_ia32_expandqi256_mask:
1196 case X86::BI__builtin_ia32_expandqi512_mask:
1197 return EmitX86CompressExpand(*this, Ops, /*IsCompress*/false);
1198
1199 case X86::BI__builtin_ia32_compressdf128_mask:
1200 case X86::BI__builtin_ia32_compressdf256_mask:
1201 case X86::BI__builtin_ia32_compressdf512_mask:
1202 case X86::BI__builtin_ia32_compresssf128_mask:
1203 case X86::BI__builtin_ia32_compresssf256_mask:
1204 case X86::BI__builtin_ia32_compresssf512_mask:
1205 case X86::BI__builtin_ia32_compressdi128_mask:
1206 case X86::BI__builtin_ia32_compressdi256_mask:
1207 case X86::BI__builtin_ia32_compressdi512_mask:
1208 case X86::BI__builtin_ia32_compresssi128_mask:
1209 case X86::BI__builtin_ia32_compresssi256_mask:
1210 case X86::BI__builtin_ia32_compresssi512_mask:
1211 case X86::BI__builtin_ia32_compresshi128_mask:
1212 case X86::BI__builtin_ia32_compresshi256_mask:
1213 case X86::BI__builtin_ia32_compresshi512_mask:
1214 case X86::BI__builtin_ia32_compressqi128_mask:
1215 case X86::BI__builtin_ia32_compressqi256_mask:
1216 case X86::BI__builtin_ia32_compressqi512_mask:
1217 return EmitX86CompressExpand(*this, Ops, /*IsCompress*/true);
1218
1219 case X86::BI__builtin_ia32_gather3div2df:
1220 case X86::BI__builtin_ia32_gather3div2di:
1221 case X86::BI__builtin_ia32_gather3div4df:
1222 case X86::BI__builtin_ia32_gather3div4di:
1223 case X86::BI__builtin_ia32_gather3div4sf:
1224 case X86::BI__builtin_ia32_gather3div4si:
1225 case X86::BI__builtin_ia32_gather3div8sf:
1226 case X86::BI__builtin_ia32_gather3div8si:
1227 case X86::BI__builtin_ia32_gather3siv2df:
1228 case X86::BI__builtin_ia32_gather3siv2di:
1229 case X86::BI__builtin_ia32_gather3siv4df:
1230 case X86::BI__builtin_ia32_gather3siv4di:
1231 case X86::BI__builtin_ia32_gather3siv4sf:
1232 case X86::BI__builtin_ia32_gather3siv4si:
1233 case X86::BI__builtin_ia32_gather3siv8sf:
1234 case X86::BI__builtin_ia32_gather3siv8si:
1235 case X86::BI__builtin_ia32_gathersiv8df:
1236 case X86::BI__builtin_ia32_gathersiv16sf:
1237 case X86::BI__builtin_ia32_gatherdiv8df:
1238 case X86::BI__builtin_ia32_gatherdiv16sf:
1239 case X86::BI__builtin_ia32_gathersiv8di:
1240 case X86::BI__builtin_ia32_gathersiv16si:
1241 case X86::BI__builtin_ia32_gatherdiv8di:
1242 case X86::BI__builtin_ia32_gatherdiv16si: {
1243 Intrinsic::ID IID;
1244 switch (BuiltinID) {
1245 default: llvm_unreachable("Unexpected builtin");
1246 case X86::BI__builtin_ia32_gather3div2df:
1247 IID = Intrinsic::x86_avx512_mask_gather3div2_df;
1248 break;
1249 case X86::BI__builtin_ia32_gather3div2di:
1250 IID = Intrinsic::x86_avx512_mask_gather3div2_di;
1251 break;
1252 case X86::BI__builtin_ia32_gather3div4df:
1253 IID = Intrinsic::x86_avx512_mask_gather3div4_df;
1254 break;
1255 case X86::BI__builtin_ia32_gather3div4di:
1256 IID = Intrinsic::x86_avx512_mask_gather3div4_di;
1257 break;
1258 case X86::BI__builtin_ia32_gather3div4sf:
1259 IID = Intrinsic::x86_avx512_mask_gather3div4_sf;
1260 break;
1261 case X86::BI__builtin_ia32_gather3div4si:
1262 IID = Intrinsic::x86_avx512_mask_gather3div4_si;
1263 break;
1264 case X86::BI__builtin_ia32_gather3div8sf:
1265 IID = Intrinsic::x86_avx512_mask_gather3div8_sf;
1266 break;
1267 case X86::BI__builtin_ia32_gather3div8si:
1268 IID = Intrinsic::x86_avx512_mask_gather3div8_si;
1269 break;
1270 case X86::BI__builtin_ia32_gather3siv2df:
1271 IID = Intrinsic::x86_avx512_mask_gather3siv2_df;
1272 break;
1273 case X86::BI__builtin_ia32_gather3siv2di:
1274 IID = Intrinsic::x86_avx512_mask_gather3siv2_di;
1275 break;
1276 case X86::BI__builtin_ia32_gather3siv4df:
1277 IID = Intrinsic::x86_avx512_mask_gather3siv4_df;
1278 break;
1279 case X86::BI__builtin_ia32_gather3siv4di:
1280 IID = Intrinsic::x86_avx512_mask_gather3siv4_di;
1281 break;
1282 case X86::BI__builtin_ia32_gather3siv4sf:
1283 IID = Intrinsic::x86_avx512_mask_gather3siv4_sf;
1284 break;
1285 case X86::BI__builtin_ia32_gather3siv4si:
1286 IID = Intrinsic::x86_avx512_mask_gather3siv4_si;
1287 break;
1288 case X86::BI__builtin_ia32_gather3siv8sf:
1289 IID = Intrinsic::x86_avx512_mask_gather3siv8_sf;
1290 break;
1291 case X86::BI__builtin_ia32_gather3siv8si:
1292 IID = Intrinsic::x86_avx512_mask_gather3siv8_si;
1293 break;
1294 case X86::BI__builtin_ia32_gathersiv8df:
1295 IID = Intrinsic::x86_avx512_mask_gather_dpd_512;
1296 break;
1297 case X86::BI__builtin_ia32_gathersiv16sf:
1298 IID = Intrinsic::x86_avx512_mask_gather_dps_512;
1299 break;
1300 case X86::BI__builtin_ia32_gatherdiv8df:
1301 IID = Intrinsic::x86_avx512_mask_gather_qpd_512;
1302 break;
1303 case X86::BI__builtin_ia32_gatherdiv16sf:
1304 IID = Intrinsic::x86_avx512_mask_gather_qps_512;
1305 break;
1306 case X86::BI__builtin_ia32_gathersiv8di:
1307 IID = Intrinsic::x86_avx512_mask_gather_dpq_512;
1308 break;
1309 case X86::BI__builtin_ia32_gathersiv16si:
1310 IID = Intrinsic::x86_avx512_mask_gather_dpi_512;
1311 break;
1312 case X86::BI__builtin_ia32_gatherdiv8di:
1313 IID = Intrinsic::x86_avx512_mask_gather_qpq_512;
1314 break;
1315 case X86::BI__builtin_ia32_gatherdiv16si:
1316 IID = Intrinsic::x86_avx512_mask_gather_qpi_512;
1317 break;
1318 }
1319
1320 unsigned MinElts = std::min(
1321 cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements(),
1322 cast<llvm::FixedVectorType>(Ops[2]->getType())->getNumElements());
1323 Ops[3] = getMaskVecValue(*this, Ops[3], MinElts);
1324 Function *Intr = CGM.getIntrinsic(IID);
1325 return Builder.CreateCall(Intr, Ops);
1326 }
1327
1328 case X86::BI__builtin_ia32_scattersiv8df:
1329 case X86::BI__builtin_ia32_scattersiv16sf:
1330 case X86::BI__builtin_ia32_scatterdiv8df:
1331 case X86::BI__builtin_ia32_scatterdiv16sf:
1332 case X86::BI__builtin_ia32_scattersiv8di:
1333 case X86::BI__builtin_ia32_scattersiv16si:
1334 case X86::BI__builtin_ia32_scatterdiv8di:
1335 case X86::BI__builtin_ia32_scatterdiv16si:
1336 case X86::BI__builtin_ia32_scatterdiv2df:
1337 case X86::BI__builtin_ia32_scatterdiv2di:
1338 case X86::BI__builtin_ia32_scatterdiv4df:
1339 case X86::BI__builtin_ia32_scatterdiv4di:
1340 case X86::BI__builtin_ia32_scatterdiv4sf:
1341 case X86::BI__builtin_ia32_scatterdiv4si:
1342 case X86::BI__builtin_ia32_scatterdiv8sf:
1343 case X86::BI__builtin_ia32_scatterdiv8si:
1344 case X86::BI__builtin_ia32_scattersiv2df:
1345 case X86::BI__builtin_ia32_scattersiv2di:
1346 case X86::BI__builtin_ia32_scattersiv4df:
1347 case X86::BI__builtin_ia32_scattersiv4di:
1348 case X86::BI__builtin_ia32_scattersiv4sf:
1349 case X86::BI__builtin_ia32_scattersiv4si:
1350 case X86::BI__builtin_ia32_scattersiv8sf:
1351 case X86::BI__builtin_ia32_scattersiv8si: {
1352 Intrinsic::ID IID;
1353 switch (BuiltinID) {
1354 default: llvm_unreachable("Unexpected builtin");
1355 case X86::BI__builtin_ia32_scattersiv8df:
1356 IID = Intrinsic::x86_avx512_mask_scatter_dpd_512;
1357 break;
1358 case X86::BI__builtin_ia32_scattersiv16sf:
1359 IID = Intrinsic::x86_avx512_mask_scatter_dps_512;
1360 break;
1361 case X86::BI__builtin_ia32_scatterdiv8df:
1362 IID = Intrinsic::x86_avx512_mask_scatter_qpd_512;
1363 break;
1364 case X86::BI__builtin_ia32_scatterdiv16sf:
1365 IID = Intrinsic::x86_avx512_mask_scatter_qps_512;
1366 break;
1367 case X86::BI__builtin_ia32_scattersiv8di:
1368 IID = Intrinsic::x86_avx512_mask_scatter_dpq_512;
1369 break;
1370 case X86::BI__builtin_ia32_scattersiv16si:
1371 IID = Intrinsic::x86_avx512_mask_scatter_dpi_512;
1372 break;
1373 case X86::BI__builtin_ia32_scatterdiv8di:
1374 IID = Intrinsic::x86_avx512_mask_scatter_qpq_512;
1375 break;
1376 case X86::BI__builtin_ia32_scatterdiv16si:
1377 IID = Intrinsic::x86_avx512_mask_scatter_qpi_512;
1378 break;
1379 case X86::BI__builtin_ia32_scatterdiv2df:
1380 IID = Intrinsic::x86_avx512_mask_scatterdiv2_df;
1381 break;
1382 case X86::BI__builtin_ia32_scatterdiv2di:
1383 IID = Intrinsic::x86_avx512_mask_scatterdiv2_di;
1384 break;
1385 case X86::BI__builtin_ia32_scatterdiv4df:
1386 IID = Intrinsic::x86_avx512_mask_scatterdiv4_df;
1387 break;
1388 case X86::BI__builtin_ia32_scatterdiv4di:
1389 IID = Intrinsic::x86_avx512_mask_scatterdiv4_di;
1390 break;
1391 case X86::BI__builtin_ia32_scatterdiv4sf:
1392 IID = Intrinsic::x86_avx512_mask_scatterdiv4_sf;
1393 break;
1394 case X86::BI__builtin_ia32_scatterdiv4si:
1395 IID = Intrinsic::x86_avx512_mask_scatterdiv4_si;
1396 break;
1397 case X86::BI__builtin_ia32_scatterdiv8sf:
1398 IID = Intrinsic::x86_avx512_mask_scatterdiv8_sf;
1399 break;
1400 case X86::BI__builtin_ia32_scatterdiv8si:
1401 IID = Intrinsic::x86_avx512_mask_scatterdiv8_si;
1402 break;
1403 case X86::BI__builtin_ia32_scattersiv2df:
1404 IID = Intrinsic::x86_avx512_mask_scattersiv2_df;
1405 break;
1406 case X86::BI__builtin_ia32_scattersiv2di:
1407 IID = Intrinsic::x86_avx512_mask_scattersiv2_di;
1408 break;
1409 case X86::BI__builtin_ia32_scattersiv4df:
1410 IID = Intrinsic::x86_avx512_mask_scattersiv4_df;
1411 break;
1412 case X86::BI__builtin_ia32_scattersiv4di:
1413 IID = Intrinsic::x86_avx512_mask_scattersiv4_di;
1414 break;
1415 case X86::BI__builtin_ia32_scattersiv4sf:
1416 IID = Intrinsic::x86_avx512_mask_scattersiv4_sf;
1417 break;
1418 case X86::BI__builtin_ia32_scattersiv4si:
1419 IID = Intrinsic::x86_avx512_mask_scattersiv4_si;
1420 break;
1421 case X86::BI__builtin_ia32_scattersiv8sf:
1422 IID = Intrinsic::x86_avx512_mask_scattersiv8_sf;
1423 break;
1424 case X86::BI__builtin_ia32_scattersiv8si:
1425 IID = Intrinsic::x86_avx512_mask_scattersiv8_si;
1426 break;
1427 }
1428
1429 unsigned MinElts = std::min(
1430 cast<llvm::FixedVectorType>(Ops[2]->getType())->getNumElements(),
1431 cast<llvm::FixedVectorType>(Ops[3]->getType())->getNumElements());
1432 Ops[1] = getMaskVecValue(*this, Ops[1], MinElts);
1433 Function *Intr = CGM.getIntrinsic(IID);
1434 return Builder.CreateCall(Intr, Ops);
1435 }
1436
1437 case X86::BI__builtin_ia32_vextractf128_pd256:
1438 case X86::BI__builtin_ia32_vextractf128_ps256:
1439 case X86::BI__builtin_ia32_vextractf128_si256:
1440 case X86::BI__builtin_ia32_extract128i256:
1441 case X86::BI__builtin_ia32_extractf64x4_mask:
1442 case X86::BI__builtin_ia32_extractf32x4_mask:
1443 case X86::BI__builtin_ia32_extracti64x4_mask:
1444 case X86::BI__builtin_ia32_extracti32x4_mask:
1445 case X86::BI__builtin_ia32_extractf32x8_mask:
1446 case X86::BI__builtin_ia32_extracti32x8_mask:
1447 case X86::BI__builtin_ia32_extractf32x4_256_mask:
1448 case X86::BI__builtin_ia32_extracti32x4_256_mask:
1449 case X86::BI__builtin_ia32_extractf64x2_256_mask:
1450 case X86::BI__builtin_ia32_extracti64x2_256_mask:
1451 case X86::BI__builtin_ia32_extractf64x2_512_mask:
1452 case X86::BI__builtin_ia32_extracti64x2_512_mask: {
1453 auto *DstTy = cast<llvm::FixedVectorType>(ConvertType(E->getType()));
1454 unsigned NumElts = DstTy->getNumElements();
1455 unsigned SrcNumElts =
1456 cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements();
1457 unsigned SubVectors = SrcNumElts / NumElts;
1458 unsigned Index = cast<ConstantInt>(Ops[1])->getZExtValue();
1459 assert(llvm::isPowerOf2_32(SubVectors) && "Expected power of 2 subvectors");
1460 Index &= SubVectors - 1; // Remove any extra bits.
1461 Index *= NumElts;
1462
1463 int Indices[16];
1464 for (unsigned i = 0; i != NumElts; ++i)
1465 Indices[i] = i + Index;
1466
1467 Value *Res = Builder.CreateShuffleVector(Ops[0], ArrayRef(Indices, NumElts),
1468 "extract");
1469
1470 if (Ops.size() == 4)
1471 Res = EmitX86Select(*this, Ops[3], Res, Ops[2]);
1472
1473 return Res;
1474 }
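// Illustrative example (not exhaustive): for __builtin_ia32_extractf32x4_mask
// with a <16 x float> source and immediate 2, NumElts = 4, SubVectors = 4 and
// Index = 8, so the emitted shuffle mask is <8, 9, 10, 11>; the masked
// variants then blend the result with the pass-through operand via
// EmitX86Select.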
1475 case X86::BI__builtin_ia32_vinsertf128_pd256:
1476 case X86::BI__builtin_ia32_vinsertf128_ps256:
1477 case X86::BI__builtin_ia32_vinsertf128_si256:
1478 case X86::BI__builtin_ia32_insert128i256:
1479 case X86::BI__builtin_ia32_insertf64x4:
1480 case X86::BI__builtin_ia32_insertf32x4:
1481 case X86::BI__builtin_ia32_inserti64x4:
1482 case X86::BI__builtin_ia32_inserti32x4:
1483 case X86::BI__builtin_ia32_insertf32x8:
1484 case X86::BI__builtin_ia32_inserti32x8:
1485 case X86::BI__builtin_ia32_insertf32x4_256:
1486 case X86::BI__builtin_ia32_inserti32x4_256:
1487 case X86::BI__builtin_ia32_insertf64x2_256:
1488 case X86::BI__builtin_ia32_inserti64x2_256:
1489 case X86::BI__builtin_ia32_insertf64x2_512:
1490 case X86::BI__builtin_ia32_inserti64x2_512: {
1491 unsigned DstNumElts =
1492 cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements();
1493 unsigned SrcNumElts =
1494 cast<llvm::FixedVectorType>(Ops[1]->getType())->getNumElements();
1495 unsigned SubVectors = DstNumElts / SrcNumElts;
1496 unsigned Index = cast<ConstantInt>(Ops[2])->getZExtValue();
1497 assert(llvm::isPowerOf2_32(SubVectors) && "Expected power of 2 subvectors");
1498 Index &= SubVectors - 1; // Remove any extra bits.
1499 Index *= SrcNumElts;
1500
1501 int Indices[16];
1502 for (unsigned i = 0; i != DstNumElts; ++i)
1503 Indices[i] = (i >= SrcNumElts) ? SrcNumElts + (i % SrcNumElts) : i;
1504
1505 Value *Op1 = Builder.CreateShuffleVector(
1506 Ops[1], ArrayRef(Indices, DstNumElts), "widen");
1507
1508 for (unsigned i = 0; i != DstNumElts; ++i) {
1509 if (i >= Index && i < (Index + SrcNumElts))
1510 Indices[i] = (i - Index) + DstNumElts;
1511 else
1512 Indices[i] = i;
1513 }
1514
1515 return Builder.CreateShuffleVector(Ops[0], Op1,
1516 ArrayRef(Indices, DstNumElts), "insert");
1517 }
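// Illustrative example: for __builtin_ia32_insertf32x4 into a <16 x float>
// destination with immediate 1, SrcNumElts = 4 and Index = 4; the source is
// first widened to 16 elements (upper lanes undefined) and the final shuffle
// mask is <0, 1, 2, 3, 16, 17, 18, 19, 8, 9, 10, 11, 12, 13, 14, 15>.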
1518 case X86::BI__builtin_ia32_pmovqd512_mask:
1519 case X86::BI__builtin_ia32_pmovwb512_mask: {
1520 Value *Res = Builder.CreateTrunc(Ops[0], Ops[1]->getType());
1521 return EmitX86Select(*this, Ops[2], Res, Ops[1]);
1522 }
1523 case X86::BI__builtin_ia32_pmovdb512_mask:
1524 case X86::BI__builtin_ia32_pmovdw512_mask:
1525 case X86::BI__builtin_ia32_pmovqw512_mask: {
1526 if (const auto *C = dyn_cast<Constant>(Ops[2]))
1527 if (C->isAllOnesValue())
1528 return Builder.CreateTrunc(Ops[0], Ops[1]->getType());
1529
1530 Intrinsic::ID IID;
1531 switch (BuiltinID) {
1532 default: llvm_unreachable("Unsupported intrinsic!");
1533 case X86::BI__builtin_ia32_pmovdb512_mask:
1534 IID = Intrinsic::x86_avx512_mask_pmov_db_512;
1535 break;
1536 case X86::BI__builtin_ia32_pmovdw512_mask:
1537 IID = Intrinsic::x86_avx512_mask_pmov_dw_512;
1538 break;
1539 case X86::BI__builtin_ia32_pmovqw512_mask:
1540 IID = Intrinsic::x86_avx512_mask_pmov_qw_512;
1541 break;
1542 }
1543
1544 Function *Intr = CGM.getIntrinsic(IID);
1545 return Builder.CreateCall(Intr, Ops);
1546 }
1547 case X86::BI__builtin_ia32_pblendw128:
1548 case X86::BI__builtin_ia32_blendpd:
1549 case X86::BI__builtin_ia32_blendps:
1550 case X86::BI__builtin_ia32_blendpd256:
1551 case X86::BI__builtin_ia32_blendps256:
1552 case X86::BI__builtin_ia32_pblendw256:
1553 case X86::BI__builtin_ia32_pblendd128:
1554 case X86::BI__builtin_ia32_pblendd256: {
1555 unsigned NumElts =
1556 cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements();
1557 unsigned Imm = cast<llvm::ConstantInt>(Ops[2])->getZExtValue();
1558
1559 int Indices[16];
1560 // If there are more than 8 elements, the immediate is used twice so make
1561 // sure we handle that.
1562 for (unsigned i = 0; i != NumElts; ++i)
1563 Indices[i] = ((Imm >> (i % 8)) & 0x1) ? NumElts + i : i;
1564
1565 return Builder.CreateShuffleVector(Ops[0], Ops[1],
1566 ArrayRef(Indices, NumElts), "blend");
1567 }
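// Illustrative example: _mm_blend_epi16(a, b, 0xF0) reaches here as
// __builtin_ia32_pblendw128 with NumElts = 8; bits 0-3 of the immediate are
// clear and bits 4-7 are set, so the shuffle mask is
// <0, 1, 2, 3, 12, 13, 14, 15>, i.e. a[0..3] followed by b[4..7].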
1568 case X86::BI__builtin_ia32_pshuflw:
1569 case X86::BI__builtin_ia32_pshuflw256:
1570 case X86::BI__builtin_ia32_pshuflw512: {
1571 uint32_t Imm = cast<llvm::ConstantInt>(Ops[1])->getZExtValue();
1572 auto *Ty = cast<llvm::FixedVectorType>(Ops[0]->getType());
1573 unsigned NumElts = Ty->getNumElements();
1574
1575 // Splat the 8 bits of the immediate 4 times to help the loop wrap around.
1576 Imm = (Imm & 0xff) * 0x01010101;
1577
1578 int Indices[32];
1579 for (unsigned l = 0; l != NumElts; l += 8) {
1580 for (unsigned i = 0; i != 4; ++i) {
1581 Indices[l + i] = l + (Imm & 3);
1582 Imm >>= 2;
1583 }
1584 for (unsigned i = 4; i != 8; ++i)
1585 Indices[l + i] = l + i;
1586 }
1587
1588 return Builder.CreateShuffleVector(Ops[0], ArrayRef(Indices, NumElts),
1589 "pshuflw");
1590 }
1591 case X86::BI__builtin_ia32_pshufhw:
1592 case X86::BI__builtin_ia32_pshufhw256:
1593 case X86::BI__builtin_ia32_pshufhw512: {
1594 uint32_t Imm = cast<llvm::ConstantInt>(Ops[1])->getZExtValue();
1595 auto *Ty = cast<llvm::FixedVectorType>(Ops[0]->getType());
1596 unsigned NumElts = Ty->getNumElements();
1597
1598 // Splat the 8 bits of the immediate 4 times to help the loop wrap around.
1599 Imm = (Imm & 0xff) * 0x01010101;
1600
1601 int Indices[32];
1602 for (unsigned l = 0; l != NumElts; l += 8) {
1603 for (unsigned i = 0; i != 4; ++i)
1604 Indices[l + i] = l + i;
1605 for (unsigned i = 4; i != 8; ++i) {
1606 Indices[l + i] = l + 4 + (Imm & 3);
1607 Imm >>= 2;
1608 }
1609 }
1610
1611 return Builder.CreateShuffleVector(Ops[0], ArrayRef(Indices, NumElts),
1612 "pshufhw");
1613 }
1614 case X86::BI__builtin_ia32_pshufd:
1615 case X86::BI__builtin_ia32_pshufd256:
1616 case X86::BI__builtin_ia32_pshufd512:
1617 case X86::BI__builtin_ia32_vpermilpd:
1618 case X86::BI__builtin_ia32_vpermilps:
1619 case X86::BI__builtin_ia32_vpermilpd256:
1620 case X86::BI__builtin_ia32_vpermilps256:
1621 case X86::BI__builtin_ia32_vpermilpd512:
1622 case X86::BI__builtin_ia32_vpermilps512: {
1623 uint32_t Imm = cast<llvm::ConstantInt>(Ops[1])->getZExtValue();
1624 auto *Ty = cast<llvm::FixedVectorType>(Ops[0]->getType());
1625 unsigned NumElts = Ty->getNumElements();
1626 unsigned NumLanes = Ty->getPrimitiveSizeInBits() / 128;
1627 unsigned NumLaneElts = NumElts / NumLanes;
1628
1629 // Splat the 8 bits of the immediate 4 times to help the loop wrap around.
1630 Imm = (Imm & 0xff) * 0x01010101;
1631
1632 int Indices[16];
1633 for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
1634 for (unsigned i = 0; i != NumLaneElts; ++i) {
1635 Indices[i + l] = (Imm % NumLaneElts) + l;
1636 Imm /= NumLaneElts;
1637 }
1638 }
1639
1640 return Builder.CreateShuffleVector(Ops[0], ArrayRef(Indices, NumElts),
1641 "permil");
1642 }
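// Illustrative example: __builtin_ia32_pshufd with immediate 0x1B on a
// <4 x i32> consumes two immediate bits per element and produces the shuffle
// mask <3, 2, 1, 0>, i.e. the elements reversed; the 256/512-bit forms repeat
// the same pattern within each 128-bit lane.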
1643 case X86::BI__builtin_ia32_shufpd:
1644 case X86::BI__builtin_ia32_shufpd256:
1645 case X86::BI__builtin_ia32_shufpd512:
1646 case X86::BI__builtin_ia32_shufps:
1647 case X86::BI__builtin_ia32_shufps256:
1648 case X86::BI__builtin_ia32_shufps512: {
1649 uint32_t Imm = cast<llvm::ConstantInt>(Ops[2])->getZExtValue();
1650 auto *Ty = cast<llvm::FixedVectorType>(Ops[0]->getType());
1651 unsigned NumElts = Ty->getNumElements();
1652 unsigned NumLanes = Ty->getPrimitiveSizeInBits() / 128;
1653 unsigned NumLaneElts = NumElts / NumLanes;
1654
1655 // Splat the 8 bits of the immediate 4 times to help the loop wrap around.
1656 Imm = (Imm & 0xff) * 0x01010101;
1657
1658 int Indices[16];
1659 for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
1660 for (unsigned i = 0; i != NumLaneElts; ++i) {
1661 unsigned Index = Imm % NumLaneElts;
1662 Imm /= NumLaneElts;
1663 if (i >= (NumLaneElts / 2))
1664 Index += NumElts;
1665 Indices[l + i] = l + Index;
1666 }
1667 }
1668
1669 return Builder.CreateShuffleVector(Ops[0], Ops[1],
1670 ArrayRef(Indices, NumElts), "shufp");
1671 }
1672 case X86::BI__builtin_ia32_permdi256:
1673 case X86::BI__builtin_ia32_permdf256:
1674 case X86::BI__builtin_ia32_permdi512:
1675 case X86::BI__builtin_ia32_permdf512: {
1676 unsigned Imm = cast<llvm::ConstantInt>(Ops[1])->getZExtValue();
1677 auto *Ty = cast<llvm::FixedVectorType>(Ops[0]->getType());
1678 unsigned NumElts = Ty->getNumElements();
1679
1680 // These intrinsics operate on 256-bit lanes of four 64-bit elements.
1681 int Indices[8];
1682 for (unsigned l = 0; l != NumElts; l += 4)
1683 for (unsigned i = 0; i != 4; ++i)
1684 Indices[l + i] = l + ((Imm >> (2 * i)) & 0x3);
1685
1686 return Builder.CreateShuffleVector(Ops[0], ArrayRef(Indices, NumElts),
1687 "perm");
1688 }
1689 case X86::BI__builtin_ia32_palignr128:
1690 case X86::BI__builtin_ia32_palignr256:
1691 case X86::BI__builtin_ia32_palignr512: {
1692 unsigned ShiftVal = cast<llvm::ConstantInt>(Ops[2])->getZExtValue() & 0xff;
1693
1694 unsigned NumElts =
1695 cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements();
1696 assert(NumElts % 16 == 0);
1697
1698 // If palignr is shifting the pair of vectors more than the size of two
1699 // lanes, emit zero.
1700 if (ShiftVal >= 32)
1701 return llvm::Constant::getNullValue(ConvertType(E->getType()));
1702
1703 // If palignr is shifting the pair of input vectors more than one lane,
1704 // but less than two lanes, convert to shifting in zeroes.
1705 if (ShiftVal > 16) {
1706 ShiftVal -= 16;
1707 Ops[1] = Ops[0];
1708 Ops[0] = llvm::Constant::getNullValue(Ops[0]->getType());
1709 }
1710
1711 int Indices[64];
1712 // 256/512-bit palignr operates on 128-bit lanes so we need to handle that.
1713 for (unsigned l = 0; l != NumElts; l += 16) {
1714 for (unsigned i = 0; i != 16; ++i) {
1715 unsigned Idx = ShiftVal + i;
1716 if (Idx >= 16)
1717 Idx += NumElts - 16; // End of lane, switch operand.
1718 Indices[l + i] = Idx + l;
1719 }
1720 }
1721
1722 return Builder.CreateShuffleVector(Ops[1], Ops[0],
1723 ArrayRef(Indices, NumElts), "palignr");
1724 }
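// Illustrative example: _mm_alignr_epi8(a, b, 4) arrives here as
// __builtin_ia32_palignr128 with ShiftVal = 4; since the shuffle operands are
// (b, a), the mask is <4, 5, ..., 19>, selecting b[4..15] followed by a[0..3].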
1725 case X86::BI__builtin_ia32_alignd128:
1726 case X86::BI__builtin_ia32_alignd256:
1727 case X86::BI__builtin_ia32_alignd512:
1728 case X86::BI__builtin_ia32_alignq128:
1729 case X86::BI__builtin_ia32_alignq256:
1730 case X86::BI__builtin_ia32_alignq512: {
1731 unsigned NumElts =
1732 cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements();
1733 unsigned ShiftVal = cast<llvm::ConstantInt>(Ops[2])->getZExtValue() & 0xff;
1734
1735 // Mask the shift amount to the width of the vector.
1736 ShiftVal &= NumElts - 1;
1737
1738 int Indices[16];
1739 for (unsigned i = 0; i != NumElts; ++i)
1740 Indices[i] = i + ShiftVal;
1741
1742 return Builder.CreateShuffleVector(Ops[1], Ops[0],
1743 ArrayRef(Indices, NumElts), "valign");
1744 }
1745 case X86::BI__builtin_ia32_shuf_f32x4_256:
1746 case X86::BI__builtin_ia32_shuf_f64x2_256:
1747 case X86::BI__builtin_ia32_shuf_i32x4_256:
1748 case X86::BI__builtin_ia32_shuf_i64x2_256:
1749 case X86::BI__builtin_ia32_shuf_f32x4:
1750 case X86::BI__builtin_ia32_shuf_f64x2:
1751 case X86::BI__builtin_ia32_shuf_i32x4:
1752 case X86::BI__builtin_ia32_shuf_i64x2: {
1753 unsigned Imm = cast<llvm::ConstantInt>(Ops[2])->getZExtValue();
1754 auto *Ty = cast<llvm::FixedVectorType>(Ops[0]->getType());
1755 unsigned NumElts = Ty->getNumElements();
1756 unsigned NumLanes = Ty->getPrimitiveSizeInBits() == 512 ? 4 : 2;
1757 unsigned NumLaneElts = NumElts / NumLanes;
1758
1759 int Indices[16];
1760 for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
1761 unsigned Index = (Imm % NumLanes) * NumLaneElts;
1762 Imm /= NumLanes; // Discard the bits we just used.
1763 if (l >= (NumElts / 2))
1764 Index += NumElts; // Switch to other source.
1765 for (unsigned i = 0; i != NumLaneElts; ++i) {
1766 Indices[l + i] = Index + i;
1767 }
1768 }
1769
1770 return Builder.CreateShuffleVector(Ops[0], Ops[1],
1771 ArrayRef(Indices, NumElts), "shuf");
1772 }
1773
1774 case X86::BI__builtin_ia32_vperm2f128_pd256:
1775 case X86::BI__builtin_ia32_vperm2f128_ps256:
1776 case X86::BI__builtin_ia32_vperm2f128_si256:
1777 case X86::BI__builtin_ia32_permti256: {
1778 unsigned Imm = cast<llvm::ConstantInt>(Ops[2])->getZExtValue();
1779 unsigned NumElts =
1780 cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements();
1781
1782 // This takes a very simple approach since there are two lanes and a
1783 // shuffle can have 2 inputs. So we reserve the first input for the first
1784 // lane and the second input for the second lane. This may result in
1785 // duplicate sources, but this can be dealt with in the backend.
1786
1787 Value *OutOps[2];
1788 int Indices[8];
1789 for (unsigned l = 0; l != 2; ++l) {
1790 // Determine the source for this lane.
1791 if (Imm & (1 << ((l * 4) + 3)))
1792 OutOps[l] = llvm::ConstantAggregateZero::get(Ops[0]->getType());
1793 else if (Imm & (1 << ((l * 4) + 1)))
1794 OutOps[l] = Ops[1];
1795 else
1796 OutOps[l] = Ops[0];
1797
1798 for (unsigned i = 0; i != NumElts/2; ++i) {
1799 // Start with ith element of the source for this lane.
1800 unsigned Idx = (l * NumElts) + i;
1801 // If bit 0 of the immediate half is set, switch to the high half of
1802 // the source.
1803 if (Imm & (1 << (l * 4)))
1804 Idx += NumElts/2;
1805 Indices[(l * (NumElts/2)) + i] = Idx;
1806 }
1807 }
1808
1809 return Builder.CreateShuffleVector(OutOps[0], OutOps[1],
1810 ArrayRef(Indices, NumElts), "vperm");
1811 }
1812
1813 case X86::BI__builtin_ia32_pslldqi128_byteshift:
1814 case X86::BI__builtin_ia32_pslldqi256_byteshift:
1815 case X86::BI__builtin_ia32_pslldqi512_byteshift: {
1816 unsigned ShiftVal = cast<llvm::ConstantInt>(Ops[1])->getZExtValue() & 0xff;
1817 auto *ResultType = cast<llvm::FixedVectorType>(Ops[0]->getType());
1818 // Builtin type is vXi64 so multiply by 8 to get bytes.
1819 unsigned NumElts = ResultType->getNumElements() * 8;
1820
1821 // If pslldq is shifting the vector more than 15 bytes, emit zero.
1822 if (ShiftVal >= 16)
1823 return llvm::Constant::getNullValue(ResultType);
1824
1825 int Indices[64];
1826 // 256/512-bit pslldq operates on 128-bit lanes so we need to handle that
1827 for (unsigned l = 0; l != NumElts; l += 16) {
1828 for (unsigned i = 0; i != 16; ++i) {
1829 unsigned Idx = NumElts + i - ShiftVal;
1830 if (Idx < NumElts) Idx -= NumElts - 16; // end of lane, switch operand.
1831 Indices[l + i] = Idx + l;
1832 }
1833 }
1834
1835 auto *VecTy = llvm::FixedVectorType::get(Int8Ty, NumElts);
1836 Value *Cast = Builder.CreateBitCast(Ops[0], VecTy, "cast");
1837 Value *Zero = llvm::Constant::getNullValue(VecTy);
1838 Value *SV = Builder.CreateShuffleVector(
1839 Zero, Cast, ArrayRef(Indices, NumElts), "pslldq");
1840 return Builder.CreateBitCast(SV, Ops[0]->getType(), "cast");
1841 }
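// Illustrative example: __builtin_ia32_pslldqi128_byteshift with ShiftVal = 4
// bitcasts the operand to <16 x i8> and emits a shuffle of (zero, input) with
// mask <12, 13, 14, 15, 16, ..., 27>, i.e. four zero bytes followed by input
// bytes 0-11, which is the vector shifted left by four bytes.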
1842 case X86::BI__builtin_ia32_psrldqi128_byteshift:
1843 case X86::BI__builtin_ia32_psrldqi256_byteshift:
1844 case X86::BI__builtin_ia32_psrldqi512_byteshift: {
1845 unsigned ShiftVal = cast<llvm::ConstantInt>(Ops[1])->getZExtValue() & 0xff;
1846 auto *ResultType = cast<llvm::FixedVectorType>(Ops[0]->getType());
1847 // Builtin type is vXi64 so multiply by 8 to get bytes.
1848 unsigned NumElts = ResultType->getNumElements() * 8;
1849
1850 // If psrldq is shifting the vector more than 15 bytes, emit zero.
1851 if (ShiftVal >= 16)
1852 return llvm::Constant::getNullValue(ResultType);
1853
1854 int Indices[64];
1855 // 256/512-bit psrldq operates on 128-bit lanes so we need to handle that
1856 for (unsigned l = 0; l != NumElts; l += 16) {
1857 for (unsigned i = 0; i != 16; ++i) {
1858 unsigned Idx = i + ShiftVal;
1859 if (Idx >= 16) Idx += NumElts - 16; // end of lane, switch operand.
1860 Indices[l + i] = Idx + l;
1861 }
1862 }
1863
1864 auto *VecTy = llvm::FixedVectorType::get(Int8Ty, NumElts);
1865 Value *Cast = Builder.CreateBitCast(Ops[0], VecTy, "cast");
1866 Value *Zero = llvm::Constant::getNullValue(VecTy);
1867 Value *SV = Builder.CreateShuffleVector(
1868 Cast, Zero, ArrayRef(Indices, NumElts), "psrldq");
1869 return Builder.CreateBitCast(SV, ResultType, "cast");
1870 }
1871 case X86::BI__builtin_ia32_kshiftliqi:
1872 case X86::BI__builtin_ia32_kshiftlihi:
1873 case X86::BI__builtin_ia32_kshiftlisi:
1874 case X86::BI__builtin_ia32_kshiftlidi: {
1875 unsigned ShiftVal = cast<llvm::ConstantInt>(Ops[1])->getZExtValue() & 0xff;
1876 unsigned NumElts = Ops[0]->getType()->getIntegerBitWidth();
1877
1878 if (ShiftVal >= NumElts)
1879 return llvm::Constant::getNullValue(Ops[0]->getType());
1880
1881 Value *In = getMaskVecValue(*this, Ops[0], NumElts);
1882
1883 int Indices[64];
1884 for (unsigned i = 0; i != NumElts; ++i)
1885 Indices[i] = NumElts + i - ShiftVal;
1886
1887 Value *Zero = llvm::Constant::getNullValue(In->getType());
1888 Value *SV = Builder.CreateShuffleVector(
1889 Zero, In, ArrayRef(Indices, NumElts), "kshiftl");
1890 return Builder.CreateBitCast(SV, Ops[0]->getType());
1891 }
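// Illustrative example: __builtin_ia32_kshiftliqi with ShiftVal = 3 bitcasts
// the i8 mask to <8 x i1> and shuffles (zero, mask) with indices
// <5, 6, 7, 8, 9, 10, 11, 12>, so the three low result bits are zero and the
// remaining bits come from mask bits 0-4, i.e. (mask << 3) & 0xff.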
1892 case X86::BI__builtin_ia32_kshiftriqi:
1893 case X86::BI__builtin_ia32_kshiftrihi:
1894 case X86::BI__builtin_ia32_kshiftrisi:
1895 case X86::BI__builtin_ia32_kshiftridi: {
1896 unsigned ShiftVal = cast<llvm::ConstantInt>(Ops[1])->getZExtValue() & 0xff;
1897 unsigned NumElts = Ops[0]->getType()->getIntegerBitWidth();
1898
1899 if (ShiftVal >= NumElts)
1900 return llvm::Constant::getNullValue(Ops[0]->getType());
1901
1902 Value *In = getMaskVecValue(*this, Ops[0], NumElts);
1903
1904 int Indices[64];
1905 for (unsigned i = 0; i != NumElts; ++i)
1906 Indices[i] = i + ShiftVal;
1907
1908 Value *Zero = llvm::Constant::getNullValue(In->getType());
1909 Value *SV = Builder.CreateShuffleVector(
1910 In, Zero, ArrayRef(Indices, NumElts), "kshiftr");
1911 return Builder.CreateBitCast(SV, Ops[0]->getType());
1912 }
1913 case X86::BI__builtin_ia32_movnti:
1914 case X86::BI__builtin_ia32_movnti64:
1915 case X86::BI__builtin_ia32_movntsd:
1916 case X86::BI__builtin_ia32_movntss: {
1917 llvm::MDNode *Node = llvm::MDNode::get(
1918 getLLVMContext(), llvm::ConstantAsMetadata::get(Builder.getInt32(1)));
1919
1920 Value *Ptr = Ops[0];
1921 Value *Src = Ops[1];
1922
1923 // Extract the 0'th element of the source vector.
1924 if (BuiltinID == X86::BI__builtin_ia32_movntsd ||
1925 BuiltinID == X86::BI__builtin_ia32_movntss)
1926 Src = Builder.CreateExtractElement(Src, (uint64_t)0, "extract");
1927
1928 // Unaligned nontemporal store of the scalar value.
1929 StoreInst *SI = Builder.CreateDefaultAlignedStore(Src, Ptr);
1930 SI->setMetadata(llvm::LLVMContext::MD_nontemporal, Node);
1931 SI->setAlignment(llvm::Align(1));
1932 return SI;
1933 }
1934 // Rotate is a special case of funnel shift - 1st 2 args are the same.
1935 case X86::BI__builtin_ia32_vprotbi:
1936 case X86::BI__builtin_ia32_vprotwi:
1937 case X86::BI__builtin_ia32_vprotdi:
1938 case X86::BI__builtin_ia32_vprotqi:
1939 case X86::BI__builtin_ia32_prold128:
1940 case X86::BI__builtin_ia32_prold256:
1941 case X86::BI__builtin_ia32_prold512:
1942 case X86::BI__builtin_ia32_prolq128:
1943 case X86::BI__builtin_ia32_prolq256:
1944 case X86::BI__builtin_ia32_prolq512:
1945 return EmitX86FunnelShift(*this, Ops[0], Ops[0], Ops[1], false);
1946 case X86::BI__builtin_ia32_prord128:
1947 case X86::BI__builtin_ia32_prord256:
1948 case X86::BI__builtin_ia32_prord512:
1949 case X86::BI__builtin_ia32_prorq128:
1950 case X86::BI__builtin_ia32_prorq256:
1951 case X86::BI__builtin_ia32_prorq512:
1952 return EmitX86FunnelShift(*this, Ops[0], Ops[0], Ops[1], true);
1953 case X86::BI__builtin_ia32_selectb_128:
1954 case X86::BI__builtin_ia32_selectb_256:
1955 case X86::BI__builtin_ia32_selectb_512:
1956 case X86::BI__builtin_ia32_selectw_128:
1957 case X86::BI__builtin_ia32_selectw_256:
1958 case X86::BI__builtin_ia32_selectw_512:
1959 case X86::BI__builtin_ia32_selectd_128:
1960 case X86::BI__builtin_ia32_selectd_256:
1961 case X86::BI__builtin_ia32_selectd_512:
1962 case X86::BI__builtin_ia32_selectq_128:
1963 case X86::BI__builtin_ia32_selectq_256:
1964 case X86::BI__builtin_ia32_selectq_512:
1965 case X86::BI__builtin_ia32_selectph_128:
1966 case X86::BI__builtin_ia32_selectph_256:
1967 case X86::BI__builtin_ia32_selectph_512:
1968 case X86::BI__builtin_ia32_selectpbf_128:
1969 case X86::BI__builtin_ia32_selectpbf_256:
1970 case X86::BI__builtin_ia32_selectpbf_512:
1971 case X86::BI__builtin_ia32_selectps_128:
1972 case X86::BI__builtin_ia32_selectps_256:
1973 case X86::BI__builtin_ia32_selectps_512:
1974 case X86::BI__builtin_ia32_selectpd_128:
1975 case X86::BI__builtin_ia32_selectpd_256:
1976 case X86::BI__builtin_ia32_selectpd_512:
1977 return EmitX86Select(*this, Ops[0], Ops[1], Ops[2]);
1978 case X86::BI__builtin_ia32_selectsh_128:
1979 case X86::BI__builtin_ia32_selectsbf_128:
1980 case X86::BI__builtin_ia32_selectss_128:
1981 case X86::BI__builtin_ia32_selectsd_128: {
1982 Value *A = Builder.CreateExtractElement(Ops[1], (uint64_t)0);
1983 Value *B = Builder.CreateExtractElement(Ops[2], (uint64_t)0);
1984 A = EmitX86ScalarSelect(*this, Ops[0], A, B);
1985 return Builder.CreateInsertElement(Ops[1], A, (uint64_t)0);
1986 }
1987 case X86::BI__builtin_ia32_cmpb128_mask:
1988 case X86::BI__builtin_ia32_cmpb256_mask:
1989 case X86::BI__builtin_ia32_cmpb512_mask:
1990 case X86::BI__builtin_ia32_cmpw128_mask:
1991 case X86::BI__builtin_ia32_cmpw256_mask:
1992 case X86::BI__builtin_ia32_cmpw512_mask:
1993 case X86::BI__builtin_ia32_cmpd128_mask:
1994 case X86::BI__builtin_ia32_cmpd256_mask:
1995 case X86::BI__builtin_ia32_cmpd512_mask:
1996 case X86::BI__builtin_ia32_cmpq128_mask:
1997 case X86::BI__builtin_ia32_cmpq256_mask:
1998 case X86::BI__builtin_ia32_cmpq512_mask: {
1999 unsigned CC = cast<llvm::ConstantInt>(Ops[2])->getZExtValue() & 0x7;
2000 return EmitX86MaskedCompare(*this, CC, true, Ops);
2001 }
2002 case X86::BI__builtin_ia32_ucmpb128_mask:
2003 case X86::BI__builtin_ia32_ucmpb256_mask:
2004 case X86::BI__builtin_ia32_ucmpb512_mask:
2005 case X86::BI__builtin_ia32_ucmpw128_mask:
2006 case X86::BI__builtin_ia32_ucmpw256_mask:
2007 case X86::BI__builtin_ia32_ucmpw512_mask:
2008 case X86::BI__builtin_ia32_ucmpd128_mask:
2009 case X86::BI__builtin_ia32_ucmpd256_mask:
2010 case X86::BI__builtin_ia32_ucmpd512_mask:
2011 case X86::BI__builtin_ia32_ucmpq128_mask:
2012 case X86::BI__builtin_ia32_ucmpq256_mask:
2013 case X86::BI__builtin_ia32_ucmpq512_mask: {
2014 unsigned CC = cast<llvm::ConstantInt>(Ops[2])->getZExtValue() & 0x7;
2015 return EmitX86MaskedCompare(*this, CC, false, Ops);
2016 }
2017 case X86::BI__builtin_ia32_vpcomb:
2018 case X86::BI__builtin_ia32_vpcomw:
2019 case X86::BI__builtin_ia32_vpcomd:
2020 case X86::BI__builtin_ia32_vpcomq:
2021 return EmitX86vpcom(*this, Ops, true);
2022 case X86::BI__builtin_ia32_vpcomub:
2023 case X86::BI__builtin_ia32_vpcomuw:
2024 case X86::BI__builtin_ia32_vpcomud:
2025 case X86::BI__builtin_ia32_vpcomuq:
2026 return EmitX86vpcom(*this, Ops, false);
2027
2028 case X86::BI__builtin_ia32_kortestcqi:
2029 case X86::BI__builtin_ia32_kortestchi:
2030 case X86::BI__builtin_ia32_kortestcsi:
2031 case X86::BI__builtin_ia32_kortestcdi: {
2032 Value *Or = EmitX86MaskLogic(*this, Instruction::Or, Ops);
2033 Value *C = llvm::Constant::getAllOnesValue(Ops[0]->getType());
2034 Value *Cmp = Builder.CreateICmpEQ(Or, C);
2035 return Builder.CreateZExt(Cmp, ConvertType(E->getType()));
2036 }
2037 case X86::BI__builtin_ia32_kortestzqi:
2038 case X86::BI__builtin_ia32_kortestzhi:
2039 case X86::BI__builtin_ia32_kortestzsi:
2040 case X86::BI__builtin_ia32_kortestzdi: {
2041 Value *Or = EmitX86MaskLogic(*this, Instruction::Or, Ops);
2042 Value *C = llvm::Constant::getNullValue(Ops[0]->getType());
2043 Value *Cmp = Builder.CreateICmpEQ(Or, C);
2044 return Builder.CreateZExt(Cmp, ConvertType(E->getType()));
2045 }
2046
2047 case X86::BI__builtin_ia32_ktestcqi:
2048 case X86::BI__builtin_ia32_ktestzqi:
2049 case X86::BI__builtin_ia32_ktestchi:
2050 case X86::BI__builtin_ia32_ktestzhi:
2051 case X86::BI__builtin_ia32_ktestcsi:
2052 case X86::BI__builtin_ia32_ktestzsi:
2053 case X86::BI__builtin_ia32_ktestcdi:
2054 case X86::BI__builtin_ia32_ktestzdi: {
2055 Intrinsic::ID IID;
2056 switch (BuiltinID) {
2057 default: llvm_unreachable("Unsupported intrinsic!");
2058 case X86::BI__builtin_ia32_ktestcqi:
2059 IID = Intrinsic::x86_avx512_ktestc_b;
2060 break;
2061 case X86::BI__builtin_ia32_ktestzqi:
2062 IID = Intrinsic::x86_avx512_ktestz_b;
2063 break;
2064 case X86::BI__builtin_ia32_ktestchi:
2065 IID = Intrinsic::x86_avx512_ktestc_w;
2066 break;
2067 case X86::BI__builtin_ia32_ktestzhi:
2068 IID = Intrinsic::x86_avx512_ktestz_w;
2069 break;
2070 case X86::BI__builtin_ia32_ktestcsi:
2071 IID = Intrinsic::x86_avx512_ktestc_d;
2072 break;
2073 case X86::BI__builtin_ia32_ktestzsi:
2074 IID = Intrinsic::x86_avx512_ktestz_d;
2075 break;
2076 case X86::BI__builtin_ia32_ktestcdi:
2077 IID = Intrinsic::x86_avx512_ktestc_q;
2078 break;
2079 case X86::BI__builtin_ia32_ktestzdi:
2080 IID = Intrinsic::x86_avx512_ktestz_q;
2081 break;
2082 }
2083
2084 unsigned NumElts = Ops[0]->getType()->getIntegerBitWidth();
2085 Value *LHS = getMaskVecValue(*this, Ops[0], NumElts);
2086 Value *RHS = getMaskVecValue(*this, Ops[1], NumElts);
2087 Function *Intr = CGM.getIntrinsic(IID);
2088 return Builder.CreateCall(Intr, {LHS, RHS});
2089 }
2090
2091 case X86::BI__builtin_ia32_kaddqi:
2092 case X86::BI__builtin_ia32_kaddhi:
2093 case X86::BI__builtin_ia32_kaddsi:
2094 case X86::BI__builtin_ia32_kadddi: {
2095 Intrinsic::ID IID;
2096 switch (BuiltinID) {
2097 default: llvm_unreachable("Unsupported intrinsic!");
2098 case X86::BI__builtin_ia32_kaddqi:
2099 IID = Intrinsic::x86_avx512_kadd_b;
2100 break;
2101 case X86::BI__builtin_ia32_kaddhi:
2102 IID = Intrinsic::x86_avx512_kadd_w;
2103 break;
2104 case X86::BI__builtin_ia32_kaddsi:
2105 IID = Intrinsic::x86_avx512_kadd_d;
2106 break;
2107 case X86::BI__builtin_ia32_kadddi:
2108 IID = Intrinsic::x86_avx512_kadd_q;
2109 break;
2110 }
2111
2112 unsigned NumElts = Ops[0]->getType()->getIntegerBitWidth();
2113 Value *LHS = getMaskVecValue(*this, Ops[0], NumElts);
2114 Value *RHS = getMaskVecValue(*this, Ops[1], NumElts);
2115 Function *Intr = CGM.getIntrinsic(IID);
2116 Value *Res = Builder.CreateCall(Intr, {LHS, RHS});
2117 return Builder.CreateBitCast(Res, Ops[0]->getType());
2118 }
2119 case X86::BI__builtin_ia32_kandqi:
2120 case X86::BI__builtin_ia32_kandhi:
2121 case X86::BI__builtin_ia32_kandsi:
2122 case X86::BI__builtin_ia32_kanddi:
2123 return EmitX86MaskLogic(*this, Instruction::And, Ops);
2124 case X86::BI__builtin_ia32_kandnqi:
2125 case X86::BI__builtin_ia32_kandnhi:
2126 case X86::BI__builtin_ia32_kandnsi:
2127 case X86::BI__builtin_ia32_kandndi:
2128 return EmitX86MaskLogic(*this, Instruction::And, Ops, true);
2129 case X86::BI__builtin_ia32_korqi:
2130 case X86::BI__builtin_ia32_korhi:
2131 case X86::BI__builtin_ia32_korsi:
2132 case X86::BI__builtin_ia32_kordi:
2133 return EmitX86MaskLogic(*this, Instruction::Or, Ops);
2134 case X86::BI__builtin_ia32_kxnorqi:
2135 case X86::BI__builtin_ia32_kxnorhi:
2136 case X86::BI__builtin_ia32_kxnorsi:
2137 case X86::BI__builtin_ia32_kxnordi:
2138 return EmitX86MaskLogic(*this, Instruction::Xor, Ops, true);
2139 case X86::BI__builtin_ia32_kxorqi:
2140 case X86::BI__builtin_ia32_kxorhi:
2141 case X86::BI__builtin_ia32_kxorsi:
2142 case X86::BI__builtin_ia32_kxordi:
2143 return EmitX86MaskLogic(*this, Instruction::Xor, Ops);
2144 case X86::BI__builtin_ia32_knotqi:
2145 case X86::BI__builtin_ia32_knothi:
2146 case X86::BI__builtin_ia32_knotsi:
2147 case X86::BI__builtin_ia32_knotdi: {
2148 unsigned NumElts = Ops[0]->getType()->getIntegerBitWidth();
2149 Value *Res = getMaskVecValue(*this, Ops[0], NumElts);
2150 return Builder.CreateBitCast(Builder.CreateNot(Res),
2151 Ops[0]->getType());
2152 }
2153 case X86::BI__builtin_ia32_kmovb:
2154 case X86::BI__builtin_ia32_kmovw:
2155 case X86::BI__builtin_ia32_kmovd:
2156 case X86::BI__builtin_ia32_kmovq: {
2157 // Bitcast to vXi1 type and then back to integer. This gets the mask
2158 // register type into the IR, but might be optimized out depending on
2159 // what's around it.
2160 unsigned NumElts = Ops[0]->getType()->getIntegerBitWidth();
2161 Value *Res = getMaskVecValue(*this, Ops[0], NumElts);
2162 return Builder.CreateBitCast(Res, Ops[0]->getType());
2163 }
2164
2165 case X86::BI__builtin_ia32_kunpckdi:
2166 case X86::BI__builtin_ia32_kunpcksi:
2167 case X86::BI__builtin_ia32_kunpckhi: {
2168 unsigned NumElts = Ops[0]->getType()->getIntegerBitWidth();
2169 Value *LHS = getMaskVecValue(*this, Ops[0], NumElts);
2170 Value *RHS = getMaskVecValue(*this, Ops[1], NumElts);
2171 int Indices[64];
2172 for (unsigned i = 0; i != NumElts; ++i)
2173 Indices[i] = i;
2174
2175 // First extract half of each vector. This gives better codegen than
2176 // doing it in a single shuffle.
2177 LHS = Builder.CreateShuffleVector(LHS, LHS, ArrayRef(Indices, NumElts / 2));
2178 RHS = Builder.CreateShuffleVector(RHS, RHS, ArrayRef(Indices, NumElts / 2));
2179 // Concat the vectors.
2180 // NOTE: Operands are swapped to match the intrinsic definition.
2181 Value *Res =
2182 Builder.CreateShuffleVector(RHS, LHS, ArrayRef(Indices, NumElts));
2183 return Builder.CreateBitCast(Res, Ops[0]->getType());
2184 }
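// Illustrative example: _mm512_kunpackb(A, B) reaches here as
// __builtin_ia32_kunpckhi with NumElts = 16; each mask is truncated to its
// low 8 bits and the halves are concatenated as (B_lo, A_lo), giving
// ((A & 0xff) << 8) | (B & 0xff) after the bitcast back to i16.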
2185
2186 case X86::BI__builtin_ia32_sqrtss:
2187 case X86::BI__builtin_ia32_sqrtsd: {
2188 Value *A = Builder.CreateExtractElement(Ops[0], (uint64_t)0);
2189 Function *F;
2190 if (Builder.getIsFPConstrained()) {
2191 CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
2192 F = CGM.getIntrinsic(Intrinsic::experimental_constrained_sqrt,
2193 A->getType());
2194 A = Builder.CreateConstrainedFPCall(F, {A});
2195 } else {
2196 F = CGM.getIntrinsic(Intrinsic::sqrt, A->getType());
2197 A = Builder.CreateCall(F, {A});
2198 }
2199 return Builder.CreateInsertElement(Ops[0], A, (uint64_t)0);
2200 }
2201 case X86::BI__builtin_ia32_sqrtsh_round_mask:
2202 case X86::BI__builtin_ia32_sqrtsd_round_mask:
2203 case X86::BI__builtin_ia32_sqrtss_round_mask: {
2204 unsigned CC = cast<llvm::ConstantInt>(Ops[4])->getZExtValue();
2205 // Only lower to a generic sqrt if the rounding mode is 4 (AKA CUR_DIRECTION);
2206 // otherwise keep the target-specific intrinsic.
2207 if (CC != 4) {
2208 Intrinsic::ID IID;
2209
2210 switch (BuiltinID) {
2211 default:
2212 llvm_unreachable("Unsupported intrinsic!");
2213 case X86::BI__builtin_ia32_sqrtsh_round_mask:
2214 IID = Intrinsic::x86_avx512fp16_mask_sqrt_sh;
2215 break;
2216 case X86::BI__builtin_ia32_sqrtsd_round_mask:
2217 IID = Intrinsic::x86_avx512_mask_sqrt_sd;
2218 break;
2219 case X86::BI__builtin_ia32_sqrtss_round_mask:
2220 IID = Intrinsic::x86_avx512_mask_sqrt_ss;
2221 break;
2222 }
2223 return Builder.CreateCall(CGM.getIntrinsic(IID), Ops);
2224 }
2225 Value *A = Builder.CreateExtractElement(Ops[1], (uint64_t)0);
2226 Function *F;
2227 if (Builder.getIsFPConstrained()) {
2228 CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
2229 F = CGM.getIntrinsic(Intrinsic::experimental_constrained_sqrt,
2230 A->getType());
2231 A = Builder.CreateConstrainedFPCall(F, A);
2232 } else {
2233 F = CGM.getIntrinsic(Intrinsic::sqrt, A->getType());
2234 A = Builder.CreateCall(F, A);
2235 }
2236 Value *Src = Builder.CreateExtractElement(Ops[2], (uint64_t)0);
2237 A = EmitX86ScalarSelect(*this, Ops[3], A, Src);
2238 return Builder.CreateInsertElement(Ops[0], A, (uint64_t)0);
2239 }
2240 case X86::BI__builtin_ia32_sqrtpd256:
2241 case X86::BI__builtin_ia32_sqrtpd:
2242 case X86::BI__builtin_ia32_sqrtps256:
2243 case X86::BI__builtin_ia32_sqrtps:
2244 case X86::BI__builtin_ia32_sqrtph256:
2245 case X86::BI__builtin_ia32_sqrtph:
2246 case X86::BI__builtin_ia32_sqrtph512:
2247 case X86::BI__builtin_ia32_vsqrtbf16256:
2248 case X86::BI__builtin_ia32_vsqrtbf16:
2249 case X86::BI__builtin_ia32_vsqrtbf16512:
2250 case X86::BI__builtin_ia32_sqrtps512:
2251 case X86::BI__builtin_ia32_sqrtpd512: {
2252 if (Ops.size() == 2) {
2253 unsigned CC = cast<llvm::ConstantInt>(Ops[1])->getZExtValue();
2254 // Only lower to a generic sqrt if the rounding mode is 4 (AKA CUR_DIRECTION);
2255 // otherwise keep the target-specific intrinsic.
2256 if (CC != 4) {
2257 Intrinsic::ID IID;
2258
2259 switch (BuiltinID) {
2260 default:
2261 llvm_unreachable("Unsupported intrinsic!");
2262 case X86::BI__builtin_ia32_sqrtph512:
2263 IID = Intrinsic::x86_avx512fp16_sqrt_ph_512;
2264 break;
2265 case X86::BI__builtin_ia32_sqrtps512:
2266 IID = Intrinsic::x86_avx512_sqrt_ps_512;
2267 break;
2268 case X86::BI__builtin_ia32_sqrtpd512:
2269 IID = Intrinsic::x86_avx512_sqrt_pd_512;
2270 break;
2271 }
2272 return Builder.CreateCall(CGM.getIntrinsic(IID), Ops);
2273 }
2274 }
2275 if (Builder.getIsFPConstrained()) {
2276 CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
2277 Function *F = CGM.getIntrinsic(Intrinsic::experimental_constrained_sqrt,
2278 Ops[0]->getType());
2279 return Builder.CreateConstrainedFPCall(F, Ops[0]);
2280 } else {
2281 Function *F = CGM.getIntrinsic(Intrinsic::sqrt, Ops[0]->getType());
2282 return Builder.CreateCall(F, Ops[0]);
2283 }
2284 }
2285
2286 case X86::BI__builtin_ia32_pmuludq128:
2287 case X86::BI__builtin_ia32_pmuludq256:
2288 case X86::BI__builtin_ia32_pmuludq512:
2289 return EmitX86Muldq(*this, /*IsSigned*/false, Ops);
2290
2291 case X86::BI__builtin_ia32_pmuldq128:
2292 case X86::BI__builtin_ia32_pmuldq256:
2293 case X86::BI__builtin_ia32_pmuldq512:
2294 return EmitX86Muldq(*this, /*IsSigned*/true, Ops);
2295
2296 case X86::BI__builtin_ia32_pternlogd512_mask:
2297 case X86::BI__builtin_ia32_pternlogq512_mask:
2298 case X86::BI__builtin_ia32_pternlogd128_mask:
2299 case X86::BI__builtin_ia32_pternlogd256_mask:
2300 case X86::BI__builtin_ia32_pternlogq128_mask:
2301 case X86::BI__builtin_ia32_pternlogq256_mask:
2302 return EmitX86Ternlog(*this, /*ZeroMask*/false, Ops);
2303
2304 case X86::BI__builtin_ia32_pternlogd512_maskz:
2305 case X86::BI__builtin_ia32_pternlogq512_maskz:
2306 case X86::BI__builtin_ia32_pternlogd128_maskz:
2307 case X86::BI__builtin_ia32_pternlogd256_maskz:
2308 case X86::BI__builtin_ia32_pternlogq128_maskz:
2309 case X86::BI__builtin_ia32_pternlogq256_maskz:
2310 return EmitX86Ternlog(*this, /*ZeroMask*/true, Ops);
2311
2312 case X86::BI__builtin_ia32_vpshldd128:
2313 case X86::BI__builtin_ia32_vpshldd256:
2314 case X86::BI__builtin_ia32_vpshldd512:
2315 case X86::BI__builtin_ia32_vpshldq128:
2316 case X86::BI__builtin_ia32_vpshldq256:
2317 case X86::BI__builtin_ia32_vpshldq512:
2318 case X86::BI__builtin_ia32_vpshldw128:
2319 case X86::BI__builtin_ia32_vpshldw256:
2320 case X86::BI__builtin_ia32_vpshldw512:
2321 return EmitX86FunnelShift(*this, Ops[0], Ops[1], Ops[2], false);
2322
2323 case X86::BI__builtin_ia32_vpshrdd128:
2324 case X86::BI__builtin_ia32_vpshrdd256:
2325 case X86::BI__builtin_ia32_vpshrdd512:
2326 case X86::BI__builtin_ia32_vpshrdq128:
2327 case X86::BI__builtin_ia32_vpshrdq256:
2328 case X86::BI__builtin_ia32_vpshrdq512:
2329 case X86::BI__builtin_ia32_vpshrdw128:
2330 case X86::BI__builtin_ia32_vpshrdw256:
2331 case X86::BI__builtin_ia32_vpshrdw512:
2332 // Ops 0 and 1 are swapped.
2333 return EmitX86FunnelShift(*this, Ops[1], Ops[0], Ops[2], true);
2334
2335 // Reductions
2336 case X86::BI__builtin_ia32_reduce_fadd_pd512:
2337 case X86::BI__builtin_ia32_reduce_fadd_ps512:
2338 case X86::BI__builtin_ia32_reduce_fadd_ph512:
2339 case X86::BI__builtin_ia32_reduce_fadd_ph256:
2340 case X86::BI__builtin_ia32_reduce_fadd_ph128: {
2341 Function *F =
2342 CGM.getIntrinsic(Intrinsic::vector_reduce_fadd, Ops[1]->getType());
2343 IRBuilder<>::FastMathFlagGuard FMFGuard(Builder);
2344 Builder.getFastMathFlags().setAllowReassoc();
2345 return Builder.CreateCall(F, {Ops[0], Ops[1]});
2346 }
2347 case X86::BI__builtin_ia32_reduce_fmul_pd512:
2348 case X86::BI__builtin_ia32_reduce_fmul_ps512:
2349 case X86::BI__builtin_ia32_reduce_fmul_ph512:
2350 case X86::BI__builtin_ia32_reduce_fmul_ph256:
2351 case X86::BI__builtin_ia32_reduce_fmul_ph128: {
2352 Function *F =
2353 CGM.getIntrinsic(Intrinsic::vector_reduce_fmul, Ops[1]->getType());
2354 IRBuilder<>::FastMathFlagGuard FMFGuard(Builder);
2355 Builder.getFastMathFlags().setAllowReassoc();
2356 return Builder.CreateCall(F, {Ops[0], Ops[1]});
2357 }
2358 case X86::BI__builtin_ia32_reduce_fmax_pd512:
2359 case X86::BI__builtin_ia32_reduce_fmax_ps512:
2360 case X86::BI__builtin_ia32_reduce_fmax_ph512:
2361 case X86::BI__builtin_ia32_reduce_fmax_ph256:
2362 case X86::BI__builtin_ia32_reduce_fmax_ph128: {
2363 Function *F =
2364 CGM.getIntrinsic(Intrinsic::vector_reduce_fmax, Ops[0]->getType());
2365 IRBuilder<>::FastMathFlagGuard FMFGuard(Builder);
2366 Builder.getFastMathFlags().setNoNaNs();
2367 return Builder.CreateCall(F, {Ops[0]});
2368 }
2369 case X86::BI__builtin_ia32_reduce_fmin_pd512:
2370 case X86::BI__builtin_ia32_reduce_fmin_ps512:
2371 case X86::BI__builtin_ia32_reduce_fmin_ph512:
2372 case X86::BI__builtin_ia32_reduce_fmin_ph256:
2373 case X86::BI__builtin_ia32_reduce_fmin_ph128: {
2374 Function *F =
2375 CGM.getIntrinsic(Intrinsic::vector_reduce_fmin, Ops[0]->getType());
2376 IRBuilder<>::FastMathFlagGuard FMFGuard(Builder);
2377 Builder.getFastMathFlags().setNoNaNs();
2378 return Builder.CreateCall(F, {Ops[0]});
2379 }
2380
2381 case X86::BI__builtin_ia32_rdrand16_step:
2382 case X86::BI__builtin_ia32_rdrand32_step:
2383 case X86::BI__builtin_ia32_rdrand64_step:
2384 case X86::BI__builtin_ia32_rdseed16_step:
2385 case X86::BI__builtin_ia32_rdseed32_step:
2386 case X86::BI__builtin_ia32_rdseed64_step: {
2387 Intrinsic::ID ID;
2388 switch (BuiltinID) {
2389 default: llvm_unreachable("Unsupported intrinsic!");
2390 case X86::BI__builtin_ia32_rdrand16_step:
2391 ID = Intrinsic::x86_rdrand_16;
2392 break;
2393 case X86::BI__builtin_ia32_rdrand32_step:
2394 ID = Intrinsic::x86_rdrand_32;
2395 break;
2396 case X86::BI__builtin_ia32_rdrand64_step:
2397 ID = Intrinsic::x86_rdrand_64;
2398 break;
2399 case X86::BI__builtin_ia32_rdseed16_step:
2400 ID = Intrinsic::x86_rdseed_16;
2401 break;
2402 case X86::BI__builtin_ia32_rdseed32_step:
2403 ID = Intrinsic::x86_rdseed_32;
2404 break;
2405 case X86::BI__builtin_ia32_rdseed64_step:
2406 ID = Intrinsic::x86_rdseed_64;
2407 break;
2408 }
2409
2410 Value *Call = Builder.CreateCall(CGM.getIntrinsic(ID));
2411 Builder.CreateDefaultAlignedStore(Builder.CreateExtractValue(Call, 0),
2412 Ops[0]);
2413 return Builder.CreateExtractValue(Call, 1);
2414 }
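// Illustrative usage: __builtin_ia32_rdrand32_step(&r) stores the hardware
// random value through the pointer operand and returns the carry flag
// (nonzero on success, zero if no random data was available).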
2415 case X86::BI__builtin_ia32_addcarryx_u32:
2416 case X86::BI__builtin_ia32_addcarryx_u64:
2417 case X86::BI__builtin_ia32_subborrow_u32:
2418 case X86::BI__builtin_ia32_subborrow_u64: {
2419 Intrinsic::ID IID;
2420 switch (BuiltinID) {
2421 default: llvm_unreachable("Unsupported intrinsic!");
2422 case X86::BI__builtin_ia32_addcarryx_u32:
2423 IID = Intrinsic::x86_addcarry_32;
2424 break;
2425 case X86::BI__builtin_ia32_addcarryx_u64:
2426 IID = Intrinsic::x86_addcarry_64;
2427 break;
2428 case X86::BI__builtin_ia32_subborrow_u32:
2429 IID = Intrinsic::x86_subborrow_32;
2430 break;
2431 case X86::BI__builtin_ia32_subborrow_u64:
2432 IID = Intrinsic::x86_subborrow_64;
2433 break;
2434 }
2435
2436 Value *Call = Builder.CreateCall(CGM.getIntrinsic(IID),
2437 { Ops[0], Ops[1], Ops[2] });
2438 Builder.CreateDefaultAlignedStore(Builder.CreateExtractValue(Call, 1),
2439 Ops[3]);
2440 return Builder.CreateExtractValue(Call, 0);
2441 }
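// Illustrative example: __builtin_ia32_addcarryx_u32(1, 0xffffffffu, 0u, &out)
// calls llvm.x86.addcarry.32, stores the 32-bit sum (0) through the pointer
// operand and returns the carry-out (1).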
2442
2443 case X86::BI__builtin_ia32_fpclassps128_mask:
2444 case X86::BI__builtin_ia32_fpclassps256_mask:
2445 case X86::BI__builtin_ia32_fpclassps512_mask:
2446 case X86::BI__builtin_ia32_vfpclassbf16128_mask:
2447 case X86::BI__builtin_ia32_vfpclassbf16256_mask:
2448 case X86::BI__builtin_ia32_vfpclassbf16512_mask:
2449 case X86::BI__builtin_ia32_fpclassph128_mask:
2450 case X86::BI__builtin_ia32_fpclassph256_mask:
2451 case X86::BI__builtin_ia32_fpclassph512_mask:
2452 case X86::BI__builtin_ia32_fpclasspd128_mask:
2453 case X86::BI__builtin_ia32_fpclasspd256_mask:
2454 case X86::BI__builtin_ia32_fpclasspd512_mask: {
2455 unsigned NumElts =
2456 cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements();
2457 Value *MaskIn = Ops[2];
2458 Ops.erase(&Ops[2]);
2459
2460 Intrinsic::ID ID;
2461 switch (BuiltinID) {
2462 default: llvm_unreachable("Unsupported intrinsic!");
2463 case X86::BI__builtin_ia32_vfpclassbf16128_mask:
2464 ID = Intrinsic::x86_avx10_fpclass_bf16_128;
2465 break;
2466 case X86::BI__builtin_ia32_vfpclassbf16256_mask:
2467 ID = Intrinsic::x86_avx10_fpclass_bf16_256;
2468 break;
2469 case X86::BI__builtin_ia32_vfpclassbf16512_mask:
2470 ID = Intrinsic::x86_avx10_fpclass_bf16_512;
2471 break;
2472 case X86::BI__builtin_ia32_fpclassph128_mask:
2473 ID = Intrinsic::x86_avx512fp16_fpclass_ph_128;
2474 break;
2475 case X86::BI__builtin_ia32_fpclassph256_mask:
2476 ID = Intrinsic::x86_avx512fp16_fpclass_ph_256;
2477 break;
2478 case X86::BI__builtin_ia32_fpclassph512_mask:
2479 ID = Intrinsic::x86_avx512fp16_fpclass_ph_512;
2480 break;
2481 case X86::BI__builtin_ia32_fpclassps128_mask:
2482 ID = Intrinsic::x86_avx512_fpclass_ps_128;
2483 break;
2484 case X86::BI__builtin_ia32_fpclassps256_mask:
2485 ID = Intrinsic::x86_avx512_fpclass_ps_256;
2486 break;
2487 case X86::BI__builtin_ia32_fpclassps512_mask:
2488 ID = Intrinsic::x86_avx512_fpclass_ps_512;
2489 break;
2490 case X86::BI__builtin_ia32_fpclasspd128_mask:
2491 ID = Intrinsic::x86_avx512_fpclass_pd_128;
2492 break;
2493 case X86::BI__builtin_ia32_fpclasspd256_mask:
2494 ID = Intrinsic::x86_avx512_fpclass_pd_256;
2495 break;
2496 case X86::BI__builtin_ia32_fpclasspd512_mask:
2497 ID = Intrinsic::x86_avx512_fpclass_pd_512;
2498 break;
2499 }
2500
2501 Value *Fpclass = Builder.CreateCall(CGM.getIntrinsic(ID), Ops);
2502 return EmitX86MaskedCompareResult(*this, Fpclass, NumElts, MaskIn);
2503 }
2504
2505 case X86::BI__builtin_ia32_vp2intersect_q_512:
2506 case X86::BI__builtin_ia32_vp2intersect_q_256:
2507 case X86::BI__builtin_ia32_vp2intersect_q_128:
2508 case X86::BI__builtin_ia32_vp2intersect_d_512:
2509 case X86::BI__builtin_ia32_vp2intersect_d_256:
2510 case X86::BI__builtin_ia32_vp2intersect_d_128: {
2511 unsigned NumElts =
2512 cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements();
2513 Intrinsic::ID ID;
2514
2515 switch (BuiltinID) {
2516 default: llvm_unreachable("Unsupported intrinsic!");
2517 case X86::BI__builtin_ia32_vp2intersect_q_512:
2518 ID = Intrinsic::x86_avx512_vp2intersect_q_512;
2519 break;
2520 case X86::BI__builtin_ia32_vp2intersect_q_256:
2521 ID = Intrinsic::x86_avx512_vp2intersect_q_256;
2522 break;
2523 case X86::BI__builtin_ia32_vp2intersect_q_128:
2524 ID = Intrinsic::x86_avx512_vp2intersect_q_128;
2525 break;
2526 case X86::BI__builtin_ia32_vp2intersect_d_512:
2527 ID = Intrinsic::x86_avx512_vp2intersect_d_512;
2528 break;
2529 case X86::BI__builtin_ia32_vp2intersect_d_256:
2530 ID = Intrinsic::x86_avx512_vp2intersect_d_256;
2531 break;
2532 case X86::BI__builtin_ia32_vp2intersect_d_128:
2533 ID = Intrinsic::x86_avx512_vp2intersect_d_128;
2534 break;
2535 }
2536
2537 Value *Call = Builder.CreateCall(CGM.getIntrinsic(ID), {Ops[0], Ops[1]});
2538 Value *Result = Builder.CreateExtractValue(Call, 0);
2539 Result = EmitX86MaskedCompareResult(*this, Result, NumElts, nullptr);
2540 Builder.CreateDefaultAlignedStore(Result, Ops[2]);
2541
2542 Result = Builder.CreateExtractValue(Call, 1);
2543 Result = EmitX86MaskedCompareResult(*this, Result, NumElts, nullptr);
2544 return Builder.CreateDefaultAlignedStore(Result, Ops[3]);
2545 }
2546
2547 case X86::BI__builtin_ia32_vpmultishiftqb128:
2548 case X86::BI__builtin_ia32_vpmultishiftqb256:
2549 case X86::BI__builtin_ia32_vpmultishiftqb512: {
2550 Intrinsic::ID ID;
2551 switch (BuiltinID) {
2552 default: llvm_unreachable("Unsupported intrinsic!");
2553 case X86::BI__builtin_ia32_vpmultishiftqb128:
2554 ID = Intrinsic::x86_avx512_pmultishift_qb_128;
2555 break;
2556 case X86::BI__builtin_ia32_vpmultishiftqb256:
2557 ID = Intrinsic::x86_avx512_pmultishift_qb_256;
2558 break;
2559 case X86::BI__builtin_ia32_vpmultishiftqb512:
2560 ID = Intrinsic::x86_avx512_pmultishift_qb_512;
2561 break;
2562 }
2563
2564 return Builder.CreateCall(CGM.getIntrinsic(ID), Ops);
2565 }
2566
2567 case X86::BI__builtin_ia32_vpshufbitqmb128_mask:
2568 case X86::BI__builtin_ia32_vpshufbitqmb256_mask:
2569 case X86::BI__builtin_ia32_vpshufbitqmb512_mask: {
2570 unsigned NumElts =
2571 cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements();
2572 Value *MaskIn = Ops[2];
2573 Ops.erase(&Ops[2]);
2574
2575 Intrinsic::ID ID;
2576 switch (BuiltinID) {
2577 default: llvm_unreachable("Unsupported intrinsic!");
2578 case X86::BI__builtin_ia32_vpshufbitqmb128_mask:
2579 ID = Intrinsic::x86_avx512_vpshufbitqmb_128;
2580 break;
2581 case X86::BI__builtin_ia32_vpshufbitqmb256_mask:
2582 ID = Intrinsic::x86_avx512_vpshufbitqmb_256;
2583 break;
2584 case X86::BI__builtin_ia32_vpshufbitqmb512_mask:
2585 ID = Intrinsic::x86_avx512_vpshufbitqmb_512;
2586 break;
2587 }
2588
2589 Value *Shufbit = Builder.CreateCall(CGM.getIntrinsic(ID), Ops);
2590 return EmitX86MaskedCompareResult(*this, Shufbit, NumElts, MaskIn);
2591 }
2592
2593 // packed comparison intrinsics
2594 case X86::BI__builtin_ia32_cmpeqps:
2595 case X86::BI__builtin_ia32_cmpeqpd:
2596 return getVectorFCmpIR(CmpInst::FCMP_OEQ, /*IsSignaling*/false);
2597 case X86::BI__builtin_ia32_cmpltps:
2598 case X86::BI__builtin_ia32_cmpltpd:
2599 return getVectorFCmpIR(CmpInst::FCMP_OLT, /*IsSignaling*/true);
2600 case X86::BI__builtin_ia32_cmpleps:
2601 case X86::BI__builtin_ia32_cmplepd:
2602 return getVectorFCmpIR(CmpInst::FCMP_OLE, /*IsSignaling*/true);
2603 case X86::BI__builtin_ia32_cmpunordps:
2604 case X86::BI__builtin_ia32_cmpunordpd:
2605 return getVectorFCmpIR(CmpInst::FCMP_UNO, /*IsSignaling*/false);
2606 case X86::BI__builtin_ia32_cmpneqps:
2607 case X86::BI__builtin_ia32_cmpneqpd:
2608 return getVectorFCmpIR(CmpInst::FCMP_UNE, /*IsSignaling*/false);
2609 case X86::BI__builtin_ia32_cmpnltps:
2610 case X86::BI__builtin_ia32_cmpnltpd:
2611 return getVectorFCmpIR(CmpInst::FCMP_UGE, /*IsSignaling*/true);
2612 case X86::BI__builtin_ia32_cmpnleps:
2613 case X86::BI__builtin_ia32_cmpnlepd:
2614 return getVectorFCmpIR(CmpInst::FCMP_UGT, /*IsSignaling*/true);
2615 case X86::BI__builtin_ia32_cmpordps:
2616 case X86::BI__builtin_ia32_cmpordpd:
2617 return getVectorFCmpIR(CmpInst::FCMP_ORD, /*IsSignaling*/false);
2618 case X86::BI__builtin_ia32_cmpph128_mask:
2619 case X86::BI__builtin_ia32_cmpph256_mask:
2620 case X86::BI__builtin_ia32_cmpph512_mask:
2621 case X86::BI__builtin_ia32_cmpps128_mask:
2622 case X86::BI__builtin_ia32_cmpps256_mask:
2623 case X86::BI__builtin_ia32_cmpps512_mask:
2624 case X86::BI__builtin_ia32_cmppd128_mask:
2625 case X86::BI__builtin_ia32_cmppd256_mask:
2626 case X86::BI__builtin_ia32_cmppd512_mask:
2627 case X86::BI__builtin_ia32_vcmpbf16512_mask:
2628 case X86::BI__builtin_ia32_vcmpbf16256_mask:
2629 case X86::BI__builtin_ia32_vcmpbf16128_mask:
2630 IsMaskFCmp = true;
2631 [[fallthrough]];
2632 case X86::BI__builtin_ia32_cmpps:
2633 case X86::BI__builtin_ia32_cmpps256:
2634 case X86::BI__builtin_ia32_cmppd:
2635 case X86::BI__builtin_ia32_cmppd256: {
2636 // Lower vector comparisons to fcmp instructions, while ignoring the
2637 // requested signalling behaviour and the requested rounding mode.
2638 // This is only possible if the fp-model is not strict and FENV_ACCESS
2639 // is off.
2640
2641 // The third argument is the comparison condition, an integer in the
2642 // range [0, 31].
2643 unsigned CC = cast<llvm::ConstantInt>(Ops[2])->getZExtValue() & 0x1f;
2644
2645 // Lower to an IR fcmp instruction, ignoring the requested signaling
2646 // behaviour; e.g. both _CMP_GT_OS and _CMP_GT_OQ are translated to
2647 // FCMP_OGT.
2648 FCmpInst::Predicate Pred;
2649 bool IsSignaling;
2650 // Predicates for 16-31 repeat the 0-15 predicates. Only the signalling
2651 // behavior is inverted. We'll handle that after the switch.
2652 switch (CC & 0xf) {
2653 case 0x00: Pred = FCmpInst::FCMP_OEQ; IsSignaling = false; break;
2654 case 0x01: Pred = FCmpInst::FCMP_OLT; IsSignaling = true; break;
2655 case 0x02: Pred = FCmpInst::FCMP_OLE; IsSignaling = true; break;
2656 case 0x03: Pred = FCmpInst::FCMP_UNO; IsSignaling = false; break;
2657 case 0x04: Pred = FCmpInst::FCMP_UNE; IsSignaling = false; break;
2658 case 0x05: Pred = FCmpInst::FCMP_UGE; IsSignaling = true; break;
2659 case 0x06: Pred = FCmpInst::FCMP_UGT; IsSignaling = true; break;
2660 case 0x07: Pred = FCmpInst::FCMP_ORD; IsSignaling = false; break;
2661 case 0x08: Pred = FCmpInst::FCMP_UEQ; IsSignaling = false; break;
2662 case 0x09: Pred = FCmpInst::FCMP_ULT; IsSignaling = true; break;
2663 case 0x0a: Pred = FCmpInst::FCMP_ULE; IsSignaling = true; break;
2664 case 0x0b: Pred = FCmpInst::FCMP_FALSE; IsSignaling = false; break;
2665 case 0x0c: Pred = FCmpInst::FCMP_ONE; IsSignaling = false; break;
2666 case 0x0d: Pred = FCmpInst::FCMP_OGE; IsSignaling = true; break;
2667 case 0x0e: Pred = FCmpInst::FCMP_OGT; IsSignaling = true; break;
2668 case 0x0f: Pred = FCmpInst::FCMP_TRUE; IsSignaling = false; break;
2669 default: llvm_unreachable("Unhandled CC");
2670 }
2671
2672 // Invert the signalling behavior for 16-31.
2673 if (CC & 0x10)
2674 IsSignaling = !IsSignaling;
2675
2676 // If the predicate is true or false and we're using constrained intrinsics,
2677 // we don't have a compare intrinsic we can use. Just use the legacy X86
2678 // specific intrinsic.
2679 // If the intrinsic is mask enabled and we're using constrained intrinsics,
2680 // use the legacy X86 specific intrinsic.
2681 if (Builder.getIsFPConstrained() &&
2682 (Pred == FCmpInst::FCMP_TRUE || Pred == FCmpInst::FCMP_FALSE ||
2683 IsMaskFCmp)) {
2684
2685 Intrinsic::ID IID;
2686 switch (BuiltinID) {
2687 default: llvm_unreachable("Unexpected builtin");
2688 case X86::BI__builtin_ia32_cmpps:
2689 IID = Intrinsic::x86_sse_cmp_ps;
2690 break;
2691 case X86::BI__builtin_ia32_cmpps256:
2692 IID = Intrinsic::x86_avx_cmp_ps_256;
2693 break;
2694 case X86::BI__builtin_ia32_cmppd:
2695 IID = Intrinsic::x86_sse2_cmp_pd;
2696 break;
2697 case X86::BI__builtin_ia32_cmppd256:
2698 IID = Intrinsic::x86_avx_cmp_pd_256;
2699 break;
2700 case X86::BI__builtin_ia32_cmpph128_mask:
2701 IID = Intrinsic::x86_avx512fp16_mask_cmp_ph_128;
2702 break;
2703 case X86::BI__builtin_ia32_cmpph256_mask:
2704 IID = Intrinsic::x86_avx512fp16_mask_cmp_ph_256;
2705 break;
2706 case X86::BI__builtin_ia32_cmpph512_mask:
2707 IID = Intrinsic::x86_avx512fp16_mask_cmp_ph_512;
2708 break;
2709 case X86::BI__builtin_ia32_cmpps512_mask:
2710 IID = Intrinsic::x86_avx512_mask_cmp_ps_512;
2711 break;
2712 case X86::BI__builtin_ia32_cmppd512_mask:
2713 IID = Intrinsic::x86_avx512_mask_cmp_pd_512;
2714 break;
2715 case X86::BI__builtin_ia32_cmpps128_mask:
2716 IID = Intrinsic::x86_avx512_mask_cmp_ps_128;
2717 break;
2718 case X86::BI__builtin_ia32_cmpps256_mask:
2719 IID = Intrinsic::x86_avx512_mask_cmp_ps_256;
2720 break;
2721 case X86::BI__builtin_ia32_cmppd128_mask:
2722 IID = Intrinsic::x86_avx512_mask_cmp_pd_128;
2723 break;
2724 case X86::BI__builtin_ia32_cmppd256_mask:
2725 IID = Intrinsic::x86_avx512_mask_cmp_pd_256;
2726 break;
2727 }
2728
2729 Function *Intr = CGM.getIntrinsic(IID);
2730 if (IsMaskFCmp) {
2731 unsigned NumElts =
2732 cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements();
2733 Ops[3] = getMaskVecValue(*this, Ops[3], NumElts);
2734 Value *Cmp = Builder.CreateCall(Intr, Ops);
2735 return EmitX86MaskedCompareResult(*this, Cmp, NumElts, nullptr);
2736 }
2737
2738 return Builder.CreateCall(Intr, Ops);
2739 }
2740
2741 // Builtins without the _mask suffix return a vector of integers of the
2742 // same width as the input vectors; the _mask builtins return an integer mask.
2743 if (IsMaskFCmp) {
2744 // We ignore SAE if strict FP is disabled. We only keep precise
2745 // exception behavior under strict FP.
2746 // NOTE: If strict FP ever does go through here, a CGFPOptionsRAII
2747 // object will be required.
2748 unsigned NumElts =
2749 cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements();
2750 Value *Cmp;
2751 if (IsSignaling)
2752 Cmp = Builder.CreateFCmpS(Pred, Ops[0], Ops[1]);
2753 else
2754 Cmp = Builder.CreateFCmp(Pred, Ops[0], Ops[1]);
2755 return EmitX86MaskedCompareResult(*this, Cmp, NumElts, Ops[3]);
2756 }
2757
2758 return getVectorFCmpIR(Pred, IsSignaling);
2759 }
2760
2761 // SSE scalar comparison intrinsics
2762 case X86::BI__builtin_ia32_cmpeqss:
2763 return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 0);
2764 case X86::BI__builtin_ia32_cmpltss:
2765 return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 1);
2766 case X86::BI__builtin_ia32_cmpless:
2767 return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 2);
2768 case X86::BI__builtin_ia32_cmpunordss:
2769 return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 3);
2770 case X86::BI__builtin_ia32_cmpneqss:
2771 return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 4);
2772 case X86::BI__builtin_ia32_cmpnltss:
2773 return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 5);
2774 case X86::BI__builtin_ia32_cmpnless:
2775 return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 6);
2776 case X86::BI__builtin_ia32_cmpordss:
2777 return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 7);
2778 case X86::BI__builtin_ia32_cmpeqsd:
2779 return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 0);
2780 case X86::BI__builtin_ia32_cmpltsd:
2781 return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 1);
2782 case X86::BI__builtin_ia32_cmplesd:
2783 return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 2);
2784 case X86::BI__builtin_ia32_cmpunordsd:
2785 return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 3);
2786 case X86::BI__builtin_ia32_cmpneqsd:
2787 return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 4);
2788 case X86::BI__builtin_ia32_cmpnltsd:
2789 return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 5);
2790 case X86::BI__builtin_ia32_cmpnlesd:
2791 return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 6);
2792 case X86::BI__builtin_ia32_cmpordsd:
2793 return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 7);
2794
2795 // f16c half2float intrinsics
2796 case X86::BI__builtin_ia32_vcvtph2ps_mask:
2797 case X86::BI__builtin_ia32_vcvtph2ps256_mask:
2798 case X86::BI__builtin_ia32_vcvtph2ps512_mask: {
2799 CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
2800 return EmitX86CvtF16ToFloatExpr(*this, Ops, ConvertType(E->getType()));
2801 }
2802
2803 // AVX512 bf16 intrinsics
2804 case X86::BI__builtin_ia32_cvtneps2bf16_128_mask: {
2805 Ops[2] = getMaskVecValue(
2806 *this, Ops[2],
2807 cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements());
2808 Intrinsic::ID IID = Intrinsic::x86_avx512bf16_mask_cvtneps2bf16_128;
2809 return Builder.CreateCall(CGM.getIntrinsic(IID), Ops);
2810 }
2811 case X86::BI__builtin_ia32_cvtsbf162ss_32:
2812 return Builder.CreateFPExt(Ops[0], Builder.getFloatTy());
2813
2814 case X86::BI__builtin_ia32_cvtneps2bf16_256_mask:
2815 case X86::BI__builtin_ia32_cvtneps2bf16_512_mask: {
2816 Intrinsic::ID IID;
2817 switch (BuiltinID) {
2818 default: llvm_unreachable("Unsupported intrinsic!");
2819 case X86::BI__builtin_ia32_cvtneps2bf16_256_mask:
2820 IID = Intrinsic::x86_avx512bf16_cvtneps2bf16_256;
2821 break;
2822 case X86::BI__builtin_ia32_cvtneps2bf16_512_mask:
2823 IID = Intrinsic::x86_avx512bf16_cvtneps2bf16_512;
2824 break;
2825 }
2826 Value *Res = Builder.CreateCall(CGM.getIntrinsic(IID), Ops[0]);
2827 return EmitX86Select(*this, Ops[2], Res, Ops[1]);
2828 }
2829
2830 case X86::BI__cpuid:
2831 case X86::BI__cpuidex: {
2832 Value *FuncId = EmitScalarExpr(E->getArg(1));
2833 Value *SubFuncId = BuiltinID == X86::BI__cpuidex
2834 ? EmitScalarExpr(E->getArg(2))
2835 : llvm::ConstantInt::get(Int32Ty, 0);
2836
2837 llvm::StructType *CpuidRetTy =
2838 llvm::StructType::get(Int32Ty, Int32Ty, Int32Ty, Int32Ty);
2839 llvm::FunctionType *FTy =
2840 llvm::FunctionType::get(CpuidRetTy, {Int32Ty, Int32Ty}, false);
2841
2842 StringRef Asm, Constraints;
2843 if (getTarget().getTriple().getArch() == llvm::Triple::x86) {
2844 Asm = "cpuid";
2845 Constraints = "={ax},={bx},={cx},={dx},{ax},{cx}";
2846 } else {
2847 // x86-64 uses %rbx as the base register, so preserve it.
2848 Asm = "xchgq %rbx, ${1:q}\n"
2849 "cpuid\n"
2850 "xchgq %rbx, ${1:q}";
2851 Constraints = "={ax},=r,={cx},={dx},0,2";
2852 }
2853
2854 llvm::InlineAsm *IA = llvm::InlineAsm::get(FTy, Asm, Constraints,
2855 /*hasSideEffects=*/false);
2856 Value *IACall = Builder.CreateCall(IA, {FuncId, SubFuncId});
2857 Value *BasePtr = EmitScalarExpr(E->getArg(0));
2858 Value *Store = nullptr;
2859 for (unsigned i = 0; i < 4; i++) {
2860 Value *Extracted = Builder.CreateExtractValue(IACall, i);
2861 Value *StorePtr = Builder.CreateConstInBoundsGEP1_32(Int32Ty, BasePtr, i);
2862 Store = Builder.CreateAlignedStore(Extracted, StorePtr, getIntAlign());
2863 }
2864
2865 // Return the last store instruction to signal that we have emitted the
2866 // intrinsic.
2867 return Store;
2868 }
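 // Sketch (illustrative, not emitted verbatim): for example, `int Info[4];
 // __cpuidex(Info, 7, 0);` on x86-64 produces a single inline-asm call
 // returning { i32, i32, i32, i32 } for eax/ebx/ecx/edx, followed by four
 // i32 stores into Info[0..3]; the xchgq wrapper keeps %rbx intact around
 // the cpuid instruction.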
2869
2870 case X86::BI__emul:
2871 case X86::BI__emulu: {
2872 llvm::Type *Int64Ty = llvm::IntegerType::get(getLLVMContext(), 64);
2873 bool isSigned = (BuiltinID == X86::BI__emul);
2874 Value *LHS = Builder.CreateIntCast(Ops[0], Int64Ty, isSigned);
2875 Value *RHS = Builder.CreateIntCast(Ops[1], Int64Ty, isSigned);
2876 return Builder.CreateMul(LHS, RHS, "", !isSigned, isSigned);
2877 }
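 // Sketch: `__emul(a, b)` becomes `mul nsw i64 (sext a), (sext b)` and
 // `__emulu(a, b)` becomes `mul nuw i64 (zext a), (zext b)`; a 32x32->64
 // multiply cannot overflow 64 bits, so the wrap flags are safe to add.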
2878 case X86::BI__mulh:
2879 case X86::BI__umulh:
2880 case X86::BI_mul128:
2881 case X86::BI_umul128: {
2882 llvm::Type *ResType = ConvertType(E->getType());
2883 llvm::Type *Int128Ty = llvm::IntegerType::get(getLLVMContext(), 128);
2884
2885 bool IsSigned = (BuiltinID == X86::BI__mulh || BuiltinID == X86::BI_mul128);
2886 Value *LHS = Builder.CreateIntCast(Ops[0], Int128Ty, IsSigned);
2887 Value *RHS = Builder.CreateIntCast(Ops[1], Int128Ty, IsSigned);
2888
2889 Value *MulResult, *HigherBits;
2890 if (IsSigned) {
2891 MulResult = Builder.CreateNSWMul(LHS, RHS);
2892 HigherBits = Builder.CreateAShr(MulResult, 64);
2893 } else {
2894 MulResult = Builder.CreateNUWMul(LHS, RHS);
2895 HigherBits = Builder.CreateLShr(MulResult, 64);
2896 }
2897 HigherBits = Builder.CreateIntCast(HigherBits, ResType, IsSigned);
2898
2899 if (BuiltinID == X86::BI__mulh || BuiltinID == X86::BI__umulh)
2900 return HigherBits;
2901
2902 Address HighBitsAddress = EmitPointerWithAlignment(E->getArg(2));
2903 Builder.CreateStore(HigherBits, HighBitsAddress);
2904 return Builder.CreateIntCast(MulResult, ResType, IsSigned);
2905 }
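 // Sketch of the unsigned case: `_umul128(a, b, &hi)` is lowered roughly as
 //   %p  = mul nuw i128 (zext %a), (zext %b)
 //   %hi = lshr i128 %p, 64      ; truncated to i64 and stored through &hi
 //   %lo = trunc i128 %p to i64  ; returned
 // __mulh/__umulh return only the high half; the signed variants use
 // sext, mul nsw and ashr instead.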
2906
2907 case X86::BI__faststorefence: {
2908 return Builder.CreateFence(llvm::AtomicOrdering::SequentiallyConsistent,
2909 llvm::SyncScope::System);
2910 }
2911 case X86::BI__shiftleft128:
2912 case X86::BI__shiftright128: {
2913 llvm::Function *F = CGM.getIntrinsic(
2914 BuiltinID == X86::BI__shiftleft128 ? Intrinsic::fshl : Intrinsic::fshr,
2915 Int64Ty);
2916 // Flip low/high ops and zero-extend amount to matching type.
2917 // shiftleft128(Low, High, Amt) -> fshl(High, Low, Amt)
2918 // shiftright128(Low, High, Amt) -> fshr(High, Low, Amt)
2919 std::swap(Ops[0], Ops[1]);
2920 Ops[2] = Builder.CreateZExt(Ops[2], Int64Ty);
2921 return Builder.CreateCall(F, Ops);
2922 }
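 // Sketch: `__shiftleft128(lo, hi, n)` yields the high 64 bits of the
 // 128-bit value (hi:lo) shifted left, i.e. @llvm.fshl.i64(hi, lo, n);
 // the funnel-shift intrinsic reduces the amount modulo 64, and these
 // typically lower to shld/shrd on x86-64.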
2923 case X86::BI_ReadWriteBarrier:
2924 case X86::BI_ReadBarrier:
2925 case X86::BI_WriteBarrier: {
2926 return Builder.CreateFence(llvm::AtomicOrdering::SequentiallyConsistent,
2927 llvm::SyncScope::SingleThread);
2928 }
2929
2930 case X86::BI_AddressOfReturnAddress: {
2931 Function *F =
2932 CGM.getIntrinsic(Intrinsic::addressofreturnaddress, AllocaInt8PtrTy);
2933 return Builder.CreateCall(F);
2934 }
2935 case X86::BI__stosb: {
2936 // We treat __stosb as a volatile memset: it may not generate a "rep stosb"
2937 // instruction, but it will create a memset that won't be optimized away.
2938 return Builder.CreateMemSet(Ops[0], Ops[1], Ops[2], Align(1), true);
2939 }
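 // Sketch: on a 64-bit target `__stosb(dst, val, n)` becomes roughly
 //   call void @llvm.memset.p0.i64(ptr %dst, i8 %val, i64 %n, i1 true)
 // with the volatile flag set, so the fill is kept even when the optimizer
 // could otherwise remove or shrink it.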
2940 // Corresponds to intrinsics that return two tiles (tile0_tile1).
2941 case X86::BI__builtin_ia32_t2rpntlvwz0_internal:
2942 case X86::BI__builtin_ia32_t2rpntlvwz0rs_internal:
2943 case X86::BI__builtin_ia32_t2rpntlvwz0t1_internal:
2944 case X86::BI__builtin_ia32_t2rpntlvwz0rst1_internal:
2945 case X86::BI__builtin_ia32_t2rpntlvwz1_internal:
2946 case X86::BI__builtin_ia32_t2rpntlvwz1rs_internal:
2947 case X86::BI__builtin_ia32_t2rpntlvwz1t1_internal:
2948 case X86::BI__builtin_ia32_t2rpntlvwz1rst1_internal: {
2949 Intrinsic::ID IID;
2950 switch (BuiltinID) {
2951 default:
2952 llvm_unreachable("Unsupported intrinsic!");
2953 case X86::BI__builtin_ia32_t2rpntlvwz0_internal:
2954 IID = Intrinsic::x86_t2rpntlvwz0_internal;
2955 break;
2956 case X86::BI__builtin_ia32_t2rpntlvwz0rs_internal:
2957 IID = Intrinsic::x86_t2rpntlvwz0rs_internal;
2958 break;
2959 case X86::BI__builtin_ia32_t2rpntlvwz0t1_internal:
2960 IID = Intrinsic::x86_t2rpntlvwz0t1_internal;
2961 break;
2962 case X86::BI__builtin_ia32_t2rpntlvwz0rst1_internal:
2963 IID = Intrinsic::x86_t2rpntlvwz0rst1_internal;
2964 break;
2965 case X86::BI__builtin_ia32_t2rpntlvwz1_internal:
2966 IID = Intrinsic::x86_t2rpntlvwz1_internal;
2967 break;
2968 case X86::BI__builtin_ia32_t2rpntlvwz1rs_internal:
2969 IID = Intrinsic::x86_t2rpntlvwz1rs_internal;
2970 break;
2971 case X86::BI__builtin_ia32_t2rpntlvwz1t1_internal:
2972 IID = Intrinsic::x86_t2rpntlvwz1t1_internal;
2973 break;
2974 case X86::BI__builtin_ia32_t2rpntlvwz1rst1_internal:
2975 IID = Intrinsic::x86_t2rpntlvwz1rst1_internal;
2976 break;
2977 }
2978
2979 // Ops = (Row0, Col0, Col1, DstPtr0, DstPtr1, SrcPtr, Stride)
2980 Value *Call = Builder.CreateCall(CGM.getIntrinsic(IID),
2981 {Ops[0], Ops[1], Ops[2], Ops[5], Ops[6]});
2982
2983 auto *PtrTy = E->getArg(3)->getType()->getAs<PointerType>();
2984 assert(PtrTy && "arg3 must be of pointer type");
2985 QualType PtreeTy = PtrTy->getPointeeType();
2986 llvm::Type *TyPtee = ConvertType(PtreeTy);
2987
2988 // Bitcast the AMX type (x86_amx) to a vector type (256 x i32),
2989 // then store tile0 into DstPtr0.
2990 Value *T0 = Builder.CreateExtractValue(Call, 0);
2991 Value *VecT0 = Builder.CreateIntrinsic(Intrinsic::x86_cast_tile_to_vector,
2992 {TyPtee}, {T0});
2993 Builder.CreateDefaultAlignedStore(VecT0, Ops[3]);
2994
2995 // Then store tile1 into DstPtr1
2996 Value *T1 = Builder.CreateExtractValue(Call, 1);
2997 Value *VecT1 = Builder.CreateIntrinsic(Intrinsic::x86_cast_tile_to_vector,
2998 {TyPtee}, {T1});
2999 Value *Store = Builder.CreateDefaultAlignedStore(VecT1, Ops[4]);
3000
3001 // Note: we deliberately avoid using x86_tilestored64_internal to store
3002 // the results here, because it cannot bound the memory region being
3003 // written. That could force shape reloads after the first AMX intrinsic,
3004 // which the current AMX register allocation cannot handle.
3005
3006 return Store;
3007 }
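 // Flow sketch: the intrinsic returns a pair of x86_amx tiles; each one is
 // converted with @llvm.x86.cast.tile.to.vector to the pointee vector type
 // (e.g. <256 x i32> for a _tile1024i destination) and written through
 // DstPtr0/DstPtr1 as an ordinary vector store.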
3008 case X86::BI__ud2:
3009 // llvm.trap lowers to a ud2a instruction on x86.
3010 return EmitTrapCall(Intrinsic::trap);
3011 case X86::BI__int2c: {
3012 // This syscall signals a driver assertion failure in x86 NT kernels.
3013 llvm::FunctionType *FTy = llvm::FunctionType::get(VoidTy, false);
3014 llvm::InlineAsm *IA =
3015 llvm::InlineAsm::get(FTy, "int $$0x2c", "", /*hasSideEffects=*/true);
3016 llvm::AttributeList NoReturnAttr = llvm::AttributeList::get(
3017 getLLVMContext(), llvm::AttributeList::FunctionIndex,
3018 llvm::Attribute::NoReturn);
3019 llvm::CallInst *CI = Builder.CreateCall(IA);
3020 CI->setAttributes(NoReturnAttr);
3021 return CI;
3022 }
3023 case X86::BI__readfsbyte:
3024 case X86::BI__readfsword:
3025 case X86::BI__readfsdword:
3026 case X86::BI__readfsqword: {
3027 llvm::Type *IntTy = ConvertType(E->getType());
3028 Value *Ptr = Builder.CreateIntToPtr(
3029 Ops[0], llvm::PointerType::get(getLLVMContext(), 257));
3030 LoadInst *Load = Builder.CreateAlignedLoad(
3031 IntTy, Ptr, getContext().getTypeAlignInChars(E->getType()));
3032 Load->setVolatile(true);
3033 return Load;
3034 }
3035 case X86::BI__readgsbyte:
3036 case X86::BI__readgsword:
3037 case X86::BI__readgsdword:
3038 case X86::BI__readgsqword: {
3039 llvm::Type *IntTy = ConvertType(E->getType());
3040 Value *Ptr = Builder.CreateIntToPtr(
3041 Ops[0], llvm::PointerType::get(getLLVMContext(), 256));
3042 LoadInst *Load = Builder.CreateAlignedLoad(
3043 IntTy, Ptr, getContext().getTypeAlignInChars(E->getType()));
3044 Load->setVolatile(true);
3045 return Load;
3046 }
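 // Note: address space 257 (above) is the x86 FS segment and 256 is GS, so
 // e.g. `__readgsqword(0x60)` becomes an inttoptr of the offset to a
 // ptr addrspace(256) followed by a volatile load of the requested width.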
3047 case X86::BI__builtin_ia32_encodekey128_u32: {
3048 Intrinsic::ID IID = Intrinsic::x86_encodekey128;
3049
3050 Value *Call = Builder.CreateCall(CGM.getIntrinsic(IID), {Ops[0], Ops[1]});
3051
3052 for (int i = 0; i < 3; ++i) {
3053 Value *Extract = Builder.CreateExtractValue(Call, i + 1);
3054 Value *Ptr = Builder.CreateConstGEP1_32(Int8Ty, Ops[2], i * 16);
3055 Builder.CreateAlignedStore(Extract, Ptr, Align(1));
3056 }
3057
3058 return Builder.CreateExtractValue(Call, 0);
3059 }
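 // Layout sketch: result element 0 (an i32) is returned to the caller,
 // while elements 1..3 hold the three 128-bit pieces of the key handle and
 // are stored byte-aligned at offsets 0, 16 and 32 of the handle buffer.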
3060 case X86::BI__builtin_ia32_encodekey256_u32: {
3061 Intrinsic::ID IID = Intrinsic::x86_encodekey256;
3062
3063 Value *Call =
3064 Builder.CreateCall(CGM.getIntrinsic(IID), {Ops[0], Ops[1], Ops[2]});
3065
3066 for (int i = 0; i < 4; ++i) {
3067 Value *Extract = Builder.CreateExtractValue(Call, i + 1);
3068 Value *Ptr = Builder.CreateConstGEP1_32(Int8Ty, Ops[3], i * 16);
3069 Builder.CreateAlignedStore(Extract, Ptr, Align(1));
3070 }
3071
3072 return Builder.CreateExtractValue(Call, 0);
3073 }
3074 case X86::BI__builtin_ia32_aesenc128kl_u8:
3075 case X86::BI__builtin_ia32_aesdec128kl_u8:
3076 case X86::BI__builtin_ia32_aesenc256kl_u8:
3077 case X86::BI__builtin_ia32_aesdec256kl_u8: {
3078 Intrinsic::ID IID;
3079 StringRef BlockName;
3080 switch (BuiltinID) {
3081 default:
3082 llvm_unreachable("Unexpected builtin");
3083 case X86::BI__builtin_ia32_aesenc128kl_u8:
3084 IID = Intrinsic::x86_aesenc128kl;
3085 BlockName = "aesenc128kl";
3086 break;
3087 case X86::BI__builtin_ia32_aesdec128kl_u8:
3088 IID = Intrinsic::x86_aesdec128kl;
3089 BlockName = "aesdec128kl";
3090 break;
3091 case X86::BI__builtin_ia32_aesenc256kl_u8:
3092 IID = Intrinsic::x86_aesenc256kl;
3093 BlockName = "aesenc256kl";
3094 break;
3095 case X86::BI__builtin_ia32_aesdec256kl_u8:
3096 IID = Intrinsic::x86_aesdec256kl;
3097 BlockName = "aesdec256kl";
3098 break;
3099 }
3100
3101 Value *Call = Builder.CreateCall(CGM.getIntrinsic(IID), {Ops[1], Ops[2]});
3102
3103 BasicBlock *NoError =
3104 createBasicBlock(BlockName + "_no_error", this->CurFn);
3105 BasicBlock *Error = createBasicBlock(BlockName + "_error", this->CurFn);
3106 BasicBlock *End = createBasicBlock(BlockName + "_end", this->CurFn);
3107
3108 Value *Ret = Builder.CreateExtractValue(Call, 0);
3109 Value *Succ = Builder.CreateTrunc(Ret, Builder.getInt1Ty());
3110 Value *Out = Builder.CreateExtractValue(Call, 1);
3111 Builder.CreateCondBr(Succ, NoError, Error);
3112
3113 Builder.SetInsertPoint(NoError);
3114 Builder.CreateDefaultAlignedStore(Out, Ops[0]);
3115 Builder.CreateBr(End);
3116
3117 Builder.SetInsertPoint(Error);
3118 Constant *Zero = llvm::Constant::getNullValue(Out->getType());
3119 Builder.CreateDefaultAlignedStore(Zero, Ops[0]);
3120 Builder.CreateBr(End);
3121
3122 Builder.SetInsertPoint(End);
3123 return Builder.CreateExtractValue(Call, 0);
3124 }
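 // Control-flow sketch: the intrinsic returns a pair (flag, block). Bit 0
 // of the flag selects <name>_no_error, which stores the processed block
 // to Ops[0], or <name>_error, which stores zero instead; both fall
 // through to <name>_end, and the flag is returned as the builtin's u8
 // result.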
3125 case X86::BI__builtin_ia32_aesencwide128kl_u8:
3126 case X86::BI__builtin_ia32_aesdecwide128kl_u8:
3127 case X86::BI__builtin_ia32_aesencwide256kl_u8:
3128 case X86::BI__builtin_ia32_aesdecwide256kl_u8: {
3129 Intrinsic::ID IID;
3130 StringRef BlockName;
3131 switch (BuiltinID) {
3132 case X86::BI__builtin_ia32_aesencwide128kl_u8:
3133 IID = Intrinsic::x86_aesencwide128kl;
3134 BlockName = "aesencwide128kl";
3135 break;
3136 case X86::BI__builtin_ia32_aesdecwide128kl_u8:
3137 IID = Intrinsic::x86_aesdecwide128kl;
3138 BlockName = "aesdecwide128kl";
3139 break;
3140 case X86::BI__builtin_ia32_aesencwide256kl_u8:
3141 IID = Intrinsic::x86_aesencwide256kl;
3142 BlockName = "aesencwide256kl";
3143 break;
3144 case X86::BI__builtin_ia32_aesdecwide256kl_u8:
3145 IID = Intrinsic::x86_aesdecwide256kl;
3146 BlockName = "aesdecwide256kl";
3147 break;
3148 }
3149
3150 llvm::Type *Ty = FixedVectorType::get(Builder.getInt64Ty(), 2);
3151 Value *InOps[9];
3152 InOps[0] = Ops[2];
3153 for (int i = 0; i != 8; ++i) {
3154 Value *Ptr = Builder.CreateConstGEP1_32(Ty, Ops[1], i);
3155 InOps[i + 1] = Builder.CreateAlignedLoad(Ty, Ptr, Align(16));
3156 }
3157
3158 Value *Call = Builder.CreateCall(CGM.getIntrinsic(IID), InOps);
3159
3160 BasicBlock *NoError =
3161 createBasicBlock(BlockName + "_no_error", this->CurFn);
3162 BasicBlock *Error = createBasicBlock(BlockName + "_error", this->CurFn);
3163 BasicBlock *End = createBasicBlock(BlockName + "_end", this->CurFn);
3164
3165 Value *Ret = Builder.CreateExtractValue(Call, 0);
3166 Value *Succ = Builder.CreateTrunc(Ret, Builder.getInt1Ty());
3167 Builder.CreateCondBr(Succ, NoError, Error);
3168
3169 Builder.SetInsertPoint(NoError);
3170 for (int i = 0; i != 8; ++i) {
3171 Value *Extract = Builder.CreateExtractValue(Call, i + 1);
3172 Value *Ptr = Builder.CreateConstGEP1_32(Extract->getType(), Ops[0], i);
3173 Builder.CreateAlignedStore(Extract, Ptr, Align(16));
3174 }
3175 Builder.CreateBr(End);
3176
3177 Builder.SetInsertPoint(Error);
3178 for (int i = 0; i != 8; ++i) {
3179 Value *Out = Builder.CreateExtractValue(Call, i + 1);
3180 Constant *Zero = llvm::Constant::getNullValue(Out->getType());
3181 Value *Ptr = Builder.CreateConstGEP1_32(Out->getType(), Ops[0], i);
3182 Builder.CreateAlignedStore(Zero, Ptr, Align(16));
3183 }
3184 Builder.CreateBr(End);
3185
3186 Builder.SetInsertPoint(End);
3187 return Builder.CreateExtractValue(Call, 0);
3188 }
3189 case X86::BI__builtin_ia32_vfcmaddcph512_mask:
3190 IsConjFMA = true;
3191 [[fallthrough]];
3192 case X86::BI__builtin_ia32_vfmaddcph512_mask: {
3193 Intrinsic::ID IID = IsConjFMA
3194 ? Intrinsic::x86_avx512fp16_mask_vfcmadd_cph_512
3195 : Intrinsic::x86_avx512fp16_mask_vfmadd_cph_512;
3196 Value *Call = Builder.CreateCall(CGM.getIntrinsic(IID), Ops);
3197 return EmitX86Select(*this, Ops[3], Call, Ops[0]);
3198 }
3199 case X86::BI__builtin_ia32_vfcmaddcsh_round_mask:
3200 IsConjFMA = true;
3201 [[fallthrough]];
3202 case X86::BI__builtin_ia32_vfmaddcsh_round_mask: {
3203 Intrinsic::ID IID = IsConjFMA ? Intrinsic::x86_avx512fp16_mask_vfcmadd_csh
3204 : Intrinsic::x86_avx512fp16_mask_vfmadd_csh;
3205 Value *Call = Builder.CreateCall(CGM.getIntrinsic(IID), Ops);
3206 Value *And = Builder.CreateAnd(Ops[3], llvm::ConstantInt::get(Int8Ty, 1));
3207 return EmitX86Select(*this, And, Call, Ops[0]);
3208 }
3209 case X86::BI__builtin_ia32_vfcmaddcsh_round_mask3:
3210 IsConjFMA = true;
3211 [[fallthrough]];
3212 case X86::BI__builtin_ia32_vfmaddcsh_round_mask3: {
3213 Intrinsic::ID IID = IsConjFMA ? Intrinsic::x86_avx512fp16_mask_vfcmadd_csh
3214 : Intrinsic::x86_avx512fp16_mask_vfmadd_csh;
3215 Value *Call = Builder.CreateCall(CGM.getIntrinsic(IID), Ops);
3216 static constexpr int Mask[] = {0, 5, 6, 7};
3217 return Builder.CreateShuffleVector(Call, Ops[2], Mask);
3218 }
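 // Shuffle sketch: with two 4-element vector operands, mask {0, 5, 6, 7}
 // takes element 0 (the computed scalar) from the intrinsic call and
 // elements 1..3 from Ops[2], i.e. the *_mask3 forms blend the scalar
 // result into the third operand's upper elements.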
3219 case X86::BI__builtin_ia32_prefetchi:
3220 return Builder.CreateCall(
3221 CGM.getIntrinsic(Intrinsic::prefetch, Ops[0]->getType()),
3222 {Ops[0], llvm::ConstantInt::get(Int32Ty, 0), Ops[1],
3223 llvm::ConstantInt::get(Int32Ty, 0)});
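 // Note: in @llvm.prefetch the second operand 0 marks a read prefetch and
 // the trailing 0 selects the instruction cache (1 would be the data
 // cache), so the builtin's locality hint (Ops[1]) is the only
 // caller-controlled field.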
3224 }
3225}