TARGET_BUILTIN(__builtin_ia32_storehps, "vV2i*V4f", "", "sse")
TARGET_BUILTIN(__builtin_ia32_storelps, "vV2i*V4f", "", "sse")
TARGET_BUILTIN(__builtin_ia32_movmskps, "iV4f", "", "sse")
-TARGET_BUILTIN(__builtin_ia32_movntps, "vf*V4f", "", "sse")
TARGET_BUILTIN(__builtin_ia32_sfence, "v", "", "sse")
TARGET_BUILTIN(__builtin_ia32_rcpps, "V4fV4f", "", "sse")
TARGET_BUILTIN(__builtin_ia32_rcpss, "V4fV4f", "", "sse")
TARGET_BUILTIN(__builtin_ia32_pmovmskb128, "iV16c", "", "sse2")
TARGET_BUILTIN(__builtin_ia32_movnti, "vi*i", "", "sse2")
TARGET_BUILTIN(__builtin_ia32_movnti64, "vLLi*LLi", "", "sse2")
-TARGET_BUILTIN(__builtin_ia32_movntpd, "vd*V2d", "", "sse2")
-TARGET_BUILTIN(__builtin_ia32_movntdq, "vV2LLi*V2LLi", "", "sse2")
TARGET_BUILTIN(__builtin_ia32_psadbw128, "V2LLiV16cV16c", "", "sse2")
TARGET_BUILTIN(__builtin_ia32_sqrtpd, "V2dV2d", "", "sse2")
TARGET_BUILTIN(__builtin_ia32_sqrtsd, "V2dV2d", "", "sse2")
TARGET_BUILTIN(__builtin_ia32_vbroadcastf128_pd256, "V4dV2dC*", "", "avx")
TARGET_BUILTIN(__builtin_ia32_vbroadcastf128_ps256, "V8fV4fC*", "", "avx")
TARGET_BUILTIN(__builtin_ia32_lddqu256, "V32ccC*", "", "avx")
-TARGET_BUILTIN(__builtin_ia32_movntdq256, "vV4LLi*V4LLi", "", "avx")
-TARGET_BUILTIN(__builtin_ia32_movntpd256, "vd*V4d", "", "avx")
-TARGET_BUILTIN(__builtin_ia32_movntps256, "vf*V8f", "", "avx")
TARGET_BUILTIN(__builtin_ia32_maskloadpd, "V2dV2dC*V2LLi", "", "avx")
TARGET_BUILTIN(__builtin_ia32_maskloadps, "V4fV4fC*V4i", "", "avx")
TARGET_BUILTIN(__builtin_ia32_maskloadpd256, "V4dV4dC*V4LLi", "", "avx")
TARGET_BUILTIN(__builtin_ia32_kunpckhi, "UsUsUs","","avx512f")
TARGET_BUILTIN(__builtin_ia32_kxnorhi, "UsUsUs","","avx512f")
TARGET_BUILTIN(__builtin_ia32_kxorhi, "UsUsUs","","avx512f")
-TARGET_BUILTIN(__builtin_ia32_movntdq512, "vV8LLi*V8LLi","","avx512f")
TARGET_BUILTIN(__builtin_ia32_movntdqa512, "V8LLiV8LLi*","","avx512f")
-TARGET_BUILTIN(__builtin_ia32_movntpd512, "vd*V8d","","avx512f")
-TARGET_BUILTIN(__builtin_ia32_movntps512, "vf*V16f","","avx512f")
TARGET_BUILTIN(__builtin_ia32_palignr512_mask, "V64cV64cV64cIiV64cULLi","","avx512bw")
TARGET_BUILTIN(__builtin_ia32_palignr128_mask, "V16cV16cV16cIiV16cUs","","avx512bw,avx512vl")
TARGET_BUILTIN(__builtin_ia32_palignr256_mask, "V32cV32cV32cIiV32cUi","","avx512bw,avx512vl")
// little-Endian, the high bits in big-Endian. Therefore, on big-Endian
// we need to shift the high bits down to the low before truncating.
Width >>= 1;
    if (CGF.getTarget().isBigEndian()) {
      Value *ShiftCst = llvm::ConstantInt::get(IntTy, Width);
      V = CGF.Builder.CreateLShr(V, ShiftCst);
    }
    // We are truncating value in order to extract the higher-order
    // double, which we will be using to extract the sign from.
    IntTy = llvm::IntegerType::get(C, Width);
    V = CGF.Builder.CreateTrunc(V, IntTy);
}
Value *Zero = llvm::Constant::getNullValue(IntTy);
return CGF.Builder.CreateICmpSLT(V, Zero);
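// Illustration only (not part of the patch): a C sketch of the lowering above,
// assuming a target whose long double is 128 bits wide; the helper name and the
// memcpy stand-in for the i128 bitcast are illustrative assumptions.
#include <stdint.h>
#include <string.h>

static int signbit_f128_sketch(long double x) {
  unsigned __int128 bits;                 // stands in for the bitcast to i128
  memcpy(&bits, &x, sizeof(bits));
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
  bits >>= 64;                            // shift the sign-carrying half down
#endif
  return (int64_t)(uint64_t)bits < 0;     // truncate to i64, then icmp slt 0
}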
case Builtin::BI__builtin_smull_overflow:
case Builtin::BI__builtin_smulll_overflow:
IntrinsicId = llvm::Intrinsic::smul_with_overflow;
    break;
  }

  llvm::Value *Carry;
  llvm::Value *Sum = EmitOverflowIntrinsic(*this, IntrinsicId, X, Y, Carry);
  Builder.CreateStore(Sum, SumOutPtr);
return RValue::get(Carry);
}
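// Illustration only (not part of the patch): what this lowering handles at the
// source level; __builtin_smull_overflow goes through llvm.smul.with.overflow,
// which yields both the product and the overflow flag.
#include <stdbool.h>

bool mul_checked(long a, long b, long *product) {
  // Returns true if a * b overflowed; the (possibly wrapped) product is
  // written to *product either way.
  return __builtin_smull_overflow(a, b, product);
}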
llvm::Type *ResTy, unsigned IntID,
const char *Name) {
SmallVector<Value *, 2> TblOps;
  if (ExtOp)
    TblOps.push_back(ExtOp);

  // Build a vector containing sequential number like (0, 1, 2, ..., 15)
  SmallVector<uint32_t, 16> Indices;
  llvm::VectorType *TblTy = cast<llvm::VectorType>(Ops[0]->getType());
  for (unsigned i = 0, e = TblTy->getNumElements(); i != e; ++i) {
Indices.push_back(2*i);
Indices.push_back(2*i+1);
}
ZeroTbl, Indices, Name));
}
  Function *TblF;
  TblOps.push_back(IndexOp);
  TblF = CGF.CGM.getIntrinsic(IntID, ResTy);

  return CGF.EmitNeonCall(TblF, TblOps, Name);
}

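// Illustration only (not part of the patch): the sequential indices built above
// (0, 1, 2, ..., 15) make the shuffle a plain concatenation of two table
// registers (or of a register and a zero vector); with generic vectors the same
// trick looks like this (the typedef names are illustrative).
typedef unsigned char u8x8  __attribute__((vector_size(8)));
typedef unsigned char u8x16 __attribute__((vector_size(16)));

static u8x16 concat_u8x8(u8x8 lo, u8x8 hi) {
  // Indices 0..15 select every lane of both operands in order: concat(lo, hi).
  return __builtin_shufflevector(lo, hi, 0, 1, 2, 3, 4, 5, 6, 7,
                                 8, 9, 10, 11, 12, 13, 14, 15);
}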
Value *CodeGenFunction::GetValueForARMHint(unsigned BuiltinID) {
unsigned Value;
switch (BuiltinID) {
"vsha1h");
// The ARM _MoveToCoprocessor builtins put the input register value as
  // the first argument, but the LLVM intrinsic expects it as the third one.
  case ARM::BI_MoveToCoprocessor:
  case ARM::BI_MoveToCoprocessor2: {
    Function *F = CGM.getIntrinsic(BuiltinID == ARM::BI_MoveToCoprocessor ?
                                   Intrinsic::arm_mcr : Intrinsic::arm_mcr2);
    return Builder.CreateCall(F, {Ops[1], Ops[2], Ops[0],
                                  Ops[3], Ops[4], Ops[5]});
}
}
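// Illustration only (not part of the patch): at the source level the value to
// be written comes first, as in the MSVC-style intrinsic; the coprocessor
// operand values below are arbitrary constants, and the snippet assumes an ARM
// target compiled in MSVC compatibility mode with <intrin.h> available.
void write_coproc_reg(unsigned value) {
  // _MoveToCoprocessor(Value, coproc, opc1, CRn, CRm, opc2)
  //   -> llvm.arm.mcr(coproc, opc1, Value, CRn, CRm, opc2)
  _MoveToCoprocessor(value, 15, 0, 7, 10, 4);
}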
if (Ops.size() == 3)
return Align;
    return EmitX86Select(*this, Ops[4], Align, Ops[3]);
  }

-  case X86::BI__builtin_ia32_movntps:
-  case X86::BI__builtin_ia32_movntps256:
-  case X86::BI__builtin_ia32_movntpd:
-  case X86::BI__builtin_ia32_movntpd256:
-  case X86::BI__builtin_ia32_movntdq:
-  case X86::BI__builtin_ia32_movntdq256:
  case X86::BI__builtin_ia32_movnti:
  case X86::BI__builtin_ia32_movnti64: {
    llvm::MDNode *Node = llvm::MDNode::get(
getLLVMContext(), llvm::ConstantAsMetadata::get(Builder.getInt32(1)));
// Convert the type of the pointer to a pointer to the stored type.
Value *BC = Builder.CreateBitCast(Ops[0],
llvm::PointerType::getUnqual(Ops[1]->getType()),
"cast");
    StoreInst *SI = Builder.CreateDefaultAlignedStore(Ops[1], BC);
    SI->setMetadata(CGM.getModule().getMDKindID("nontemporal"), Node);

-    // If the operand is an integer, we can't assume alignment. Otherwise,
-    // assume natural alignment.
-    QualType ArgTy = E->getArg(1)->getType();
-    unsigned Align;
-    if (ArgTy->isIntegerType())
-      Align = 1;
-    else
-      Align = getContext().getTypeSizeInChars(ArgTy).getQuantity();
-    SI->setAlignment(Align);
+    // No alignment for scalar intrinsic store.
+    SI->setAlignment(1);
    return SI;
  }
  case X86::BI__builtin_ia32_selectb_128:
case X86::BI__builtin_ia32_selectb_256:
case X86::BI__builtin_ia32_selectb_512:
case X86::BI__builtin_ia32_selectw_128:
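// Illustration only (not part of the patch): the scalar streaming stores keep
// the special case above; _mm_stream_si32 from emmintrin.h is the user-facing
// entry point for __builtin_ia32_movnti.
#include <emmintrin.h>

void stream_int(int *dst, int v) {
  // Emitted as a plain store tagged with !nontemporal metadata, align 1.
  _mm_stream_si32(dst, v);
}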
static __inline__ void __DEFAULT_FN_ATTRS
_mm512_stream_si512 (__m512i * __P, __m512i __A)
{
- __builtin_ia32_movntdq512 ((__v8di *) __P, (__v8di) __A);
+ __builtin_nontemporal_store((__v8di)__A, (__v8di*)__P);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
static __inline__ void __DEFAULT_FN_ATTRS
_mm512_stream_pd (double *__P, __m512d __A)
{
- __builtin_ia32_movntpd512 (__P, (__v8df) __A);
+ __builtin_nontemporal_store((__v8df)__A, (__v8df*)__P);
}
static __inline__ void __DEFAULT_FN_ATTRS
_mm512_stream_ps (float *__P, __m512 __A)
{
- __builtin_ia32_movntps512 (__P, (__v16sf) __A);
+ __builtin_nontemporal_store((__v16sf)__A, (__v16sf*)__P);
}
static __inline__ __m512d __DEFAULT_FN_ATTRS
static __inline void __DEFAULT_FN_ATTRS
_mm256_stream_si256(__m256i *__a, __m256i __b)
{
- __builtin_ia32_movntdq256((__v4di *)__a, (__v4di)__b);
+ __builtin_nontemporal_store((__v4di)__b, (__v4di*)__a);
}
static __inline void __DEFAULT_FN_ATTRS
_mm256_stream_pd(double *__a, __m256d __b)
{
- __builtin_ia32_movntpd256(__a, (__v4df)__b);
+ __builtin_nontemporal_store((__v4df)__b, (__v4df*)__a);
}
static __inline void __DEFAULT_FN_ATTRS
_mm256_stream_ps(float *__p, __m256 __a)
{
- __builtin_ia32_movntps256(__p, (__v8sf)__a);
+ __builtin_nontemporal_store((__v8sf)__a, (__v8sf*)__p);
}
/* Create vectors */
static __inline__ void __DEFAULT_FN_ATTRS
_mm_stream_pd(double *__p, __m128d __a)
{
- __builtin_ia32_movntpd(__p, (__v2df)__a);
+ __builtin_nontemporal_store((__v2df)__a, (__v2df*)__p);
}
static __inline__ void __DEFAULT_FN_ATTRS
_mm_stream_si128(__m128i *__p, __m128i __a)
{
- __builtin_ia32_movntdq(__p, (__v2di)__a);
+ __builtin_nontemporal_store((__v2di)__a, (__v2di*)__p);
}
static __inline__ void __DEFAULT_FN_ATTRS
static __inline__ void __DEFAULT_FN_ATTRS
_mm_stream_ps(float *__p, __m128 __a)
{
- __builtin_ia32_movntps(__p, (__v4sf)__a);
+ __builtin_nontemporal_store((__v4sf)__a, (__v4sf*)__p);
}
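// Illustration only (not part of the headers): the generic builtin the wrappers
// above now share; it accepts vector and scalar types alike and emits an
// ordinary store tagged with !nontemporal metadata, so the destination should
// be naturally aligned as before. The vector typedef is illustrative.
typedef float f32x4 __attribute__((vector_size(16)));

void stream_floats(f32x4 *dst, f32x4 v) {
  // The same pattern _mm_stream_ps now expands to.
  __builtin_nontemporal_store(v, dst);
}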
/// \brief Forces strong memory ordering (serialization) between store
void test_mm512_stream_si512(__m512i * __P, __m512i __A) {
// CHECK-LABEL: @test_mm512_stream_si512
- // CHECK: @llvm.x86.avx512.storent.q.512
+ // CHECK: store <8 x i64> %{{.*}}, <8 x i64>* %{{.*}}, align 64, !nontemporal
_mm512_stream_si512(__P, __A);
}
void test_mm512_stream_pd(double *__P, __m512d __A) {
// CHECK-LABEL: @test_mm512_stream_pd
- // CHECK: @llvm.x86.avx512.storent.pd.512
+ // CHECK: store <8 x double> %{{.*}}, <8 x double>* %{{.*}}, align 64, !nontemporal
return _mm512_stream_pd(__P, __A);
}
void test_mm512_stream_ps(float *__P, __m512 __A) {
// CHECK-LABEL: @test_mm512_stream_ps
- // CHECK: @llvm.x86.avx512.storent.ps.512
+ // CHECK: store <16 x float> %{{.*}}, <16 x float>* %{{.*}}, align 64, !nontemporal
_mm512_stream_ps(__P, __A);
}
(void) __builtin_ia32_storelps(tmp_V2ip, tmp_V4f);
tmp_i = __builtin_ia32_movmskps(tmp_V4f);
tmp_i = __builtin_ia32_pmovmskb(tmp_V8c);
- (void) __builtin_ia32_movntps(tmp_fp, tmp_V4f);
(void) __builtin_ia32_movntq(tmp_V1LLip, tmp_V1LLi);
(void) __builtin_ia32_sfence();
#ifdef USE_64
(void) __builtin_ia32_movnti64(tmp_LLip, tmp_LLi);
#endif
- (void) __builtin_ia32_movntpd(tmp_dp, tmp_V2d);
- (void) __builtin_ia32_movntdq(tmp_V2LLip, tmp_V2LLi);
tmp_V2LLi = __builtin_ia32_psadbw128(tmp_V16c, tmp_V16c);
tmp_V2d = __builtin_ia32_sqrtpd(tmp_V2d);
tmp_V2d = __builtin_ia32_sqrtsd(tmp_V2d);
tmp_V4d = __builtin_ia32_vbroadcastf128_pd256(tmp_V2dCp);
tmp_V8f = __builtin_ia32_vbroadcastf128_ps256(tmp_V4fCp);
tmp_V32c = __builtin_ia32_lddqu256(tmp_cCp);
- __builtin_ia32_movntdq256(tmp_V4LLip, tmp_V4LLi);
- __builtin_ia32_movntpd256(tmp_dp, tmp_V4d);
- __builtin_ia32_movntps256(tmp_fp, tmp_V8f);
tmp_V2d = __builtin_ia32_maskloadpd(tmp_V2dCp, tmp_V2LLi);
tmp_V4f = __builtin_ia32_maskloadps(tmp_V4fCp, tmp_V4i);
tmp_V4d = __builtin_ia32_maskloadpd256(tmp_V4dCp, tmp_V4LLi);