(__m128){ p[0], p[1], p[2], p[3] }
which produces really bad code. This could be done in instcombine, but it's
probably better to do it in the front-end instead.
<rdar://problem/
9424836>
git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@131237
91177308-0d34-0410-b5e6-
96231b3b80d8
BUILTIN(__builtin_ia32_cvtss2si, "iV4f", "")
BUILTIN(__builtin_ia32_cvtss2si64, "LLiV4f", "")
BUILTIN(__builtin_ia32_cvttps2pi, "V2iV4f", "")
+BUILTIN(__builtin_ia32_loadups, "V4ffC*", "")
BUILTIN(__builtin_ia32_storeups, "vf*V4f", "")
BUILTIN(__builtin_ia32_storehps, "vV2i*V4f", "")
BUILTIN(__builtin_ia32_storelps, "vV2i*V4f", "")
BUILTIN(__builtin_ia32_sqrtps, "V4fV4f", "")
BUILTIN(__builtin_ia32_sqrtss, "V4fV4f", "")
BUILTIN(__builtin_ia32_maskmovdqu, "vV16cV16cc*", "")
+BUILTIN(__builtin_ia32_loadupd, "V2ddC*", "")
BUILTIN(__builtin_ia32_storeupd, "vd*V2d", "")
BUILTIN(__builtin_ia32_movmskpd, "iV2d", "")
BUILTIN(__builtin_ia32_pmovmskb128, "iV16c", "")
// If palignr is shifting the pair of vectors more than 32 bytes, emit zero.
return llvm::Constant::getNullValue(ConvertType(E->getType()));
}
+ case X86::BI__builtin_ia32_loadups:
+ case X86::BI__builtin_ia32_loadupd:
case X86::BI__builtin_ia32_loaddqu: {
const llvm::Type *VecTy = ConvertType(E->getType());
const llvm::Type *IntTy = llvm::IntegerType::get(getLLVMContext(), 128);
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_loadu_pd(double const *dp)
{
- return (__m128d){ dp[0], dp[1] };
+ return __builtin_ia32_loadupd(dp);
}
static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_loadu_ps(const float *p)
{
- return (__m128){ p[0], p[1], p[2], p[3] };
+ return __builtin_ia32_loadups(p);
}
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
#endif
tmp_V2i = __builtin_ia32_cvttps2pi(tmp_V4f);
(void) __builtin_ia32_maskmovq(tmp_V8c, tmp_V8c, tmp_cp);
+ tmp_V4f = __builtin_ia32_loadups(tmp_fCp);
(void) __builtin_ia32_storeups(tmp_fp, tmp_V4f);
(void) __builtin_ia32_storehps(tmp_V2ip, tmp_V4f);
(void) __builtin_ia32_storelps(tmp_V2ip, tmp_V4f);
tmp_V4f = __builtin_ia32_sqrtps(tmp_V4f);
tmp_V4f = __builtin_ia32_sqrtss(tmp_V4f);
(void) __builtin_ia32_maskmovdqu(tmp_V16c, tmp_V16c, tmp_cp);
+ tmp_V2d = __builtin_ia32_loadupd(tmp_dCp);
(void) __builtin_ia32_storeupd(tmp_dp, tmp_V2d);
tmp_i = __builtin_ia32_movmskpd(tmp_V2d);
tmp_i = __builtin_ia32_pmovmskb128(tmp_V16c);