From 0c9643008e5a35cac76cf3419b3308dcad97e53e Mon Sep 17 00:00:00 2001 From: Bill Wendling Date: Thu, 12 May 2011 19:02:15 +0000 Subject: [PATCH] LLVM doesn't always optimize away the four loads from this: (__m128){ p[0], p[1], p[2], p[3] } which produces really bad code. This could be done in instcombine, but it's probably better to do it in the front-end instead. git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@131237 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/clang/Basic/BuiltinsX86.def | 2 ++ lib/CodeGen/CGBuiltin.cpp | 2 ++ lib/Headers/emmintrin.h | 2 +- lib/Headers/xmmintrin.h | 2 +- test/CodeGen/builtins-x86.c | 2 ++ 5 files changed, 8 insertions(+), 2 deletions(-) diff --git a/include/clang/Basic/BuiltinsX86.def b/include/clang/Basic/BuiltinsX86.def index 2c2a84ab30..ce376dd3d7 100644 --- a/include/clang/Basic/BuiltinsX86.def +++ b/include/clang/Basic/BuiltinsX86.def @@ -240,6 +240,7 @@ BUILTIN(__builtin_ia32_cvtps2pi, "V2iV4f", "") BUILTIN(__builtin_ia32_cvtss2si, "iV4f", "") BUILTIN(__builtin_ia32_cvtss2si64, "LLiV4f", "") BUILTIN(__builtin_ia32_cvttps2pi, "V2iV4f", "") +BUILTIN(__builtin_ia32_loadups, "V4ffC*", "") BUILTIN(__builtin_ia32_storeups, "vf*V4f", "") BUILTIN(__builtin_ia32_storehps, "vV2i*V4f", "") BUILTIN(__builtin_ia32_storelps, "vV2i*V4f", "") @@ -253,6 +254,7 @@ BUILTIN(__builtin_ia32_rsqrtss, "V4fV4f", "") BUILTIN(__builtin_ia32_sqrtps, "V4fV4f", "") BUILTIN(__builtin_ia32_sqrtss, "V4fV4f", "") BUILTIN(__builtin_ia32_maskmovdqu, "vV16cV16cc*", "") +BUILTIN(__builtin_ia32_loadupd, "V2ddC*", "") BUILTIN(__builtin_ia32_storeupd, "vd*V2d", "") BUILTIN(__builtin_ia32_movmskpd, "iV2d", "") BUILTIN(__builtin_ia32_pmovmskb128, "iV16c", "") diff --git a/lib/CodeGen/CGBuiltin.cpp b/lib/CodeGen/CGBuiltin.cpp index 46546177ae..494dfaeff7 100644 --- a/lib/CodeGen/CGBuiltin.cpp +++ b/lib/CodeGen/CGBuiltin.cpp @@ -2143,6 +2143,8 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, // If palignr is shifting the pair of vectors more than 32 bytes, emit zero. return llvm::Constant::getNullValue(ConvertType(E->getType())); } + case X86::BI__builtin_ia32_loadups: + case X86::BI__builtin_ia32_loadupd: case X86::BI__builtin_ia32_loaddqu: { const llvm::Type *VecTy = ConvertType(E->getType()); const llvm::Type *IntTy = llvm::IntegerType::get(getLLVMContext(), 128); diff --git a/lib/Headers/emmintrin.h b/lib/Headers/emmintrin.h index 62c10b5134..746e717a30 100644 --- a/lib/Headers/emmintrin.h +++ b/lib/Headers/emmintrin.h @@ -466,7 +466,7 @@ _mm_loadr_pd(double const *dp) static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) _mm_loadu_pd(double const *dp) { - return (__m128d){ dp[0], dp[1] }; + return __builtin_ia32_loadupd(dp); } static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) diff --git a/lib/Headers/xmmintrin.h b/lib/Headers/xmmintrin.h index 00760ed6d1..42dd3e8d3b 100644 --- a/lib/Headers/xmmintrin.h +++ b/lib/Headers/xmmintrin.h @@ -539,7 +539,7 @@ _mm_load_ps(const float *p) static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) _mm_loadu_ps(const float *p) { - return (__m128){ p[0], p[1], p[2], p[3] }; + return __builtin_ia32_loadups(p); } static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) diff --git a/test/CodeGen/builtins-x86.c b/test/CodeGen/builtins-x86.c index bb63048b61..190fa55c57 100644 --- a/test/CodeGen/builtins-x86.c +++ b/test/CodeGen/builtins-x86.c @@ -273,6 +273,7 @@ void f0() { #endif tmp_V2i = __builtin_ia32_cvttps2pi(tmp_V4f); (void) __builtin_ia32_maskmovq(tmp_V8c, tmp_V8c, tmp_cp); + tmp_V4f = __builtin_ia32_loadups(tmp_fCp); (void) __builtin_ia32_storeups(tmp_fp, tmp_V4f); (void) __builtin_ia32_storehps(tmp_V2ip, tmp_V4f); (void) __builtin_ia32_storelps(tmp_V2ip, tmp_V4f); @@ -290,6 +291,7 @@ void f0() { tmp_V4f = __builtin_ia32_sqrtps(tmp_V4f); tmp_V4f = __builtin_ia32_sqrtss(tmp_V4f); (void) __builtin_ia32_maskmovdqu(tmp_V16c, tmp_V16c, tmp_cp); + tmp_V2d = __builtin_ia32_loadupd(tmp_dCp); (void) __builtin_ia32_storeupd(tmp_dp, tmp_V2d); tmp_i = __builtin_ia32_movmskpd(tmp_V2d); tmp_i = __builtin_ia32_pmovmskb128(tmp_V16c); -- 2.40.0