From: Eli Friedman Date: Thu, 15 Sep 2011 23:15:27 +0000 (+0000) Subject: Tweak *mmintrin.h so that they don't make any bad assumptions about alignment (which... X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=7c06f6b319b272ea9e2df8ef03793c7b89e8dfab;p=clang Tweak *mmintrin.h so that they don't make any bad assumptions about alignment (which probably has little effect in practice, but better to get it right). Make the load in _mm_loadh_pi and _mm_loadl_pi a single LLVM IR instruction to make optimizing easier for CodeGen. rdar://10054986 git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@139874 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/Headers/emmintrin.h b/lib/Headers/emmintrin.h index 6f82c264fd..fab67adb08 100644 --- a/lib/Headers/emmintrin.h +++ b/lib/Headers/emmintrin.h @@ -458,7 +458,11 @@ _mm_load_pd(double const *dp) static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) _mm_load1_pd(double const *dp) { - return (__m128d){ dp[0], dp[0] }; + struct __mm_load1_pd_struct { + double u; + } __attribute__((__packed__, __may_alias__)); + double u = ((struct __mm_load1_pd_struct*)dp)->u; + return (__m128d){ u, u }; } #define _mm_load_pd1(dp) _mm_load1_pd(dp) @@ -466,7 +470,8 @@ _mm_load1_pd(double const *dp) static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) _mm_loadr_pd(double const *dp) { - return (__m128d){ dp[1], dp[0] }; + __m128d u = *(__m128d*)dp; + return __builtin_shufflevector(u, u, 1, 0); } static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) @@ -481,19 +486,31 @@ _mm_loadu_pd(double const *dp) static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) _mm_load_sd(double const *dp) { - return (__m128d){ *dp, 0.0 }; + struct __mm_load_sd_struct { + double u; + } __attribute__((__packed__, __may_alias__)); + double u = ((struct __mm_load_sd_struct*)dp)->u; + return (__m128d){ u, 0 }; } static __inline__ __m128d 
__attribute__((__always_inline__, __nodebug__)) _mm_loadh_pd(__m128d a, double const *dp) { - return (__m128d){ a[0], *dp }; + struct __mm_loadh_pd_struct { + double u; + } __attribute__((__packed__, __may_alias__)); + double u = ((struct __mm_loadh_pd_struct*)dp)->u; + return (__m128d){ a[0], u }; } static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) _mm_loadl_pd(__m128d a, double const *dp) { - return (__m128d){ *dp, a[1] }; + struct __mm_loadl_pd_struct { + double u; + } __attribute__((__packed__, __may_alias__)); + double u = ((struct __mm_loadl_pd_struct*)dp)->u; + return (__m128d){ u, a[1] }; } static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) @@ -535,14 +552,20 @@ _mm_move_sd(__m128d a, __m128d b) static __inline__ void __attribute__((__always_inline__, __nodebug__)) _mm_store_sd(double *dp, __m128d a) { - dp[0] = a[0]; + struct __mm_store_sd_struct { + double u; + } __attribute__((__packed__, __may_alias__)); + ((struct __mm_store_sd_struct*)dp)->u = a[0]; } static __inline__ void __attribute__((__always_inline__, __nodebug__)) _mm_store1_pd(double *dp, __m128d a) { - dp[0] = a[0]; - dp[1] = a[0]; + struct __mm_store1_pd_struct { + double u[2]; + } __attribute__((__packed__, __may_alias__)); + ((struct __mm_store1_pd_struct*)dp)->u[0] = a[0]; + ((struct __mm_store1_pd_struct*)dp)->u[1] = a[0]; } static __inline__ void __attribute__((__always_inline__, __nodebug__)) @@ -560,20 +583,26 @@ _mm_storeu_pd(double *dp, __m128d a) static __inline__ void __attribute__((__always_inline__, __nodebug__)) _mm_storer_pd(double *dp, __m128d a) { - dp[0] = a[1]; - dp[1] = a[0]; + a = __builtin_shufflevector(a, a, 1, 0); + *(__m128d *)dp = a; } static __inline__ void __attribute__((__always_inline__, __nodebug__)) _mm_storeh_pd(double *dp, __m128d a) { - dp[0] = a[1]; + struct __mm_storeh_pd_struct { + double u; + } __attribute__((__packed__, __may_alias__)); + ((struct __mm_storeh_pd_struct*)dp)->u = a[1]; } static __inline__ 
void __attribute__((__always_inline__, __nodebug__)) _mm_storel_pd(double *dp, __m128d a) { - dp[0] = a[0]; + struct __mm_storeh_pd_struct { + double u; + } __attribute__((__packed__, __may_alias__)); + ((struct __mm_storeh_pd_struct*)dp)->u = a[0]; } static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) @@ -1029,7 +1058,10 @@ _mm_loadu_si128(__m128i const *p) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) _mm_loadl_epi64(__m128i const *p) { - return (__m128i) { *(long long*)p, 0}; + struct __mm_loadl_epi64_struct { + long long u; + } __attribute__((__packed__, __may_alias__)); + return (__m128i) { ((struct __mm_loadl_epi64_struct*)p)->u, 0}; } static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) diff --git a/lib/Headers/pmmintrin.h b/lib/Headers/pmmintrin.h index 7ca386cee9..5f9b097ba6 100644 --- a/lib/Headers/pmmintrin.h +++ b/lib/Headers/pmmintrin.h @@ -84,11 +84,7 @@ _mm_hsub_pd(__m128d a, __m128d b) return __builtin_ia32_hsubpd(a, b); } -static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) -_mm_loaddup_pd(double const *dp) -{ - return (__m128d){ *dp, *dp }; -} +#define _mm_loaddup_pd(dp) _mm_load1_pd(dp) static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) _mm_movedup_pd(__m128d a) diff --git a/lib/Headers/xmmintrin.h b/lib/Headers/xmmintrin.h index 50f275dce0..a0bc0bb092 100644 --- a/lib/Headers/xmmintrin.h +++ b/lib/Headers/xmmintrin.h @@ -501,31 +501,45 @@ _mm_cvtss_f32(__m128 a) static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) _mm_loadh_pi(__m128 a, const __m64 *p) { - __m128 b; - b[0] = *(float*)p; - b[1] = *((float*)p+1); - return __builtin_shufflevector(a, b, 0, 1, 4, 5); + typedef float __mm_loadh_pi_v2f32 __attribute__((__vector_size__(8))); + struct __mm_loadh_pi_struct { + __mm_loadh_pi_v2f32 u; + } __attribute__((__packed__, __may_alias__)); + __mm_loadh_pi_v2f32 b = ((struct __mm_loadh_pi_struct*)p)->u; + __m128 bb 
= __builtin_shufflevector(b, b, 0, 1, 0, 1); + return __builtin_shufflevector(a, bb, 0, 1, 4, 5); } static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) _mm_loadl_pi(__m128 a, const __m64 *p) { - __m128 b; - b[0] = *(float*)p; - b[1] = *((float*)p+1); - return __builtin_shufflevector(a, b, 4, 5, 2, 3); + typedef float __mm_loadl_pi_v2f32 __attribute__((__vector_size__(8))); + struct __mm_loadl_pi_struct { + __mm_loadl_pi_v2f32 u; + } __attribute__((__packed__, __may_alias__)); + __mm_loadl_pi_v2f32 b = ((struct __mm_loadl_pi_struct*)p)->u; + __m128 bb = __builtin_shufflevector(b, b, 0, 1, 0, 1); + return __builtin_shufflevector(a, bb, 4, 5, 2, 3); } static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) _mm_load_ss(const float *p) { - return (__m128){ *p, 0, 0, 0 }; + struct __mm_load_ss_struct { + float u; + } __attribute__((__packed__, __may_alias__)); + float u = ((struct __mm_load_ss_struct*)p)->u; + return (__m128){ u, 0, 0, 0 }; } static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) _mm_load1_ps(const float *p) { - return (__m128){ *p, *p, *p, *p }; + struct __mm_load1_ps_struct { + float u; + } __attribute__((__packed__, __may_alias__)); + float u = ((struct __mm_load1_ps_struct*)p)->u; + return (__m128){ u, u, u, u }; } #define _mm_load_ps1(p) _mm_load1_ps(p) @@ -541,7 +555,7 @@ _mm_loadu_ps(const float *p) { struct __loadu_ps { __m128 v; - } __attribute__((packed, may_alias)); + } __attribute__((__packed__, __may_alias__)); return ((struct __loadu_ps*)p)->v; } @@ -604,7 +618,10 @@ _mm_storel_pi(__m64 *p, __m128 a) static __inline__ void __attribute__((__always_inline__)) _mm_store_ss(float *p, __m128 a) { - *p = a[0]; + struct __mm_store_ss_struct { + float u; + } __attribute__((__packed__, __may_alias__)); + ((struct __mm_store_ss_struct*)p)->u = a[0]; } static __inline__ void __attribute__((__always_inline__, __nodebug__)) diff --git a/test/CodeGen/sse-builtins.c b/test/CodeGen/sse-builtins.c new 
file mode 100644 index 0000000000..a89c42e08a --- /dev/null +++ b/test/CodeGen/sse-builtins.c @@ -0,0 +1,104 @@ +// RUN: %clang_cc1 -triple i386-apple-darwin9 -target-cpu pentium4 -target-feature +sse4.1 -g -emit-llvm %s -o - | FileCheck %s + +#include <emmintrin.h> + +__m128 test_loadl_pi(__m128 x, void* y) { + // CHECK: define {{.*}} @test_loadl_pi + // CHECK: load <2 x float>* {{.*}}, align 1{{$}} + // CHECK: shufflevector {{.*}} <4 x i32> + return _mm_loadl_pi(x,y); +} + +__m128 test_loadh_pi(__m128 x, void* y) { + // CHECK: define {{.*}} @test_loadh_pi + // CHECK: load <2 x float>* {{.*}}, align 1{{$}} + // CHECK: shufflevector {{.*}} <4 x i32> + return _mm_loadh_pi(x,y); + } + +__m128 test_load_ss(void* y) { + // CHECK: define {{.*}} @test_load_ss + // CHECK: load float* {{.*}}, align 1{{$}} + return _mm_load_ss(y); + } + +__m128 test_load1_ps(void* y) { + // CHECK: define {{.*}} @test_load1_ps + // CHECK: load float* {{.*}}, align 1{{$}} + return _mm_load1_ps(y); + } + +void test_store_ss(__m128 x, void* y) { + // CHECK: define void @test_store_ss + // CHECK: store {{.*}} float* {{.*}}, align 1, + _mm_store_ss(y, x); + } + +__m128d test_load1_pd(__m128 x, void* y) { + // CHECK: define {{.*}} @test_load1_pd + // CHECK: load double* {{.*}}, align 1{{$}} + return _mm_load1_pd(y); + } + +__m128d test_loadr_pd(__m128 x, void* y) { + // CHECK: define {{.*}} @test_loadr_pd + // CHECK: load <2 x double>* {{.*}}, align 16{{$}} + return _mm_loadr_pd(y); + } + +__m128d test_load_sd(void* y) { + // CHECK: define {{.*}} @test_load_sd + // CHECK: load double* {{.*}}, align 1{{$}} + return _mm_load_sd(y); + } + +__m128d test_loadh_pd(__m128d x, void* y) { + // CHECK: define {{.*}} @test_loadh_pd + // CHECK: load double* {{.*}}, align 1{{$}} + return _mm_loadh_pd(x, y); + } + +__m128d test_loadl_pd(__m128d x, void* y) { + // CHECK: define {{.*}} @test_loadl_pd + // CHECK: load double* {{.*}}, align 1{{$}} + return _mm_loadl_pd(x, y); + } + +void test_store_sd(__m128d x, void* y) { + // CHECK: 
define void @test_store_sd + // CHECK: store {{.*}} double* {{.*}}, align 1{{$}} + _mm_store_sd(y, x); +} + +void test_store1_pd(__m128d x, void* y) { + // CHECK: define void @test_store1_pd + // CHECK: store {{.*}} double* {{.*}}, align 1{{$}} + // CHECK: store {{.*}} double* {{.*}}, align 1{{$}} + _mm_store1_pd(y, x); +} + +void test_storer_pd(__m128d x, void* y) { + // CHECK: define void @test_storer_pd + // CHECK: store {{.*}} <2 x double>* {{.*}}, align 16{{$}} + _mm_storer_pd(y, x); +} + +void test_storeh_pd(__m128d x, void* y) { + // CHECK: define void @test_storeh_pd + // CHECK: store {{.*}} double* {{.*}}, align 1{{$}} + _mm_storeh_pd(y, x); +} + +void test_storel_pd(__m128d x, void* y) { + // CHECK: define void @test_storel_pd + // CHECK: store {{.*}} double* {{.*}}, align 1{{$}} + _mm_storel_pd(y, x); +} + +__m128i test_loadl_epi64(void* y) { + // CHECK: define {{.*}} @test_loadl_epi64 + // CHECK: load i64* {{.*}}, align 1{{$}} + return _mm_loadl_epi64(y); +}