From 22c5bf688796f53a29b1421e1c27028fb9c2bc17 Mon Sep 17 00:00:00 2001
From: Adam Nemet
Date: Thu, 29 May 2014 20:47:29 +0000
Subject: [PATCH] Implement AVX1 vbroadcast intrinsics with vector initializers

These intrinsics are special because they directly take a memory operand (AVX2
adds the register counterparts). Typically, other non-memop intrinsics take
registers and it is then left to isel to fold memory operands.

In order to hoist (LICM) intrinsics that read memory directly, we require
either that no stores occur in the loop (LICM) or that the folded load
accesses constant memory (MachineLICM). When neither is the case, we fail to
hoist a loop-invariant broadcast.

We can work around this limitation by exposing the load as a regular load and
then implementing the broadcast with the vector initializer syntax. This
exposes the load to LICM and other optimizations.

At the IR level this is translated into a series of insertelements. The
sequence is already recognized as a broadcast, so there is no impact on the
quality of codegen.

_mm256_broadcast_pd and _mm256_broadcast_ps are not updated by this patch
because we currently lack the DAG-combiner smartness to recover the broadcast
instructions. This will be tackled in a follow-on.

There will be complementary changes on the LLVM side to remove the LLVM
intrinsics and to auto-upgrade bitcode files.

Fixes

git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@209846 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/clang/Basic/BuiltinsX86.def |  3 ---
 lib/Headers/avxintrin.h             |  9 +++++---
 test/CodeGen/avx-shuffle-builtins.c | 34 +++++++++++++++++++++++++++++
 test/CodeGen/builtins-x86.c         |  3 ---
 4 files changed, 40 insertions(+), 9 deletions(-)

diff --git a/include/clang/Basic/BuiltinsX86.def b/include/clang/Basic/BuiltinsX86.def
index 8faaea381a..6cd7a79d54 100644
--- a/include/clang/Basic/BuiltinsX86.def
+++ b/include/clang/Basic/BuiltinsX86.def
@@ -453,9 +453,6 @@ BUILTIN(__builtin_ia32_movmskpd256, "iV4d", "")
 BUILTIN(__builtin_ia32_movmskps256, "iV8f", "")
 BUILTIN(__builtin_ia32_vzeroall, "v", "")
 BUILTIN(__builtin_ia32_vzeroupper, "v", "")
-BUILTIN(__builtin_ia32_vbroadcastss, "V4ffC*", "")
-BUILTIN(__builtin_ia32_vbroadcastsd256, "V4ddC*", "")
-BUILTIN(__builtin_ia32_vbroadcastss256, "V8ffC*", "")
 BUILTIN(__builtin_ia32_vbroadcastf128_pd256, "V4dV2dC*", "")
 BUILTIN(__builtin_ia32_vbroadcastf128_ps256, "V8fV4fC*", "")
 BUILTIN(__builtin_ia32_storeupd256, "vd*V4d", "")
diff --git a/lib/Headers/avxintrin.h b/lib/Headers/avxintrin.h
index 3d50439d36..4e1044af56 100644
--- a/lib/Headers/avxintrin.h
+++ b/lib/Headers/avxintrin.h
@@ -737,19 +737,22 @@ _mm256_zeroupper(void)
 static __inline __m128 __attribute__((__always_inline__, __nodebug__))
 _mm_broadcast_ss(float const *__a)
 {
-  return (__m128)__builtin_ia32_vbroadcastss(__a);
+  float __f = *__a;
+  return (__m128)(__v4sf){ __f, __f, __f, __f };
 }
 
 static __inline __m256d __attribute__((__always_inline__, __nodebug__))
 _mm256_broadcast_sd(double const *__a)
 {
-  return (__m256d)__builtin_ia32_vbroadcastsd256(__a);
+  double __d = *__a;
+  return (__m256d)(__v4df){ __d, __d, __d, __d };
 }
 
 static __inline __m256 __attribute__((__always_inline__, __nodebug__))
 _mm256_broadcast_ss(float const *__a)
 {
-  return (__m256)__builtin_ia32_vbroadcastss256(__a);
+  float __f = *__a;
+  return (__m256)(__v8sf){ __f, __f, __f, __f, __f, __f, __f, __f };
 }
 
 static __inline __m256d __attribute__((__always_inline__, __nodebug__))
diff --git a/test/CodeGen/avx-shuffle-builtins.c b/test/CodeGen/avx-shuffle-builtins.c
index d071f825aa..76e2395fe8 100644
--- a/test/CodeGen/avx-shuffle-builtins.c
+++ b/test/CodeGen/avx-shuffle-builtins.c
@@ -63,3 +63,37 @@ __m256i test_mm256_permute2f128_si256(__m256i a, __m256i b) {
   // CHECK: @llvm.x86.avx.vperm2f128.si.256
   return _mm256_permute2f128_si256(a, b, 0x20);
 }
+
+__m128
+test_mm_broadcast_ss(float const *__a) {
+  // CHECK-LABEL: @test_mm_broadcast_ss
+  // CHECK: insertelement <4 x float> {{.*}}, i32 0
+  // CHECK: insertelement <4 x float> {{.*}}, i32 1
+  // CHECK: insertelement <4 x float> {{.*}}, i32 2
+  // CHECK: insertelement <4 x float> {{.*}}, i32 3
+  return _mm_broadcast_ss(__a);
+}
+
+__m256d
+test_mm256_broadcast_sd(double const *__a) {
+  // CHECK-LABEL: @test_mm256_broadcast_sd
+  // CHECK: insertelement <4 x double> {{.*}}, i32 0
+  // CHECK: insertelement <4 x double> {{.*}}, i32 1
+  // CHECK: insertelement <4 x double> {{.*}}, i32 2
+  // CHECK: insertelement <4 x double> {{.*}}, i32 3
+  return _mm256_broadcast_sd(__a);
+}
+
+__m256
+test_mm256_broadcast_ss(float const *__a) {
+  // CHECK-LABEL: @test_mm256_broadcast_ss
+  // CHECK: insertelement <8 x float> {{.*}}, i32 0
+  // CHECK: insertelement <8 x float> {{.*}}, i32 1
+  // CHECK: insertelement <8 x float> {{.*}}, i32 2
+  // CHECK: insertelement <8 x float> {{.*}}, i32 3
+  // CHECK: insertelement <8 x float> {{.*}}, i32 4
+  // CHECK: insertelement <8 x float> {{.*}}, i32 5
+  // CHECK: insertelement <8 x float> {{.*}}, i32 6
+  // CHECK: insertelement <8 x float> {{.*}}, i32 7
+  return _mm256_broadcast_ss(__a);
+}
diff --git a/test/CodeGen/builtins-x86.c b/test/CodeGen/builtins-x86.c
index 6df005d860..8443574c52 100644
--- a/test/CodeGen/builtins-x86.c
+++ b/test/CodeGen/builtins-x86.c
@@ -451,9 +451,6 @@ void f0() {
   tmp_i = __builtin_ia32_movmskps256(tmp_V8f);
   __builtin_ia32_vzeroall();
   __builtin_ia32_vzeroupper();
-  tmp_V4f = __builtin_ia32_vbroadcastss(tmp_fCp);
-  tmp_V4d = __builtin_ia32_vbroadcastsd256(tmp_dCp);
-  tmp_V8f = __builtin_ia32_vbroadcastss256(tmp_fCp);
   tmp_V4d = __builtin_ia32_vbroadcastf128_pd256(tmp_V2dCp);
   tmp_V8f = __builtin_ia32_vbroadcastf128_ps256(tmp_V4fCp);
   __builtin_ia32_storeupd256(tmp_dp, tmp_V4d);
-- 
2.40.0
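
A minimal usage sketch, not part of the patch above (the function and variable
names are illustrative, and the restrict qualifiers are an added assumption to
make the no-alias case explicit). It shows the situation the commit message
describes: a loop-invariant broadcast inside a loop that also stores to
memory. Before this change the broadcast lowered to a memory-reading intrinsic
call, which LICM would not hoist past the stores; with the initializer-based
_mm256_broadcast_sd the read of *scale is an ordinary load at the IR level and
becomes eligible for hoisting. Compile with -mavx or later.

/* Illustrative sketch only -- not part of the patch. */
#include <immintrin.h>

void scale_in_place(double *restrict a, const double *restrict scale, int n)
{
  for (int i = 0; i + 4 <= n; i += 4) {
    /* Loop-invariant broadcast: after this patch the load of *scale feeds a
       series of insertelements, so LICM can hoist it out of the loop even
       though the loop contains stores. */
    __m256d s = _mm256_broadcast_sd(scale);
    __m256d v = _mm256_loadu_pd(a + i);
    _mm256_storeu_pd(a + i, _mm256_mul_pd(v, s));
  }
}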