From b0da26f02f30c5c802d4bf9e7c7d15fe696f1810 Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Thu, 12 Mar 2015 21:54:24 +0000 Subject: [PATCH] [X86, AVX2] Replace inserti128 and extracti128 intrinsics with generic shuffles This is nearly identical to the v*f128_si256 parts of r231792 and r232052. AVX2 introduced proper integer variants of the hacked integer insert/extract C intrinsics that were created for this same functionality with AVX1. This should complete the front end fixes for insert/extract128 intrinsics. Corresponding LLVM patch to follow. git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@232109 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/clang/Basic/BuiltinsX86.def | 2 -- lib/Headers/avx2intrin.h | 23 +++++++++++------- lib/Sema/SemaChecking.cpp | 2 -- test/CodeGen/avx2-builtins.c | 36 +++++++++++++++++++++++++---- 4 files changed, 47 insertions(+), 16 deletions(-) diff --git a/include/clang/Basic/BuiltinsX86.def b/include/clang/Basic/BuiltinsX86.def index 5eb0b84f34..a60e44242a 100644 --- a/include/clang/Basic/BuiltinsX86.def +++ b/include/clang/Basic/BuiltinsX86.def @@ -587,8 +587,6 @@ BUILTIN(__builtin_ia32_pbroadcastq128, "V2LLiV2LLi", "") BUILTIN(__builtin_ia32_permvarsi256, "V8iV8iV8i", "") BUILTIN(__builtin_ia32_permvarsf256, "V8fV8fV8f", "") BUILTIN(__builtin_ia32_permti256, "V4LLiV4LLiV4LLiIc", "") -BUILTIN(__builtin_ia32_extract128i256, "V2LLiV4LLiIc", "") -BUILTIN(__builtin_ia32_insert128i256, "V4LLiV4LLiV2LLiIc", "") BUILTIN(__builtin_ia32_maskloadd256, "V8iV8iC*V8i", "") BUILTIN(__builtin_ia32_maskloadq256, "V4LLiV4LLiC*V4LLi", "") BUILTIN(__builtin_ia32_maskloadd, "V4iV4iC*V4i", "") diff --git a/lib/Headers/avx2intrin.h b/lib/Headers/avx2intrin.h index af43bec0ef..7485bddb41 100644 --- a/lib/Headers/avx2intrin.h +++ b/lib/Headers/avx2intrin.h @@ -874,14 +874,21 @@ _mm256_permutevar8x32_ps(__m256 __a, __m256 __b) __m256i __V2 = (V2); \ (__m256i)__builtin_ia32_permti256(__V1, __V2, (M)); }) -#define _mm256_extracti128_si256(A, O) __extension__ ({ \ - __m256i __A = (A); \ - (__m128i)__builtin_ia32_extract128i256(__A, (O)); }) - -#define _mm256_inserti128_si256(V1, V2, O) __extension__ ({ \ - __m256i __V1 = (V1); \ - __m128i __V2 = (V2); \ - (__m256i)__builtin_ia32_insert128i256(__V1, __V2, (O)); }) +#define _mm256_extracti128_si256(V, M) __extension__ ({ \ + (__m128i)__builtin_shufflevector( \ + (__v4di)(V), \ + (__v4di)(_mm256_setzero_si256()), \ + (((M) & 1) ? 2 : 0), \ + (((M) & 1) ? 3 : 1) );}) + +#define _mm256_inserti128_si256(V1, V2, M) __extension__ ({ \ + (__m256i)__builtin_shufflevector( \ + (__v4di)(V1), \ + (__v4di)_mm256_castsi128_si256((__m128i)(V2)), \ + (((M) & 1) ? 0 : 4), \ + (((M) & 1) ? 1 : 5), \ + (((M) & 1) ? 4 : 2), \ + (((M) & 1) ? 5 : 3) );}) static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) _mm256_maskload_epi32(int const *__X, __m256i __M) diff --git a/lib/Sema/SemaChecking.cpp b/lib/Sema/SemaChecking.cpp index 7a4a370adc..60382e8b61 100644 --- a/lib/Sema/SemaChecking.cpp +++ b/lib/Sema/SemaChecking.cpp @@ -882,8 +882,6 @@ bool Sema::CheckX86BuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) { switch (BuiltinID) { default: return false; case X86::BI_mm_prefetch: i = 1; l = 0; u = 3; break; - case X86::BI__builtin_ia32_extract128i256: i = 1, l = 0, u = 1; break; - case X86::BI__builtin_ia32_insert128i256: i = 2, l = 0; u = 1; break; case X86::BI__builtin_ia32_sha1rnds4: i = 2, l = 0; u = 3; break; case X86::BI__builtin_ia32_vpermil2pd: case X86::BI__builtin_ia32_vpermil2pd256: diff --git a/test/CodeGen/avx2-builtins.c b/test/CodeGen/avx2-builtins.c index 371f9c6ee2..fa5a27c7d3 100644 --- a/test/CodeGen/avx2-builtins.c +++ b/test/CodeGen/avx2-builtins.c @@ -695,16 +695,44 @@ __m256i test_mm256_permute2x128_si256(__m256i a, __m256i b) { return _mm256_permute2x128_si256(a, b, 0x31); } -__m128i test_mm256_extracti128_si256(__m256i a) { - // CHECK: @llvm.x86.avx2.vextracti128 +__m128i test_mm256_extracti128_si256_0(__m256i a) { + // CHECK-LABEL: @test_mm256_extracti128_si256_0 + // CHECK: shufflevector{{.*}} + return _mm256_extracti128_si256(a, 0); +} + +__m128i test_mm256_extracti128_si256_1(__m256i a) { + // CHECK-LABEL: @test_mm256_extracti128_si256_1 + // CHECK: shufflevector{{.*}} return _mm256_extracti128_si256(a, 1); } -__m256i test_mm256_inserti128_si256(__m256i a, __m128i b) { - // CHECK: @llvm.x86.avx2.vinserti128 +// Immediate should be truncated to one bit. +__m128i test_mm256_extracti128_si256_2(__m256i a) { + // CHECK-LABEL: @test_mm256_extracti128_si256_2 + // CHECK: shufflevector{{.*}} + return _mm256_extracti128_si256(a, 2); +} + +__m256i test_mm256_inserti128_si256_0(__m256i a, __m128i b) { + // CHECK-LABEL: @test_mm256_inserti128_si256_0 + // CHECK: shufflevector{{.*}} + return _mm256_inserti128_si256(a, b, 0); +} + +__m256i test_mm256_inserti128_si256_1(__m256i a, __m128i b) { + // CHECK-LABEL: @test_mm256_inserti128_si256_1 + // CHECK: shufflevector{{.*}} return _mm256_inserti128_si256(a, b, 1); } +// Immediate should be truncated to one bit. +__m256i test_mm256_inserti128_si256_2(__m256i a, __m128i b) { + // CHECK-LABEL: @test_mm256_inserti128_si256_2 + // CHECK: shufflevector{{.*}} + return _mm256_inserti128_si256(a, b, 2); +} + __m256i test_mm256_maskload_epi32(int const *a, __m256i m) { // CHECK: @llvm.x86.avx2.maskload.d.256 return _mm256_maskload_epi32(a, m); -- 2.40.0