From 353a6ccfbf42d12d3eb89075efbf96e7371690d1 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Sat, 11 Jun 2016 03:31:13 +0000 Subject: [PATCH] [AVX512] Implement 512-bit and masked shufflelo and shufflehi intrinsics directly with __builtin_shufflevector and __builtin_ia32_select. Also improve the formatting of the AVX2 version. git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@272452 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/clang/Basic/BuiltinsX86.def | 6 --- lib/Headers/avx2intrin.h | 6 ++- lib/Headers/avx512bwintrin.h | 75 +++++++++++++++++++++-------- lib/Headers/avx512vlbwintrin.h | 53 +++++++++----------- lib/Sema/SemaChecking.cpp | 6 --- test/CodeGen/avx512bw-builtins.c | 16 +++--- test/CodeGen/avx512vlbw-builtins.c | 55 +++++++++++++++++++++ 7 files changed, 149 insertions(+), 68 deletions(-) diff --git a/include/clang/Basic/BuiltinsX86.def b/include/clang/Basic/BuiltinsX86.def index d07c274171..d2da5512a1 100644 --- a/include/clang/Basic/BuiltinsX86.def +++ b/include/clang/Basic/BuiltinsX86.def @@ -1615,12 +1615,6 @@ TARGET_BUILTIN(__builtin_ia32_prorvd128_mask, "V4iV4iV4iV4iUc","","avx512vl") TARGET_BUILTIN(__builtin_ia32_prorvd256_mask, "V8iV8iV8iV8iUc","","avx512vl") TARGET_BUILTIN(__builtin_ia32_prorvq128_mask, "V2LLiV2LLiV2LLiV2LLiUc","","avx512vl") TARGET_BUILTIN(__builtin_ia32_prorvq256_mask, "V4LLiV4LLiV4LLiV4LLiUc","","avx512vl") -TARGET_BUILTIN(__builtin_ia32_pshufhw512_mask, "V32sV32sIiV32sUi","","avx512bw") -TARGET_BUILTIN(__builtin_ia32_pshuflw512_mask, "V32sV32sIiV32sUi","","avx512bw") -TARGET_BUILTIN(__builtin_ia32_pshufhw128_mask, "V8sV8sIiV8sUc","","avx512bw,avx512vl") -TARGET_BUILTIN(__builtin_ia32_pshufhw256_mask, "V16sV16sIiV16sUs","","avx512bw,avx512vl") -TARGET_BUILTIN(__builtin_ia32_pshuflw128_mask, "V8sV8sIiV8sUc","","avx512bw,avx512vl") -TARGET_BUILTIN(__builtin_ia32_pshuflw256_mask, "V16sV16sIiV16sUs","","avx512bw,avx512vl") TARGET_BUILTIN(__builtin_ia32_psllv32hi_mask, "V32sV32sV32sV32sUi","","avx512bw") TARGET_BUILTIN(__builtin_ia32_psllw512_mask, "V32sV32sV8sV32sUi","","avx512bw") TARGET_BUILTIN(__builtin_ia32_psllwi512_mask, "V32sV32sIiV32sUi","","avx512bw") diff --git a/lib/Headers/avx2intrin.h b/lib/Headers/avx2intrin.h index fc8b939fe3..fbca5df7db 100644 --- a/lib/Headers/avx2intrin.h +++ b/lib/Headers/avx2intrin.h @@ -522,8 +522,10 @@ _mm256_shuffle_epi8(__m256i __a, __m256i __b) #define _mm256_shufflelo_epi16(a, imm) __extension__ ({ \ (__m256i)__builtin_shufflevector((__v16hi)(__m256i)(a), \ (__v16hi)_mm256_setzero_si256(), \ - (imm) & 0x3,((imm) & 0xc) >> 2, \ - ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6, \ + 0 + (((imm) & 0x03) >> 0), \ + 0 + (((imm) & 0x0c) >> 2), \ + 0 + (((imm) & 0x30) >> 4), \ + 0 + (((imm) & 0xc0) >> 6), \ 4, 5, 6, 7, \ 8 + (((imm) & 0x03) >> 0), \ 8 + (((imm) & 0x0c) >> 2), \ diff --git a/lib/Headers/avx512bwintrin.h b/lib/Headers/avx512bwintrin.h index f4cbdd5bd3..a217421025 100644 --- a/lib/Headers/avx512bwintrin.h +++ b/lib/Headers/avx512bwintrin.h @@ -1588,37 +1588,74 @@ _mm512_maskz_cvtepu8_epi16 (__mmask32 __U, __m256i __A) (__mmask32)(m)); }) #define _mm512_shufflehi_epi16(A, imm) __extension__ ({ \ - (__m512i)__builtin_ia32_pshufhw512_mask((__v32hi)(__m512i)(A), (int)(imm), \ - (__v32hi)_mm512_setzero_hi(), \ - (__mmask32)-1); }) + (__m512i)__builtin_shufflevector((__v32hi)(__m512i)(A), \ + (__v32hi)_mm512_setzero_hi(), \ + 0, 1, 2, 3, \ + 4 + (((imm) & 0x03) >> 0), \ + 4 + (((imm) & 0x0c) >> 2), \ + 4 + (((imm) & 0x30) >> 4), \ + 4 + (((imm) & 0xc0) >> 6), \ + 8, 9, 10, 11, \ + 12 + (((imm) & 0x03) >> 0), \ + 12 + (((imm) & 0x0c) >> 2), \ + 12 + (((imm) & 0x30) >> 4), \ + 12 + (((imm) & 0xc0) >> 6), \ + 16, 17, 18, 19, \ + 20 + (((imm) & 0x03) >> 0), \ + 20 + (((imm) & 0x0c) >> 2), \ + 20 + (((imm) & 0x30) >> 4), \ + 20 + (((imm) & 0xc0) >> 6), \ + 24, 25, 26, 27, \ + 28 + (((imm) & 0x03) >> 0), \ + 28 + (((imm) & 0x0c) >> 2), \ + 28 + (((imm) & 0x30) >> 4), \ + 28 + (((imm) & 0xc0) >> 6)); }) #define _mm512_mask_shufflehi_epi16(W, U, A, imm) __extension__ ({ \ - (__m512i)__builtin_ia32_pshufhw512_mask((__v32hi)(__m512i)(A), (int)(imm), \ - (__v32hi)(__m512i)(W), \ - (__mmask32)(U)); }) - + (__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \ + _mm512_shufflehi_epi16((A), (imm)), \ + (__v32hi)(__m512i)(W)); }) #define _mm512_maskz_shufflehi_epi16(U, A, imm) __extension__ ({ \ - (__m512i)__builtin_ia32_pshufhw512_mask((__v32hi)(__m512i)(A), (int)(imm), \ - (__v32hi)_mm512_setzero_hi(), \ - (__mmask32)(U)); }) + (__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \ + _mm512_shufflehi_epi16((A), (imm)), \ + (__v32hi)_mm512_setzero_hi()); }) #define _mm512_shufflelo_epi16(A, imm) __extension__ ({ \ - (__m512i)__builtin_ia32_pshuflw512_mask((__v32hi)(__m512i)(A), (int)(imm), \ - (__v32hi)_mm512_setzero_hi(), \ - (__mmask32)-1); }) + (__m512i)__builtin_shufflevector((__v32hi)(__m512i)(A), \ + (__v32hi)_mm512_setzero_hi(), \ + 0 + (((imm) & 0x03) >> 0), \ + 0 + (((imm) & 0x0c) >> 2), \ + 0 + (((imm) & 0x30) >> 4), \ + 0 + (((imm) & 0xc0) >> 6), \ + 4, 5, 6, 7, \ + 8 + (((imm) & 0x03) >> 0), \ + 8 + (((imm) & 0x0c) >> 2), \ + 8 + (((imm) & 0x30) >> 4), \ + 8 + (((imm) & 0xc0) >> 6), \ + 12, 13, 14, 15, \ + 16 + (((imm) & 0x03) >> 0), \ + 16 + (((imm) & 0x0c) >> 2), \ + 16 + (((imm) & 0x30) >> 4), \ + 16 + (((imm) & 0xc0) >> 6), \ + 20, 21, 22, 23, \ + 24 + (((imm) & 0x03) >> 0), \ + 24 + (((imm) & 0x0c) >> 2), \ + 24 + (((imm) & 0x30) >> 4), \ + 24 + (((imm) & 0xc0) >> 6), \ + 28, 29, 30, 31); }) #define _mm512_mask_shufflelo_epi16(W, U, A, imm) __extension__ ({ \ - (__m512i)__builtin_ia32_pshuflw512_mask((__v32hi)(__m512i)(A), (int)(imm), \ - (__v32hi)(__m512i)(W), \ - (__mmask32)(U)); }) + (__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \ + _mm512_shufflelo_epi16((A), (imm)), \ + (__v32hi)(__m512i)(W)); }) #define _mm512_maskz_shufflelo_epi16(U, A, imm) __extension__ ({ \ - (__m512i)__builtin_ia32_pshuflw512_mask((__v32hi)(__m512i)(A), (int)(imm), \ - (__v32hi)_mm512_setzero_hi(), \ - (__mmask32)(U)); }) + (__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \ + _mm512_shufflelo_epi16((A), (imm)), \ + (__v32hi)_mm512_setzero_hi()); }) static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_sllv_epi16 (__m512i __A, __m512i __B) diff --git a/lib/Headers/avx512vlbwintrin.h b/lib/Headers/avx512vlbwintrin.h index 43cdadbf4d..52499fe842 100644 --- a/lib/Headers/avx512vlbwintrin.h +++ b/lib/Headers/avx512vlbwintrin.h @@ -2407,49 +2407,44 @@ _mm256_maskz_cvtepu8_epi16 (__mmask16 __U, __m128i __A) (__mmask16)(m)); }) #define _mm_mask_shufflehi_epi16(W, U, A, imm) __extension__ ({ \ - (__m128i)__builtin_ia32_pshufhw128_mask((__v8hi)(__m128i)(A), (int)(imm), \ - (__v8hi)(__m128i)(W), \ - (__mmask8)(U)); }) + (__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ + _mm_shufflehi_epi16((A), (imm)), \ + (__v8hi)(__m128i)(W)); }) #define _mm_maskz_shufflehi_epi16(U, A, imm) __extension__ ({ \ - (__m128i)__builtin_ia32_pshufhw128_mask((__v8hi)(__m128i)(A), (int)(imm), \ - (__v8hi)_mm_setzero_hi(), \ - (__mmask8)(U)); }) - + (__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ + _mm_shufflehi_epi16((A), (imm)), \ + (__v8hi)_mm_setzero_hi()); }) #define _mm256_mask_shufflehi_epi16(W, U, A, imm) __extension__ ({ \ - (__m256i)__builtin_ia32_pshufhw256_mask((__v16hi)(__m256i)(A), (int)(imm), \ - (__v16hi)(__m256i)(W), \ - (__mmask16)(U)); }) - + (__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ + _mm256_shufflehi_epi16((A), (imm)), \ + (__v16hi)(__m256i)(W)); }) #define _mm256_maskz_shufflehi_epi16(U, A, imm) __extension__ ({ \ - (__m256i)__builtin_ia32_pshufhw256_mask((__v16hi)(__m256i)(A), (int)(imm), \ - (__v16hi)_mm256_setzero_si256(), \ - (__mmask16)(U)); }) - + (__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ + _mm256_shufflehi_epi16((A), (imm)), \ + (__v16hi)_mm256_setzero_si256()); }) #define _mm_mask_shufflelo_epi16(W, U, A, imm) __extension__ ({ \ - (__m128i)__builtin_ia32_pshuflw128_mask((__v8hi)(__m128i)(A), (int)(imm), \ - (__v8hi)(__m128i)(W), \ - (__mmask8)(U)); }) + (__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ + _mm_shufflelo_epi16((A), (imm)), \ + (__v8hi)(__m128i)(W)); }) #define _mm_maskz_shufflelo_epi16(U, A, imm) __extension__ ({ \ - (__m128i)__builtin_ia32_pshuflw128_mask((__v8hi)(__m128i)(A), (int)(imm), \ - (__v8hi)_mm_setzero_hi(), \ - (__mmask8)(U)); }) - + (__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ + _mm_shufflelo_epi16((A), (imm)), \ + (__v8hi)_mm_setzero_hi()); }) #define _mm256_mask_shufflelo_epi16(W, U, A, imm) __extension__ ({ \ - (__m256i)__builtin_ia32_pshuflw256_mask((__v16hi)(__m256i)(A), (int)(imm), \ - (__v16hi)(__m256i)(W), \ - (__mmask16)(U)); }) - + (__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ + _mm256_shufflelo_epi16((A), (imm)), \ + (__v16hi)(__m256i)(W)); }) #define _mm256_maskz_shufflelo_epi16(U, A, imm) __extension__ ({ \ - (__m256i)__builtin_ia32_pshuflw256_mask((__v16hi)(__m256i)(A), (int)(imm), \ - (__v16hi)_mm256_setzero_si256(), \ - (__mmask16)(U)); }) + (__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ + _mm256_shufflelo_epi16((A), (imm)), \ + (__v16hi)_mm256_setzero_si256()); }) static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_sllv_epi16 (__m256i __A, __m256i __B) diff --git a/lib/Sema/SemaChecking.cpp b/lib/Sema/SemaChecking.cpp index 670a67de2b..6f944e0d92 100644 --- a/lib/Sema/SemaChecking.cpp +++ b/lib/Sema/SemaChecking.cpp @@ -1525,12 +1525,6 @@ bool Sema::CheckX86BuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) { case X86::BI__builtin_ia32_prord256_mask: case X86::BI__builtin_ia32_prorq128_mask: case X86::BI__builtin_ia32_prorq256_mask: - case X86::BI__builtin_ia32_pshufhw512_mask: - case X86::BI__builtin_ia32_pshuflw512_mask: - case X86::BI__builtin_ia32_pshufhw128_mask: - case X86::BI__builtin_ia32_pshufhw256_mask: - case X86::BI__builtin_ia32_pshuflw128_mask: - case X86::BI__builtin_ia32_pshuflw256_mask: case X86::BI__builtin_ia32_psllwi512_mask: case X86::BI__builtin_ia32_psllwi128_mask: case X86::BI__builtin_ia32_psllwi256_mask: diff --git a/test/CodeGen/avx512bw-builtins.c b/test/CodeGen/avx512bw-builtins.c index aae808dac8..d598ae925e 100644 --- a/test/CodeGen/avx512bw-builtins.c +++ b/test/CodeGen/avx512bw-builtins.c @@ -1079,37 +1079,41 @@ __m512i test_mm512_maskz_cvtepu8_epi16(__mmask32 __U, __m256i __A) { __m512i test_mm512_shufflehi_epi16(__m512i __A) { // CHECK-LABEL: @test_mm512_shufflehi_epi16 - // CHECK: @llvm.x86.avx512.mask.pshufh.w.512 + // CHECK: shufflevector <32 x i16> %{{.*}}, <32 x i16> %{{.*}}, <32 x i32> return _mm512_shufflehi_epi16(__A, 5); } __m512i test_mm512_mask_shufflehi_epi16(__m512i __W, __mmask32 __U, __m512i __A) { // CHECK-LABEL: @test_mm512_mask_shufflehi_epi16 - // CHECK: @llvm.x86.avx512.mask.pshufh.w.512 + // CHECK: shufflevector <32 x i16> %{{.*}}, <32 x i16> %{{.*}}, <32 x i32> + // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}} return _mm512_mask_shufflehi_epi16(__W, __U, __A, 5); } __m512i test_mm512_maskz_shufflehi_epi16(__mmask32 __U, __m512i __A) { // CHECK-LABEL: @test_mm512_maskz_shufflehi_epi16 - // CHECK: @llvm.x86.avx512.mask.pshufh.w.512 + // CHECK: shufflevector <32 x i16> %{{.*}}, <32 x i16> %{{.*}}, <32 x i32> + // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}} return _mm512_maskz_shufflehi_epi16(__U, __A, 5); } __m512i test_mm512_shufflelo_epi16(__m512i __A) { // CHECK-LABEL: @test_mm512_shufflelo_epi16 - // CHECK: @llvm.x86.avx512.mask.pshufl.w.512 + // CHECK: shufflevector <32 x i16> %{{.*}}, <32 x i16> %{{.*}}, <32 x i32> return _mm512_shufflelo_epi16(__A, 5); } __m512i test_mm512_mask_shufflelo_epi16(__m512i __W, __mmask32 __U, __m512i __A) { // CHECK-LABEL: @test_mm512_mask_shufflelo_epi16 - // CHECK: @llvm.x86.avx512.mask.pshufl.w.512 + // CHECK: shufflevector <32 x i16> %{{.*}}, <32 x i16> %{{.*}}, <32 x i32> + // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}} return _mm512_mask_shufflelo_epi16(__W, __U, __A, 5); } __m512i test_mm512_maskz_shufflelo_epi16(__mmask32 __U, __m512i __A) { // CHECK-LABEL: @test_mm512_maskz_shufflelo_epi16 - // CHECK: @llvm.x86.avx512.mask.pshufl.w.512 + // CHECK: shufflevector <32 x i16> %{{.*}}, <32 x i16> %{{.*}}, <32 x i32> + // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}} return _mm512_maskz_shufflelo_epi16(__U, __A, 5); } diff --git a/test/CodeGen/avx512vlbw-builtins.c b/test/CodeGen/avx512vlbw-builtins.c index a9e267ade7..1d2e0ef1cd 100644 --- a/test/CodeGen/avx512vlbw-builtins.c +++ b/test/CodeGen/avx512vlbw-builtins.c @@ -2391,3 +2391,58 @@ __mmask16 test_mm256_movepi16_mask(__m256i __A) { return _mm256_movepi16_mask(__A); } +__m128i test_mm_mask_shufflehi_epi16(__m128i __W, __mmask32 __U, __m128i __A) { + // CHECK-LABEL: @test_mm_mask_shufflehi_epi16 + // CHECK: shufflevector <8 x i16> %2, <8 x i16> %4, <8 x i32> + // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} + return _mm_mask_shufflehi_epi16(__W, __U, __A, 5); +} + +__m128i test_mm_maskz_shufflehi_epi16(__mmask32 __U, __m128i __A) { + // CHECK-LABEL: @test_mm_maskz_shufflehi_epi16 + // CHECK: shufflevector <8 x i16> %2, <8 x i16> %4, <8 x i32> + // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} + return _mm_maskz_shufflehi_epi16(__U, __A, 5); +} + +__m128i test_mm_mask_shufflelo_epi16(__m128i __W, __mmask32 __U, __m128i __A) { + // CHECK-LABEL: @test_mm_mask_shufflelo_epi16 + // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <8 x i32> + // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} + return _mm_mask_shufflelo_epi16(__W, __U, __A, 5); +} + +__m128i test_mm_maskz_shufflelo_epi16(__mmask32 __U, __m128i __A) { + // CHECK-LABEL: @test_mm_maskz_shufflelo_epi16 + // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <8 x i32> + // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} + return _mm_maskz_shufflelo_epi16(__U, __A, 5); +} + +__m256i test_mm256_mask_shufflehi_epi16(__m256i __W, __mmask32 __U, __m256i __A) { + // CHECK-LABEL: @test_mm256_mask_shufflehi_epi16 + // CHECK: shufflevector <16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <16 x i32> + // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} + return _mm256_mask_shufflehi_epi16(__W, __U, __A, 5); +} + +__m256i test_mm256_maskz_shufflehi_epi16(__mmask32 __U, __m256i __A) { + // CHECK-LABEL: @test_mm256_maskz_shufflehi_epi16 + // CHECK: shufflevector <16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <16 x i32> + // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} + return _mm256_maskz_shufflehi_epi16(__U, __A, 5); +} + +__m256i test_mm256_mask_shufflelo_epi16(__m256i __W, __mmask32 __U, __m256i __A) { + // CHECK-LABEL: @test_mm256_mask_shufflelo_epi16 + // CHECK: shufflevector <16 x i16> %2, <16 x i16> %4, <16 x i32> + // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} + return _mm256_mask_shufflelo_epi16(__W, __U, __A, 5); +} + +__m256i test_mm256_maskz_shufflelo_epi16(__mmask32 __U, __m256i __A) { + // CHECK-LABEL: @test_mm256_maskz_shufflelo_epi16 + // CHECK: shufflevector <16 x i16> %2, <16 x i16> %4, <16 x i32> + // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} + return _mm256_maskz_shufflelo_epi16(__U, __A, 5); +} -- 2.50.1