From 1efc61399759e092c2fea20d27d6da707a0db1b9 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 8 Jun 2018 00:00:21 +0000 Subject: [PATCH] [X86] Add builtins for blend with immediate control to enforce target feature requirements and check immediate range. git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@334249 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/clang/Basic/BuiltinsX86.def | 8 ++++++ lib/CodeGen/CGBuiltin.cpp | 21 +++++++++++++++ lib/Headers/avx2intrin.h | 40 +++++------------------------ lib/Headers/avxintrin.h | 20 +++------------ lib/Headers/smmintrin.h | 25 +++++------------- lib/Sema/SemaChecking.cpp | 8 ++++++ test/CodeGen/avx-builtins.c | 2 +- test/CodeGen/avx2-builtins.c | 2 +- 8 files changed, 55 insertions(+), 71 deletions(-) diff --git a/include/clang/Basic/BuiltinsX86.def b/include/clang/Basic/BuiltinsX86.def index 9e56028281..6f952bc061 100644 --- a/include/clang/Basic/BuiltinsX86.def +++ b/include/clang/Basic/BuiltinsX86.def @@ -369,6 +369,9 @@ TARGET_BUILTIN(__builtin_ia32_palignr128, "V16cV16cV16cIi", "nc", "ssse3") TARGET_BUILTIN(__builtin_ia32_insertps128, "V4fV4fV4fIc", "nc", "sse4.1") TARGET_BUILTIN(__builtin_ia32_pblendvb128, "V16cV16cV16cV16c", "nc", "sse4.1") +TARGET_BUILTIN(__builtin_ia32_pblendw128, "V8sV8sV8sIi", "nc", "sse4.1") +TARGET_BUILTIN(__builtin_ia32_blendpd, "V2dV2dV2dIi", "nc", "sse4.1") +TARGET_BUILTIN(__builtin_ia32_blendps, "V4fV4fV4fIi", "nc", "sse4.1") TARGET_BUILTIN(__builtin_ia32_blendvpd, "V2dV2dV2dV2d", "nc", "sse4.1") TARGET_BUILTIN(__builtin_ia32_blendvps, "V4fV4fV4fV4f", "nc", "sse4.1") TARGET_BUILTIN(__builtin_ia32_packusdw128, "V8sV4iV4i", "nc", "sse4.1") @@ -477,6 +480,8 @@ TARGET_BUILTIN(__builtin_ia32_vpermilvarpd, "V2dV2dV2LLi", "nc", "avx") TARGET_BUILTIN(__builtin_ia32_vpermilvarps, "V4fV4fV4i", "nc", "avx") TARGET_BUILTIN(__builtin_ia32_vpermilvarpd256, "V4dV4dV4LLi", "nc", "avx") TARGET_BUILTIN(__builtin_ia32_vpermilvarps256, "V8fV8fV8i", "nc", "avx") +TARGET_BUILTIN(__builtin_ia32_blendpd256, "V4dV4dV4dIi", "nc", "avx") +TARGET_BUILTIN(__builtin_ia32_blendps256, "V8fV8fV8fIi", "nc", "avx") TARGET_BUILTIN(__builtin_ia32_blendvpd256, "V4dV4dV4dV4d", "nc", "avx") TARGET_BUILTIN(__builtin_ia32_blendvps256, "V8fV8fV8fV8f", "nc", "avx") TARGET_BUILTIN(__builtin_ia32_dpps256, "V8fV8fV8fIc", "nc", "avx") @@ -554,6 +559,7 @@ TARGET_BUILTIN(__builtin_ia32_psubusb256, "V32cV32cV32c", "nc", "avx2") TARGET_BUILTIN(__builtin_ia32_psubusw256, "V16sV16sV16s", "nc", "avx2") TARGET_BUILTIN(__builtin_ia32_palignr256, "V32cV32cV32cIi", "nc", "avx2") TARGET_BUILTIN(__builtin_ia32_pblendvb256, "V32cV32cV32cV32c", "nc", "avx2") +TARGET_BUILTIN(__builtin_ia32_pblendw256, "V16sV16sV16sIi", "nc", "avx2") TARGET_BUILTIN(__builtin_ia32_phaddw256, "V16sV16sV16s", "nc", "avx2") TARGET_BUILTIN(__builtin_ia32_phaddd256, "V8iV8iV8i", "nc", "avx2") TARGET_BUILTIN(__builtin_ia32_phaddsw256, "V16sV16sV16s", "nc", "avx2") @@ -603,6 +609,8 @@ TARGET_BUILTIN(__builtin_ia32_psrldi256, "V8iV8ii", "nc", "avx2") TARGET_BUILTIN(__builtin_ia32_psrld256, "V8iV8iV4i", "nc", "avx2") TARGET_BUILTIN(__builtin_ia32_psrlqi256, "V4LLiV4LLii", "nc", "avx2") TARGET_BUILTIN(__builtin_ia32_psrlq256, "V4LLiV4LLiV2LLi", "nc", "avx2") +TARGET_BUILTIN(__builtin_ia32_pblendd128, "V4iV4iV4iIi", "nc", "avx2") +TARGET_BUILTIN(__builtin_ia32_pblendd256, "V8iV8iV8iIi", "nc", "avx2") TARGET_BUILTIN(__builtin_ia32_permvarsi256, "V8iV8iV8i", "nc", "avx2") TARGET_BUILTIN(__builtin_ia32_permvarsf256, "V8fV8fV8i", "nc", "avx2") TARGET_BUILTIN(__builtin_ia32_permti256, "V4LLiV4LLiV4LLiIc", "nc", "avx2") diff --git a/lib/CodeGen/CGBuiltin.cpp b/lib/CodeGen/CGBuiltin.cpp index 69f7d91b38..0002058495 100644 --- a/lib/CodeGen/CGBuiltin.cpp +++ b/lib/CodeGen/CGBuiltin.cpp @@ -9235,6 +9235,27 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, Ops[0] = Builder.CreateBitCast(Ops[0], PtrTy); return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]); } + case X86::BI__builtin_ia32_pblendw128: + case X86::BI__builtin_ia32_blendpd: + case X86::BI__builtin_ia32_blendps: + case X86::BI__builtin_ia32_blendpd256: + case X86::BI__builtin_ia32_blendps256: + case X86::BI__builtin_ia32_pblendw256: + case X86::BI__builtin_ia32_pblendd128: + case X86::BI__builtin_ia32_pblendd256: { + unsigned NumElts = Ops[0]->getType()->getVectorNumElements(); + unsigned Imm = cast(Ops[2])->getZExtValue(); + + uint32_t Indices[16]; + // If there are more than 8 elements, the immediate is used twice so make + // sure we handle that. + for (unsigned i = 0; i != NumElts; ++i) + Indices[i] = ((Imm >> (i % 8)) & 0x1) ? NumElts + i : i; + + return Builder.CreateShuffleVector(Ops[1], Ops[0], + makeArrayRef(Indices, NumElts), + "blend"); + } case X86::BI__builtin_ia32_palignr128: case X86::BI__builtin_ia32_palignr256: case X86::BI__builtin_ia32_palignr512: { diff --git a/lib/Headers/avx2intrin.h b/lib/Headers/avx2intrin.h index 9371fcb949..d1c530693b 100644 --- a/lib/Headers/avx2intrin.h +++ b/lib/Headers/avx2intrin.h @@ -170,24 +170,8 @@ _mm256_blendv_epi8(__m256i __V1, __m256i __V2, __m256i __M) } #define _mm256_blend_epi16(V1, V2, M) \ - (__m256i)__builtin_shufflevector((__v16hi)(__m256i)(V1), \ - (__v16hi)(__m256i)(V2), \ - (((M) & 0x01) ? 16 : 0), \ - (((M) & 0x02) ? 17 : 1), \ - (((M) & 0x04) ? 18 : 2), \ - (((M) & 0x08) ? 19 : 3), \ - (((M) & 0x10) ? 20 : 4), \ - (((M) & 0x20) ? 21 : 5), \ - (((M) & 0x40) ? 22 : 6), \ - (((M) & 0x80) ? 23 : 7), \ - (((M) & 0x01) ? 24 : 8), \ - (((M) & 0x02) ? 25 : 9), \ - (((M) & 0x04) ? 26 : 10), \ - (((M) & 0x08) ? 27 : 11), \ - (((M) & 0x10) ? 28 : 12), \ - (((M) & 0x20) ? 29 : 13), \ - (((M) & 0x40) ? 30 : 14), \ - (((M) & 0x80) ? 31 : 15)) + (__m256i)__builtin_ia32_pblendw256((__v16hi)(__m256i)(V1), \ + (__v16hi)(__m256i)(V2), (int)(M)) static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_cmpeq_epi8(__m256i __a, __m256i __b) @@ -809,24 +793,12 @@ _mm256_broadcastsi128_si256(__m128i __X) } #define _mm_blend_epi32(V1, V2, M) \ - (__m128i)__builtin_shufflevector((__v4si)(__m128i)(V1), \ - (__v4si)(__m128i)(V2), \ - (((M) & 0x01) ? 4 : 0), \ - (((M) & 0x02) ? 5 : 1), \ - (((M) & 0x04) ? 6 : 2), \ - (((M) & 0x08) ? 7 : 3)) + (__m128i)__builtin_ia32_pblendd128((__v4si)(__m128i)(V1), \ + (__v4si)(__m128i)(V2), (int)(M)) #define _mm256_blend_epi32(V1, V2, M) \ - (__m256i)__builtin_shufflevector((__v8si)(__m256i)(V1), \ - (__v8si)(__m256i)(V2), \ - (((M) & 0x01) ? 8 : 0), \ - (((M) & 0x02) ? 9 : 1), \ - (((M) & 0x04) ? 10 : 2), \ - (((M) & 0x08) ? 11 : 3), \ - (((M) & 0x10) ? 12 : 4), \ - (((M) & 0x20) ? 13 : 5), \ - (((M) & 0x40) ? 14 : 6), \ - (((M) & 0x80) ? 15 : 7)) + (__m256i)__builtin_ia32_pblendd256((__v8si)(__m256i)(V1), \ + (__v8si)(__m256i)(V2), (int)(M)) static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_broadcastb_epi8(__m128i __X) diff --git a/lib/Headers/avxintrin.h b/lib/Headers/avxintrin.h index 883b2e350d..7e3c51ffb6 100644 --- a/lib/Headers/avxintrin.h +++ b/lib/Headers/avxintrin.h @@ -1355,12 +1355,8 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c) /// operand \a V2 is copied to the same position in the destination. /// \returns A 256-bit vector of [4 x double] containing the copied values. #define _mm256_blend_pd(V1, V2, M) \ - (__m256d)__builtin_shufflevector((__v4df)(__m256d)(V1), \ - (__v4df)(__m256d)(V2), \ - (((M) & 0x01) ? 4 : 0), \ - (((M) & 0x02) ? 5 : 1), \ - (((M) & 0x04) ? 6 : 2), \ - (((M) & 0x08) ? 7 : 3)) + (__m256d)__builtin_ia32_blendpd256((__v4df)(__m256d)(V1), \ + (__v4df)(__m256d)(V2), (int)(M)) /// Merges 32-bit single-precision data values stored in either of the /// two 256-bit vectors of [8 x float], as specified by the immediate @@ -1387,16 +1383,8 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c) /// operand \a V2 is copied to the same position in the destination. /// \returns A 256-bit vector of [8 x float] containing the copied values. #define _mm256_blend_ps(V1, V2, M) \ - (__m256)__builtin_shufflevector((__v8sf)(__m256)(V1), \ - (__v8sf)(__m256)(V2), \ - (((M) & 0x01) ? 8 : 0), \ - (((M) & 0x02) ? 9 : 1), \ - (((M) & 0x04) ? 10 : 2), \ - (((M) & 0x08) ? 11 : 3), \ - (((M) & 0x10) ? 12 : 4), \ - (((M) & 0x20) ? 13 : 5), \ - (((M) & 0x40) ? 14 : 6), \ - (((M) & 0x80) ? 15 : 7)) + (__m256)__builtin_ia32_blendps256((__v8sf)(__m256)(V1), \ + (__v8sf)(__m256)(V2), (int)(M)) /// Merges 64-bit double-precision data values stored in either of the /// two 256-bit vectors of [4 x double], as specified by the 256-bit vector diff --git a/lib/Headers/smmintrin.h b/lib/Headers/smmintrin.h index b3084562a0..7ccb1a37f5 100644 --- a/lib/Headers/smmintrin.h +++ b/lib/Headers/smmintrin.h @@ -390,10 +390,8 @@ /// is copied to the same position in the result. /// \returns A 128-bit vector of [2 x double] containing the copied values. #define _mm_blend_pd(V1, V2, M) \ - (__m128d)__builtin_shufflevector((__v2df)(__m128d)(V1), \ - (__v2df)(__m128d)(V2), \ - (((M) & 0x01) ? 2 : 0), \ - (((M) & 0x02) ? 3 : 1)) + (__m128d) __builtin_ia32_blendpd ((__v2df)(__m128d)(V1), \ + (__v2df)(__m128d)(V2), (int)(M)) /// Returns a 128-bit vector of [4 x float] where the values are selected /// from either the first or second operand as specified by the third @@ -420,11 +418,8 @@ /// is copied to the same position in the result. /// \returns A 128-bit vector of [4 x float] containing the copied values. #define _mm_blend_ps(V1, V2, M) \ - (__m128)__builtin_shufflevector((__v4sf)(__m128)(V1), (__v4sf)(__m128)(V2), \ - (((M) & 0x01) ? 4 : 0), \ - (((M) & 0x02) ? 5 : 1), \ - (((M) & 0x04) ? 6 : 2), \ - (((M) & 0x08) ? 7 : 3)) + (__m128) __builtin_ia32_blendps ((__v4sf)(__m128)(V1), \ + (__v4sf)(__m128)(V2), (int)(M)) /// Returns a 128-bit vector of [2 x double] where the values are /// selected from either the first or second operand as specified by the @@ -532,16 +527,8 @@ _mm_blendv_epi8 (__m128i __V1, __m128i __V2, __m128i __M) /// is copied to the same position in the result. /// \returns A 128-bit vector of [8 x i16] containing the copied values. #define _mm_blend_epi16(V1, V2, M) \ - (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(V1), \ - (__v8hi)(__m128i)(V2), \ - (((M) & 0x01) ? 8 : 0), \ - (((M) & 0x02) ? 9 : 1), \ - (((M) & 0x04) ? 10 : 2), \ - (((M) & 0x08) ? 11 : 3), \ - (((M) & 0x10) ? 12 : 4), \ - (((M) & 0x20) ? 13 : 5), \ - (((M) & 0x40) ? 14 : 6), \ - (((M) & 0x80) ? 15 : 7)) + (__m128i) __builtin_ia32_pblendw128 ((__v8hi)(__m128i)(V1), \ + (__v8hi)(__m128i)(V2), (int)(M)) /* SSE4 Dword Multiply Instructions. */ /// Multiples corresponding elements of two 128-bit vectors of [4 x i32] diff --git a/lib/Sema/SemaChecking.cpp b/lib/Sema/SemaChecking.cpp index 03a117f435..db004607cc 100644 --- a/lib/Sema/SemaChecking.cpp +++ b/lib/Sema/SemaChecking.cpp @@ -2624,6 +2624,7 @@ bool Sema::CheckX86BuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) { i = 1; l = 0; u = 7; break; case X86::BI__builtin_ia32_sha1rnds4: + case X86::BI__builtin_ia32_blendpd: case X86::BI__builtin_ia32_vec_set_v4hi: case X86::BI__builtin_ia32_vec_set_v4si: case X86::BI__builtin_ia32_vec_set_v4di: @@ -2683,6 +2684,9 @@ bool Sema::CheckX86BuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) { case X86::BI__builtin_ia32_vec_ext_v16hi: i = 1; l = 0; u = 15; break; + case X86::BI__builtin_ia32_pblendd128: + case X86::BI__builtin_ia32_blendps: + case X86::BI__builtin_ia32_blendpd256: case X86::BI__builtin_ia32_roundss: case X86::BI__builtin_ia32_roundsd: case X86::BI__builtin_ia32_rangepd128_mask: @@ -2754,6 +2758,10 @@ bool Sema::CheckX86BuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) { case X86::BI__builtin_ia32_fpclassss_mask: i = 1; l = 0; u = 255; break; + case X86::BI__builtin_ia32_pblendw128: + case X86::BI__builtin_ia32_pblendw256: + case X86::BI__builtin_ia32_blendps256: + case X86::BI__builtin_ia32_pblendd256: case X86::BI__builtin_ia32_palignr128: case X86::BI__builtin_ia32_palignr256: case X86::BI__builtin_ia32_palignr512: diff --git a/test/CodeGen/avx-builtins.c b/test/CodeGen/avx-builtins.c index 43c502f47f..e40a03d8cf 100644 --- a/test/CodeGen/avx-builtins.c +++ b/test/CodeGen/avx-builtins.c @@ -59,7 +59,7 @@ __m256 test_mm256_andnot_ps(__m256 A, __m256 B) { __m256d test_mm256_blend_pd(__m256d A, __m256d B) { // CHECK-LABEL: test_mm256_blend_pd // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> - return _mm256_blend_pd(A, B, 0x35); + return _mm256_blend_pd(A, B, 0x05); } __m256 test_mm256_blend_ps(__m256 A, __m256 B) { diff --git a/test/CodeGen/avx2-builtins.c b/test/CodeGen/avx2-builtins.c index b647ca595b..bd7e3a9596 100644 --- a/test/CodeGen/avx2-builtins.c +++ b/test/CodeGen/avx2-builtins.c @@ -141,7 +141,7 @@ __m128i test_mm_blend_epi32(__m128i a, __m128i b) { // CHECK-LABEL: test_mm_blend_epi32 // CHECK-NOT: @llvm.x86.avx2.pblendd.128 // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> - return _mm_blend_epi32(a, b, 0x35); + return _mm_blend_epi32(a, b, 0x05); } __m256i test_mm256_blend_epi32(__m256i a, __m256i b) { -- 2.40.0