From 1efc61399759e092c2fea20d27d6da707a0db1b9 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Fri, 8 Jun 2018 00:00:21 +0000
Subject: [PATCH] [X86] Add builtins for blend with immediate control to
 enforce target feature requirements and check immediate range.

git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@334249 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/clang/Basic/BuiltinsX86.def |  8 ++++++
 lib/CodeGen/CGBuiltin.cpp           | 21 +++++++++++++++
 lib/Headers/avx2intrin.h            | 40 +++++------------------------
 lib/Headers/avxintrin.h             | 20 +++------------
 lib/Headers/smmintrin.h             | 25 +++++-------------
 lib/Sema/SemaChecking.cpp           |  8 ++++++
 test/CodeGen/avx-builtins.c         |  2 +-
 test/CodeGen/avx2-builtins.c        |  2 +-
 8 files changed, 55 insertions(+), 71 deletions(-)

diff --git a/include/clang/Basic/BuiltinsX86.def b/include/clang/Basic/BuiltinsX86.def
index 9e56028281..6f952bc061 100644
--- a/include/clang/Basic/BuiltinsX86.def
+++ b/include/clang/Basic/BuiltinsX86.def
@@ -369,6 +369,9 @@ TARGET_BUILTIN(__builtin_ia32_palignr128, "V16cV16cV16cIi", "nc", "ssse3")
 
 TARGET_BUILTIN(__builtin_ia32_insertps128, "V4fV4fV4fIc", "nc", "sse4.1")
 TARGET_BUILTIN(__builtin_ia32_pblendvb128, "V16cV16cV16cV16c", "nc", "sse4.1")
+TARGET_BUILTIN(__builtin_ia32_pblendw128, "V8sV8sV8sIi", "nc", "sse4.1")
+TARGET_BUILTIN(__builtin_ia32_blendpd, "V2dV2dV2dIi", "nc", "sse4.1")
+TARGET_BUILTIN(__builtin_ia32_blendps, "V4fV4fV4fIi", "nc", "sse4.1")
 TARGET_BUILTIN(__builtin_ia32_blendvpd, "V2dV2dV2dV2d", "nc", "sse4.1")
 TARGET_BUILTIN(__builtin_ia32_blendvps, "V4fV4fV4fV4f", "nc", "sse4.1")
 TARGET_BUILTIN(__builtin_ia32_packusdw128, "V8sV4iV4i", "nc", "sse4.1")
@@ -477,6 +480,8 @@ TARGET_BUILTIN(__builtin_ia32_vpermilvarpd, "V2dV2dV2LLi", "nc", "avx")
 TARGET_BUILTIN(__builtin_ia32_vpermilvarps, "V4fV4fV4i", "nc", "avx")
 TARGET_BUILTIN(__builtin_ia32_vpermilvarpd256, "V4dV4dV4LLi", "nc", "avx")
 TARGET_BUILTIN(__builtin_ia32_vpermilvarps256, "V8fV8fV8i", "nc", "avx")
+TARGET_BUILTIN(__builtin_ia32_blendpd256, "V4dV4dV4dIi", "nc", "avx")
+TARGET_BUILTIN(__builtin_ia32_blendps256, "V8fV8fV8fIi", "nc", "avx")
 TARGET_BUILTIN(__builtin_ia32_blendvpd256, "V4dV4dV4dV4d", "nc", "avx")
 TARGET_BUILTIN(__builtin_ia32_blendvps256, "V8fV8fV8fV8f", "nc", "avx")
 TARGET_BUILTIN(__builtin_ia32_dpps256, "V8fV8fV8fIc", "nc", "avx")
@@ -554,6 +559,7 @@ TARGET_BUILTIN(__builtin_ia32_psubusb256, "V32cV32cV32c", "nc", "avx2")
 TARGET_BUILTIN(__builtin_ia32_psubusw256, "V16sV16sV16s", "nc", "avx2")
 TARGET_BUILTIN(__builtin_ia32_palignr256, "V32cV32cV32cIi", "nc", "avx2")
 TARGET_BUILTIN(__builtin_ia32_pblendvb256, "V32cV32cV32cV32c", "nc", "avx2")
+TARGET_BUILTIN(__builtin_ia32_pblendw256, "V16sV16sV16sIi", "nc", "avx2")
 TARGET_BUILTIN(__builtin_ia32_phaddw256, "V16sV16sV16s", "nc", "avx2")
 TARGET_BUILTIN(__builtin_ia32_phaddd256, "V8iV8iV8i", "nc", "avx2")
 TARGET_BUILTIN(__builtin_ia32_phaddsw256, "V16sV16sV16s", "nc", "avx2")
@@ -603,6 +609,8 @@ TARGET_BUILTIN(__builtin_ia32_psrldi256, "V8iV8ii", "nc", "avx2")
 TARGET_BUILTIN(__builtin_ia32_psrld256, "V8iV8iV4i", "nc", "avx2")
 TARGET_BUILTIN(__builtin_ia32_psrlqi256, "V4LLiV4LLii", "nc", "avx2")
 TARGET_BUILTIN(__builtin_ia32_psrlq256, "V4LLiV4LLiV2LLi", "nc", "avx2")
+TARGET_BUILTIN(__builtin_ia32_pblendd128, "V4iV4iV4iIi", "nc", "avx2")
+TARGET_BUILTIN(__builtin_ia32_pblendd256, "V8iV8iV8iIi", "nc", "avx2")
 TARGET_BUILTIN(__builtin_ia32_permvarsi256, "V8iV8iV8i", "nc", "avx2")
 TARGET_BUILTIN(__builtin_ia32_permvarsf256, "V8fV8fV8i", "nc", "avx2")
 TARGET_BUILTIN(__builtin_ia32_permti256, "V4LLiV4LLiV4LLiIc", "nc", "avx2")
diff --git a/lib/CodeGen/CGBuiltin.cpp b/lib/CodeGen/CGBuiltin.cpp
index 69f7d91b38..0002058495 100644
--- a/lib/CodeGen/CGBuiltin.cpp
+++ b/lib/CodeGen/CGBuiltin.cpp
@@ -9235,6 +9235,27 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
     Ops[0] = Builder.CreateBitCast(Ops[0], PtrTy);
     return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
   }
+  case X86::BI__builtin_ia32_pblendw128:
+  case X86::BI__builtin_ia32_blendpd:
+  case X86::BI__builtin_ia32_blendps:
+  case X86::BI__builtin_ia32_blendpd256:
+  case X86::BI__builtin_ia32_blendps256:
+  case X86::BI__builtin_ia32_pblendw256:
+  case X86::BI__builtin_ia32_pblendd128:
+  case X86::BI__builtin_ia32_pblendd256: {
+    unsigned NumElts = Ops[0]->getType()->getVectorNumElements();
+    unsigned Imm = cast<llvm::ConstantInt>(Ops[2])->getZExtValue();
+
+    uint32_t Indices[16];
+    // If there are more than 8 elements, the immediate is used twice so make
+    // sure we handle that.
+    for (unsigned i = 0; i != NumElts; ++i)
+      Indices[i] = ((Imm >> (i % 8)) & 0x1) ? NumElts + i : i;
+
+    return Builder.CreateShuffleVector(Ops[1], Ops[0],
+                                       makeArrayRef(Indices, NumElts),
+                                       "blend");
+  }
   case X86::BI__builtin_ia32_palignr128:
   case X86::BI__builtin_ia32_palignr256:
   case X86::BI__builtin_ia32_palignr512: {
diff --git a/lib/Headers/avx2intrin.h b/lib/Headers/avx2intrin.h
index 9371fcb949..d1c530693b 100644
--- a/lib/Headers/avx2intrin.h
+++ b/lib/Headers/avx2intrin.h
@@ -170,24 +170,8 @@ _mm256_blendv_epi8(__m256i __V1, __m256i __V2, __m256i __M)
 }
 
 #define _mm256_blend_epi16(V1, V2, M) \
-  (__m256i)__builtin_shufflevector((__v16hi)(__m256i)(V1),   \
-                                   (__v16hi)(__m256i)(V2),   \
-                                   (((M) & 0x01) ? 16 : 0),  \
-                                   (((M) & 0x02) ? 17 : 1),  \
-                                   (((M) & 0x04) ? 18 : 2),  \
-                                   (((M) & 0x08) ? 19 : 3),  \
-                                   (((M) & 0x10) ? 20 : 4),  \
-                                   (((M) & 0x20) ? 21 : 5),  \
-                                   (((M) & 0x40) ? 22 : 6),  \
-                                   (((M) & 0x80) ? 23 : 7),  \
-                                   (((M) & 0x01) ? 24 : 8),  \
-                                   (((M) & 0x02) ? 25 : 9),  \
-                                   (((M) & 0x04) ? 26 : 10), \
-                                   (((M) & 0x08) ? 27 : 11), \
-                                   (((M) & 0x10) ? 28 : 12), \
-                                   (((M) & 0x20) ? 29 : 13), \
-                                   (((M) & 0x40) ? 30 : 14), \
-                                   (((M) & 0x80) ? 31 : 15))
+  (__m256i)__builtin_ia32_pblendw256((__v16hi)(__m256i)(V1), \
+                                     (__v16hi)(__m256i)(V2), (int)(M))
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_cmpeq_epi8(__m256i __a, __m256i __b)
@@ -809,24 +793,12 @@ _mm256_broadcastsi128_si256(__m128i __X)
 }
 
 #define _mm_blend_epi32(V1, V2, M) \
-  (__m128i)__builtin_shufflevector((__v4si)(__m128i)(V1),  \
-                                   (__v4si)(__m128i)(V2),  \
-                                   (((M) & 0x01) ? 4 : 0), \
-                                   (((M) & 0x02) ? 5 : 1), \
-                                   (((M) & 0x04) ? 6 : 2), \
-                                   (((M) & 0x08) ? 7 : 3))
+  (__m128i)__builtin_ia32_pblendd128((__v4si)(__m128i)(V1), \
+                                     (__v4si)(__m128i)(V2), (int)(M))
 
 #define _mm256_blend_epi32(V1, V2, M) \
-  (__m256i)__builtin_shufflevector((__v8si)(__m256i)(V1),   \
-                                   (__v8si)(__m256i)(V2),   \
-                                   (((M) & 0x01) ?  8 : 0), \
-                                   (((M) & 0x02) ?  9 : 1), \
-                                   (((M) & 0x04) ? 10 : 2), \
-                                   (((M) & 0x08) ? 11 : 3), \
-                                   (((M) & 0x10) ? 12 : 4), \
-                                   (((M) & 0x20) ? 13 : 5), \
-                                   (((M) & 0x40) ? 14 : 6), \
-                                   (((M) & 0x80) ? 15 : 7))
+  (__m256i)__builtin_ia32_pblendd256((__v8si)(__m256i)(V1), \
+                                     (__v8si)(__m256i)(V2), (int)(M))
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_broadcastb_epi8(__m128i __X)
diff --git a/lib/Headers/avxintrin.h b/lib/Headers/avxintrin.h
index 883b2e350d..7e3c51ffb6 100644
--- a/lib/Headers/avxintrin.h
+++ b/lib/Headers/avxintrin.h
@@ -1355,12 +1355,8 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
 ///    operand \a V2 is copied to the same position in the destination.
 /// \returns A 256-bit vector of [4 x double] containing the copied values.
 #define _mm256_blend_pd(V1, V2, M) \
-  (__m256d)__builtin_shufflevector((__v4df)(__m256d)(V1), \
-                                   (__v4df)(__m256d)(V2), \
-                                   (((M) & 0x01) ? 4 : 0), \
-                                   (((M) & 0x02) ? 5 : 1), \
-                                   (((M) & 0x04) ? 6 : 2), \
-                                   (((M) & 0x08) ? 7 : 3))
+  (__m256d)__builtin_ia32_blendpd256((__v4df)(__m256d)(V1), \
+                                     (__v4df)(__m256d)(V2), (int)(M))
 
 /// Merges 32-bit single-precision data values stored in either of the
 ///    two 256-bit vectors of [8 x float], as specified by the immediate
@@ -1387,16 +1383,8 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
 ///    operand \a V2 is copied to the same position in the destination.
 /// \returns A 256-bit vector of [8 x float] containing the copied values.
 #define _mm256_blend_ps(V1, V2, M) \
-  (__m256)__builtin_shufflevector((__v8sf)(__m256)(V1), \
-                                  (__v8sf)(__m256)(V2), \
-                                  (((M) & 0x01) ?  8 : 0), \
-                                  (((M) & 0x02) ?  9 : 1), \
-                                  (((M) & 0x04) ? 10 : 2), \
-                                  (((M) & 0x08) ? 11 : 3), \
-                                  (((M) & 0x10) ? 12 : 4), \
-                                  (((M) & 0x20) ? 13 : 5), \
-                                  (((M) & 0x40) ? 14 : 6), \
-                                  (((M) & 0x80) ? 15 : 7))
+  (__m256)__builtin_ia32_blendps256((__v8sf)(__m256)(V1), \
+                                    (__v8sf)(__m256)(V2), (int)(M))
 
 /// Merges 64-bit double-precision data values stored in either of the
 ///    two 256-bit vectors of [4 x double], as specified by the 256-bit vector
diff --git a/lib/Headers/smmintrin.h b/lib/Headers/smmintrin.h
index b3084562a0..7ccb1a37f5 100644
--- a/lib/Headers/smmintrin.h
+++ b/lib/Headers/smmintrin.h
@@ -390,10 +390,8 @@
 ///    is copied to the same position in the result.
 /// \returns A 128-bit vector of [2 x double] containing the copied values.
 #define _mm_blend_pd(V1, V2, M) \
-  (__m128d)__builtin_shufflevector((__v2df)(__m128d)(V1), \
-                                   (__v2df)(__m128d)(V2), \
-                                   (((M) & 0x01) ? 2 : 0), \
-                                   (((M) & 0x02) ? 3 : 1))
+  (__m128d) __builtin_ia32_blendpd ((__v2df)(__m128d)(V1), \
+                                    (__v2df)(__m128d)(V2), (int)(M))
 
 /// Returns a 128-bit vector of [4 x float] where the values are selected
 ///    from either the first or second operand as specified by the third
@@ -420,11 +418,8 @@
 ///    is copied to the same position in the result.
 /// \returns A 128-bit vector of [4 x float] containing the copied values.
 #define _mm_blend_ps(V1, V2, M) \
-  (__m128)__builtin_shufflevector((__v4sf)(__m128)(V1), (__v4sf)(__m128)(V2), \
-                                  (((M) & 0x01) ? 4 : 0), \
-                                  (((M) & 0x02) ? 5 : 1), \
-                                  (((M) & 0x04) ? 6 : 2), \
-                                  (((M) & 0x08) ? 7 : 3))
+  (__m128) __builtin_ia32_blendps ((__v4sf)(__m128)(V1), \
+                                   (__v4sf)(__m128)(V2), (int)(M))
 
 /// Returns a 128-bit vector of [2 x double] where the values are
 ///    selected from either the first or second operand as specified by the
@@ -532,16 +527,8 @@ _mm_blendv_epi8 (__m128i __V1, __m128i __V2, __m128i __M)
 ///    is copied to the same position in the result.
 /// \returns A 128-bit vector of [8 x i16] containing the copied values.
 #define _mm_blend_epi16(V1, V2, M) \
-  (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(V1), \
-                                   (__v8hi)(__m128i)(V2), \
-                                   (((M) & 0x01) ?  8 : 0), \
-                                   (((M) & 0x02) ?  9 : 1), \
-                                   (((M) & 0x04) ? 10 : 2), \
-                                   (((M) & 0x08) ? 11 : 3), \
-                                   (((M) & 0x10) ? 12 : 4), \
-                                   (((M) & 0x20) ? 13 : 5), \
-                                   (((M) & 0x40) ? 14 : 6), \
-                                   (((M) & 0x80) ? 15 : 7))
+  (__m128i) __builtin_ia32_pblendw128 ((__v8hi)(__m128i)(V1), \
+                                       (__v8hi)(__m128i)(V2), (int)(M))
 
 /* SSE4 Dword Multiply Instructions.  */
 /// Multiples corresponding elements of two 128-bit vectors of [4 x i32]
diff --git a/lib/Sema/SemaChecking.cpp b/lib/Sema/SemaChecking.cpp
index 03a117f435..db004607cc 100644
--- a/lib/Sema/SemaChecking.cpp
+++ b/lib/Sema/SemaChecking.cpp
@@ -2624,6 +2624,7 @@ bool Sema::CheckX86BuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) {
     i = 1; l = 0; u = 7;
     break;
   case X86::BI__builtin_ia32_sha1rnds4:
+  case X86::BI__builtin_ia32_blendpd:
   case X86::BI__builtin_ia32_vec_set_v4hi:
   case X86::BI__builtin_ia32_vec_set_v4si:
   case X86::BI__builtin_ia32_vec_set_v4di:
@@ -2683,6 +2684,9 @@ bool Sema::CheckX86BuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) {
   case X86::BI__builtin_ia32_vec_ext_v16hi:
     i = 1; l = 0; u = 15;
     break;
+  case X86::BI__builtin_ia32_pblendd128:
+  case X86::BI__builtin_ia32_blendps:
+  case X86::BI__builtin_ia32_blendpd256:
   case X86::BI__builtin_ia32_roundss:
   case X86::BI__builtin_ia32_roundsd:
   case X86::BI__builtin_ia32_rangepd128_mask:
@@ -2754,6 +2758,10 @@ bool Sema::CheckX86BuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) {
   case X86::BI__builtin_ia32_fpclassss_mask:
     i = 1; l = 0; u = 255;
     break;
+  case X86::BI__builtin_ia32_pblendw128:
+  case X86::BI__builtin_ia32_pblendw256:
+  case X86::BI__builtin_ia32_blendps256:
+  case X86::BI__builtin_ia32_pblendd256:
   case X86::BI__builtin_ia32_palignr128:
   case X86::BI__builtin_ia32_palignr256:
   case X86::BI__builtin_ia32_palignr512:
diff --git a/test/CodeGen/avx-builtins.c b/test/CodeGen/avx-builtins.c
index 43c502f47f..e40a03d8cf 100644
--- a/test/CodeGen/avx-builtins.c
+++ b/test/CodeGen/avx-builtins.c
@@ -59,7 +59,7 @@ __m256 test_mm256_andnot_ps(__m256 A, __m256 B) {
 __m256d test_mm256_blend_pd(__m256d A, __m256d B) {
   // CHECK-LABEL: test_mm256_blend_pd
   // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
-  return _mm256_blend_pd(A, B, 0x35);
+  return _mm256_blend_pd(A, B, 0x05);
 }
 
 __m256 test_mm256_blend_ps(__m256 A, __m256 B) {
diff --git a/test/CodeGen/avx2-builtins.c b/test/CodeGen/avx2-builtins.c
index b647ca595b..bd7e3a9596 100644
--- a/test/CodeGen/avx2-builtins.c
+++ b/test/CodeGen/avx2-builtins.c
@@ -141,7 +141,7 @@ __m128i test_mm_blend_epi32(__m128i a, __m128i b) {
   // CHECK-LABEL: test_mm_blend_epi32
   // CHECK-NOT: @llvm.x86.avx2.pblendd.128
   // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
-  return _mm_blend_epi32(a, b, 0x35);
+  return _mm_blend_epi32(a, b, 0x05);
 }
 
 __m256i test_mm256_blend_epi32(__m256i a, __m256i b) {
-- 
2.40.0