TARGET_BUILTIN(__builtin_ia32_insertps128, "V4fV4fV4fIc", "nc", "sse4.1")
TARGET_BUILTIN(__builtin_ia32_pblendvb128, "V16cV16cV16cV16c", "nc", "sse4.1")
+TARGET_BUILTIN(__builtin_ia32_pblendw128, "V8sV8sV8sIi", "nc", "sse4.1")
+TARGET_BUILTIN(__builtin_ia32_blendpd, "V2dV2dV2dIi", "nc", "sse4.1")
+TARGET_BUILTIN(__builtin_ia32_blendps, "V4fV4fV4fIi", "nc", "sse4.1")
TARGET_BUILTIN(__builtin_ia32_blendvpd, "V2dV2dV2dV2d", "nc", "sse4.1")
TARGET_BUILTIN(__builtin_ia32_blendvps, "V4fV4fV4fV4f", "nc", "sse4.1")
TARGET_BUILTIN(__builtin_ia32_packusdw128, "V8sV4iV4i", "nc", "sse4.1")
TARGET_BUILTIN(__builtin_ia32_vpermilvarps, "V4fV4fV4i", "nc", "avx")
TARGET_BUILTIN(__builtin_ia32_vpermilvarpd256, "V4dV4dV4LLi", "nc", "avx")
TARGET_BUILTIN(__builtin_ia32_vpermilvarps256, "V8fV8fV8i", "nc", "avx")
+TARGET_BUILTIN(__builtin_ia32_blendpd256, "V4dV4dV4dIi", "nc", "avx")
+TARGET_BUILTIN(__builtin_ia32_blendps256, "V8fV8fV8fIi", "nc", "avx")
TARGET_BUILTIN(__builtin_ia32_blendvpd256, "V4dV4dV4dV4d", "nc", "avx")
TARGET_BUILTIN(__builtin_ia32_blendvps256, "V8fV8fV8fV8f", "nc", "avx")
TARGET_BUILTIN(__builtin_ia32_dpps256, "V8fV8fV8fIc", "nc", "avx")
TARGET_BUILTIN(__builtin_ia32_psubusw256, "V16sV16sV16s", "nc", "avx2")
TARGET_BUILTIN(__builtin_ia32_palignr256, "V32cV32cV32cIi", "nc", "avx2")
TARGET_BUILTIN(__builtin_ia32_pblendvb256, "V32cV32cV32cV32c", "nc", "avx2")
+TARGET_BUILTIN(__builtin_ia32_pblendw256, "V16sV16sV16sIi", "nc", "avx2")
TARGET_BUILTIN(__builtin_ia32_phaddw256, "V16sV16sV16s", "nc", "avx2")
TARGET_BUILTIN(__builtin_ia32_phaddd256, "V8iV8iV8i", "nc", "avx2")
TARGET_BUILTIN(__builtin_ia32_phaddsw256, "V16sV16sV16s", "nc", "avx2")
TARGET_BUILTIN(__builtin_ia32_psrld256, "V8iV8iV4i", "nc", "avx2")
TARGET_BUILTIN(__builtin_ia32_psrlqi256, "V4LLiV4LLii", "nc", "avx2")
TARGET_BUILTIN(__builtin_ia32_psrlq256, "V4LLiV4LLiV2LLi", "nc", "avx2")
+TARGET_BUILTIN(__builtin_ia32_pblendd128, "V4iV4iV4iIi", "nc", "avx2")
+TARGET_BUILTIN(__builtin_ia32_pblendd256, "V8iV8iV8iIi", "nc", "avx2")
TARGET_BUILTIN(__builtin_ia32_permvarsi256, "V8iV8iV8i", "nc", "avx2")
TARGET_BUILTIN(__builtin_ia32_permvarsf256, "V8fV8fV8i", "nc", "avx2")
TARGET_BUILTIN(__builtin_ia32_permti256, "V4LLiV4LLiV4LLiIc", "nc", "avx2")
Ops[0] = Builder.CreateBitCast(Ops[0], PtrTy);
return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
}
+ case X86::BI__builtin_ia32_pblendw128:
+ case X86::BI__builtin_ia32_blendpd:
+ case X86::BI__builtin_ia32_blendps:
+ case X86::BI__builtin_ia32_blendpd256:
+ case X86::BI__builtin_ia32_blendps256:
+ case X86::BI__builtin_ia32_pblendw256:
+ case X86::BI__builtin_ia32_pblendd128:
+ case X86::BI__builtin_ia32_pblendd256: {
+ unsigned NumElts = Ops[0]->getType()->getVectorNumElements();
+ unsigned Imm = cast<llvm::ConstantInt>(Ops[2])->getZExtValue();
+
+ uint32_t Indices[16];
+ // If there are more than 8 elements, the immediate is used twice so make
+ // sure we handle that.
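+ // A set mask bit selects the element from the second source (Ops[1]); a
+ // clear bit keeps the element from the first source (Ops[0]).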
+ for (unsigned i = 0; i != NumElts; ++i)
+ Indices[i] = ((Imm >> (i % 8)) & 0x1) ? NumElts + i : i;
+
+ return Builder.CreateShuffleVector(Ops[0], Ops[1],
+ makeArrayRef(Indices, NumElts),
+ "blend");
+ }
case X86::BI__builtin_ia32_palignr128:
case X86::BI__builtin_ia32_palignr256:
case X86::BI__builtin_ia32_palignr512: {
}
#define _mm256_blend_epi16(V1, V2, M) \
- (__m256i)__builtin_shufflevector((__v16hi)(__m256i)(V1), \
- (__v16hi)(__m256i)(V2), \
- (((M) & 0x01) ? 16 : 0), \
- (((M) & 0x02) ? 17 : 1), \
- (((M) & 0x04) ? 18 : 2), \
- (((M) & 0x08) ? 19 : 3), \
- (((M) & 0x10) ? 20 : 4), \
- (((M) & 0x20) ? 21 : 5), \
- (((M) & 0x40) ? 22 : 6), \
- (((M) & 0x80) ? 23 : 7), \
- (((M) & 0x01) ? 24 : 8), \
- (((M) & 0x02) ? 25 : 9), \
- (((M) & 0x04) ? 26 : 10), \
- (((M) & 0x08) ? 27 : 11), \
- (((M) & 0x10) ? 28 : 12), \
- (((M) & 0x20) ? 29 : 13), \
- (((M) & 0x40) ? 30 : 14), \
- (((M) & 0x80) ? 31 : 15))
+ (__m256i)__builtin_ia32_pblendw256((__v16hi)(__m256i)(V1), \
+ (__v16hi)(__m256i)(V2), (int)(M))
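+/* Example (illustrative): VPBLENDW applies the 8-bit mask to each 128-bit
+   lane, so M = 0x03 takes elements 0, 1, 8 and 9 from V2 and the rest
+   from V1. */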
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_cmpeq_epi8(__m256i __a, __m256i __b)
}
#define _mm_blend_epi32(V1, V2, M) \
- (__m128i)__builtin_shufflevector((__v4si)(__m128i)(V1), \
- (__v4si)(__m128i)(V2), \
- (((M) & 0x01) ? 4 : 0), \
- (((M) & 0x02) ? 5 : 1), \
- (((M) & 0x04) ? 6 : 2), \
- (((M) & 0x08) ? 7 : 3))
+ (__m128i)__builtin_ia32_pblendd128((__v4si)(__m128i)(V1), \
+ (__v4si)(__m128i)(V2), (int)(M))
#define _mm256_blend_epi32(V1, V2, M) \
- (__m256i)__builtin_shufflevector((__v8si)(__m256i)(V1), \
- (__v8si)(__m256i)(V2), \
- (((M) & 0x01) ? 8 : 0), \
- (((M) & 0x02) ? 9 : 1), \
- (((M) & 0x04) ? 10 : 2), \
- (((M) & 0x08) ? 11 : 3), \
- (((M) & 0x10) ? 12 : 4), \
- (((M) & 0x20) ? 13 : 5), \
- (((M) & 0x40) ? 14 : 6), \
- (((M) & 0x80) ? 15 : 7))
+ (__m256i)__builtin_ia32_pblendd256((__v8si)(__m256i)(V1), \
+ (__v8si)(__m256i)(V2), (int)(M))
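+/* Example (illustrative): unlike the 16-bit blend, VPBLENDD uses one mask
+   bit per 32-bit element across the whole vector, so M = 0x81 takes
+   elements 0 and 7 from V2 and the rest from V1. */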
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_broadcastb_epi8(__m128i __X)
/// operand \a V2 is copied to the same position in the destination.
/// \returns A 256-bit vector of [4 x double] containing the copied values.
#define _mm256_blend_pd(V1, V2, M) \
- (__m256d)__builtin_shufflevector((__v4df)(__m256d)(V1), \
- (__v4df)(__m256d)(V2), \
- (((M) & 0x01) ? 4 : 0), \
- (((M) & 0x02) ? 5 : 1), \
- (((M) & 0x04) ? 6 : 2), \
- (((M) & 0x08) ? 7 : 3))
+ (__m256d)__builtin_ia32_blendpd256((__v4df)(__m256d)(V1), \
+ (__v4df)(__m256d)(V2), (int)(M))
/// Merges 32-bit single-precision data values stored in either of the
/// two 256-bit vectors of [8 x float], as specified by the immediate
/// operand \a V2 is copied to the same position in the destination.
/// \returns A 256-bit vector of [8 x float] containing the copied values.
#define _mm256_blend_ps(V1, V2, M) \
- (__m256)__builtin_shufflevector((__v8sf)(__m256)(V1), \
- (__v8sf)(__m256)(V2), \
- (((M) & 0x01) ? 8 : 0), \
- (((M) & 0x02) ? 9 : 1), \
- (((M) & 0x04) ? 10 : 2), \
- (((M) & 0x08) ? 11 : 3), \
- (((M) & 0x10) ? 12 : 4), \
- (((M) & 0x20) ? 13 : 5), \
- (((M) & 0x40) ? 14 : 6), \
- (((M) & 0x80) ? 15 : 7))
+ (__m256)__builtin_ia32_blendps256((__v8sf)(__m256)(V1), \
+ (__v8sf)(__m256)(V2), (int)(M))
/// Merges 64-bit double-precision data values stored in either of the
/// two 256-bit vectors of [4 x double], as specified by the 256-bit vector
/// is copied to the same position in the result.
/// \returns A 128-bit vector of [2 x double] containing the copied values.
#define _mm_blend_pd(V1, V2, M) \
- (__m128d)__builtin_shufflevector((__v2df)(__m128d)(V1), \
- (__v2df)(__m128d)(V2), \
- (((M) & 0x01) ? 2 : 0), \
- (((M) & 0x02) ? 3 : 1))
+ (__m128d)__builtin_ia32_blendpd((__v2df)(__m128d)(V1), \
+ (__v2df)(__m128d)(V2), (int)(M))
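+/* Example (illustrative): _mm_blend_pd(a, b, 0x2) yields { a[0], b[1] };
+   each of the two low mask bits selects between the first operand (0) and
+   the second operand (1). */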
/// Returns a 128-bit vector of [4 x float] where the values are selected
/// from either the first or second operand as specified by the third
/// is copied to the same position in the result.
/// \returns A 128-bit vector of [4 x float] containing the copied values.
#define _mm_blend_ps(V1, V2, M) \
- (__m128)__builtin_shufflevector((__v4sf)(__m128)(V1), (__v4sf)(__m128)(V2), \
- (((M) & 0x01) ? 4 : 0), \
- (((M) & 0x02) ? 5 : 1), \
- (((M) & 0x04) ? 6 : 2), \
- (((M) & 0x08) ? 7 : 3))
+ (__m128)__builtin_ia32_blendps((__v4sf)(__m128)(V1), \
+ (__v4sf)(__m128)(V2), (int)(M))
/// Returns a 128-bit vector of [2 x double] where the values are
/// selected from either the first or second operand as specified by the
/// is copied to the same position in the result.
/// \returns A 128-bit vector of [8 x i16] containing the copied values.
#define _mm_blend_epi16(V1, V2, M) \
- (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(V1), \
- (__v8hi)(__m128i)(V2), \
- (((M) & 0x01) ? 8 : 0), \
- (((M) & 0x02) ? 9 : 1), \
- (((M) & 0x04) ? 10 : 2), \
- (((M) & 0x08) ? 11 : 3), \
- (((M) & 0x10) ? 12 : 4), \
- (((M) & 0x20) ? 13 : 5), \
- (((M) & 0x40) ? 14 : 6), \
- (((M) & 0x80) ? 15 : 7))
+ (__m128i)__builtin_ia32_pblendw128((__v8hi)(__m128i)(V1), \
+ (__v8hi)(__m128i)(V2), (int)(M))
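+/* Note: M must be an integer constant expression; the builtin's immediate
+   operand (the "Ii" in its prototype) is range-checked at compile time. */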
/* SSE4 Dword Multiply Instructions. */
/// Multiplies corresponding elements of two 128-bit vectors of [4 x i32]
i = 1; l = 0; u = 7;
break;
case X86::BI__builtin_ia32_sha1rnds4:
+ case X86::BI__builtin_ia32_blendpd:
case X86::BI__builtin_ia32_vec_set_v4hi:
case X86::BI__builtin_ia32_vec_set_v4si:
case X86::BI__builtin_ia32_vec_set_v4di:
case X86::BI__builtin_ia32_vec_ext_v16hi:
i = 1; l = 0; u = 15;
break;
+ case X86::BI__builtin_ia32_pblendd128:
+ case X86::BI__builtin_ia32_blendps:
+ case X86::BI__builtin_ia32_blendpd256:
case X86::BI__builtin_ia32_roundss:
case X86::BI__builtin_ia32_roundsd:
case X86::BI__builtin_ia32_rangepd128_mask:
case X86::BI__builtin_ia32_fpclassss_mask:
i = 1; l = 0; u = 255;
break;
+ case X86::BI__builtin_ia32_pblendw128:
+ case X86::BI__builtin_ia32_pblendw256:
+ case X86::BI__builtin_ia32_blendps256:
+ case X86::BI__builtin_ia32_pblendd256:
case X86::BI__builtin_ia32_palignr128:
case X86::BI__builtin_ia32_palignr256:
case X86::BI__builtin_ia32_palignr512:
__m256d test_mm256_blend_pd(__m256d A, __m256d B) {
// CHECK-LABEL: test_mm256_blend_pd
// CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
- return _mm256_blend_pd(A, B, 0x35);
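+ // 0x05 stays within the 4-bit blend mask for [4 x double]; the low four
+ // bits of 0x35 are 0x5, so the expected shuffle indices are unchanged.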
+ return _mm256_blend_pd(A, B, 0x05);
}
__m256 test_mm256_blend_ps(__m256 A, __m256 B) {
// CHECK-LABEL: test_mm_blend_epi32
// CHECK-NOT: @llvm.x86.avx2.pblendd.128
// CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
- return _mm_blend_epi32(a, b, 0x35);
+ return _mm_blend_epi32(a, b, 0x05);
}
__m256i test_mm256_blend_epi32(__m256i a, __m256i b) {