TARGET_BUILTIN(__builtin_ia32_insertps128, "V4fV4fV4fIc", "nc", "sse4.1")
TARGET_BUILTIN(__builtin_ia32_pblendvb128, "V16cV16cV16cV16c", "nc", "sse4.1")
+TARGET_BUILTIN(__builtin_ia32_pblendw128, "V8sV8sV8sIi", "nc", "sse4.1")
+TARGET_BUILTIN(__builtin_ia32_blendpd, "V2dV2dV2dIi", "nc", "sse4.1")
+TARGET_BUILTIN(__builtin_ia32_blendps, "V4fV4fV4fIi", "nc", "sse4.1")
TARGET_BUILTIN(__builtin_ia32_blendvpd, "V2dV2dV2dV2d", "nc", "sse4.1")
TARGET_BUILTIN(__builtin_ia32_blendvps, "V4fV4fV4fV4f", "nc", "sse4.1")
TARGET_BUILTIN(__builtin_ia32_packusdw128, "V8sV4iV4i", "nc", "sse4.1")
TARGET_BUILTIN(__builtin_ia32_vpermilvarps, "V4fV4fV4i", "nc", "avx")
TARGET_BUILTIN(__builtin_ia32_vpermilvarpd256, "V4dV4dV4LLi", "nc", "avx")
TARGET_BUILTIN(__builtin_ia32_vpermilvarps256, "V8fV8fV8i", "nc", "avx")
+TARGET_BUILTIN(__builtin_ia32_blendpd256, "V4dV4dV4dIi", "nc", "avx")
+TARGET_BUILTIN(__builtin_ia32_blendps256, "V8fV8fV8fIi", "nc", "avx")
TARGET_BUILTIN(__builtin_ia32_blendvpd256, "V4dV4dV4dV4d", "nc", "avx")
TARGET_BUILTIN(__builtin_ia32_blendvps256, "V8fV8fV8fV8f", "nc", "avx")
TARGET_BUILTIN(__builtin_ia32_dpps256, "V8fV8fV8fIc", "nc", "avx")
TARGET_BUILTIN(__builtin_ia32_psubusw256, "V16sV16sV16s", "nc", "avx2")
TARGET_BUILTIN(__builtin_ia32_palignr256, "V32cV32cV32cIi", "nc", "avx2")
TARGET_BUILTIN(__builtin_ia32_pblendvb256, "V32cV32cV32cV32c", "nc", "avx2")
+TARGET_BUILTIN(__builtin_ia32_pblendw256, "V16sV16sV16sIi", "nc", "avx2")
TARGET_BUILTIN(__builtin_ia32_phaddw256, "V16sV16sV16s", "nc", "avx2")
TARGET_BUILTIN(__builtin_ia32_phaddd256, "V8iV8iV8i", "nc", "avx2")
TARGET_BUILTIN(__builtin_ia32_phaddsw256, "V16sV16sV16s", "nc", "avx2")
TARGET_BUILTIN(__builtin_ia32_psrld256, "V8iV8iV4i", "nc", "avx2")
TARGET_BUILTIN(__builtin_ia32_psrlqi256, "V4LLiV4LLii", "nc", "avx2")
TARGET_BUILTIN(__builtin_ia32_psrlq256, "V4LLiV4LLiV2LLi", "nc", "avx2")
+TARGET_BUILTIN(__builtin_ia32_pblendd128, "V4iV4iV4iIi", "nc", "avx2")
+TARGET_BUILTIN(__builtin_ia32_pblendd256, "V8iV8iV8iIi", "nc", "avx2")
TARGET_BUILTIN(__builtin_ia32_permvarsi256, "V8iV8iV8i", "nc", "avx2")
TARGET_BUILTIN(__builtin_ia32_permvarsf256, "V8fV8fV8i", "nc", "avx2")
TARGET_BUILTIN(__builtin_ia32_permti256, "V4LLiV4LLiV4LLiIc", "nc", "avx2")
Ops[0] = Builder.CreateBitCast(Ops[0], PtrTy);
return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
}
+ case X86::BI__builtin_ia32_pblendw128:
+ case X86::BI__builtin_ia32_blendpd:
+ case X86::BI__builtin_ia32_blendps:
+ case X86::BI__builtin_ia32_blendpd256:
+ case X86::BI__builtin_ia32_blendps256:
+ case X86::BI__builtin_ia32_pblendw256:
+ case X86::BI__builtin_ia32_pblendd128:
+ case X86::BI__builtin_ia32_pblendd256: {
+ unsigned NumElts = Ops[0]->getType()->getVectorNumElements();
+ unsigned Imm = cast<llvm::ConstantInt>(Ops[2])->getZExtValue();
+
+ uint32_t Indices[16];
+ // If there are more than 8 elements, the immediate is used twice so make
+ // sure we handle that.
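+ // A set mask bit selects the element from the second source (Ops[1]); a
+ // clear bit keeps the element from the first source (Ops[0]).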
+ for (unsigned i = 0; i != NumElts; ++i)
+ Indices[i] = ((Imm >> (i % 8)) & 0x1) ? NumElts + i : i;
+
+ return Builder.CreateShuffleVector(Ops[0], Ops[1],
+ makeArrayRef(Indices, NumElts),
+ "blend");
+ }
case X86::BI__builtin_ia32_palignr128:
case X86::BI__builtin_ia32_palignr256:
case X86::BI__builtin_ia32_palignr512: {
}
#define _mm256_blend_epi16(V1, V2, M) \
- (__m256i)__builtin_shufflevector((__v16hi)(__m256i)(V1), \
- (__v16hi)(__m256i)(V2), \
- (((M) & 0x01) ? 16 : 0), \
- (((M) & 0x02) ? 17 : 1), \
- (((M) & 0x04) ? 18 : 2), \
- (((M) & 0x08) ? 19 : 3), \
- (((M) & 0x10) ? 20 : 4), \
- (((M) & 0x20) ? 21 : 5), \
- (((M) & 0x40) ? 22 : 6), \
- (((M) & 0x80) ? 23 : 7), \
- (((M) & 0x01) ? 24 : 8), \
- (((M) & 0x02) ? 25 : 9), \
- (((M) & 0x04) ? 26 : 10), \
- (((M) & 0x08) ? 27 : 11), \
- (((M) & 0x10) ? 28 : 12), \
- (((M) & 0x20) ? 29 : 13), \
- (((M) & 0x40) ? 30 : 14), \
- (((M) & 0x80) ? 31 : 15))
+ (__m256i)__builtin_ia32_pblendw256((__v16hi)(__m256i)(V1), \
+ (__v16hi)(__m256i)(V2), (int)(M))
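+/* Example (illustrative): VPBLENDW applies the 8-bit mask to each 128-bit
+   lane, so M = 0x03 takes elements 0, 1, 8 and 9 from V2 and the rest
+   from V1. */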
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_cmpeq_epi8(__m256i __a, __m256i __b)
}
#define _mm_blend_epi32(V1, V2, M) \
- (__m128i)__builtin_shufflevector((__v4si)(__m128i)(V1), \
- (__v4si)(__m128i)(V2), \
- (((M) & 0x01) ? 4 : 0), \
- (((M) & 0x02) ? 5 : 1), \
- (((M) & 0x04) ? 6 : 2), \
- (((M) & 0x08) ? 7 : 3))
+ (__m128i)__builtin_ia32_pblendd128((__v4si)(__m128i)(V1), \
+ (__v4si)(__m128i)(V2), (int)(M))
#define _mm256_blend_epi32(V1, V2, M) \
- (__m256i)__builtin_shufflevector((__v8si)(__m256i)(V1), \
- (__v8si)(__m256i)(V2), \
- (((M) & 0x01) ? 8 : 0), \
- (((M) & 0x02) ? 9 : 1), \
- (((M) & 0x04) ? 10 : 2), \
- (((M) & 0x08) ? 11 : 3), \
- (((M) & 0x10) ? 12 : 4), \
- (((M) & 0x20) ? 13 : 5), \
- (((M) & 0x40) ? 14 : 6), \
- (((M) & 0x80) ? 15 : 7))
+ (__m256i)__builtin_ia32_pblendd256((__v8si)(__m256i)(V1), \
+ (__v8si)(__m256i)(V2), (int)(M))
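+/* Example (illustrative): unlike the 16-bit blend, VPBLENDD uses one mask
+   bit per 32-bit element across the whole vector, so M = 0x81 takes
+   elements 0 and 7 from V2 and the rest from V1. */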
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_broadcastb_epi8(__m128i __X)
/// operand \a V2 is copied to the same position in the destination.
/// \returns A 256-bit vector of [4 x double] containing the copied values.
#define _mm256_blend_pd(V1, V2, M) \
- (__m256d)__builtin_shufflevector((__v4df)(__m256d)(V1), \
- (__v4df)(__m256d)(V2), \
- (((M) & 0x01) ? 4 : 0), \
- (((M) & 0x02) ? 5 : 1), \
- (((M) & 0x04) ? 6 : 2), \
- (((M) & 0x08) ? 7 : 3))
+ (__m256d)__builtin_ia32_blendpd256((__v4df)(__m256d)(V1), \
+ (__v4df)(__m256d)(V2), (int)(M))
/// Merges 32-bit single-precision data values stored in either of the
/// two 256-bit vectors of [8 x float], as specified by the immediate
/// operand \a V2 is copied to the same position in the destination.
/// \returns A 256-bit vector of [8 x float] containing the copied values.
#define _mm256_blend_ps(V1, V2, M) \
- (__m256)__builtin_shufflevector((__v8sf)(__m256)(V1), \
- (__v8sf)(__m256)(V2), \
- (((M) & 0x01) ? 8 : 0), \
- (((M) & 0x02) ? 9 : 1), \
- (((M) & 0x04) ? 10 : 2), \
- (((M) & 0x08) ? 11 : 3), \
- (((M) & 0x10) ? 12 : 4), \
- (((M) & 0x20) ? 13 : 5), \
- (((M) & 0x40) ? 14 : 6), \
- (((M) & 0x80) ? 15 : 7))
+ (__m256)__builtin_ia32_blendps256((__v8sf)(__m256)(V1), \
+ (__v8sf)(__m256)(V2), (int)(M))
/// Merges 64-bit double-precision data values stored in either of the
/// two 256-bit vectors of [4 x double], as specified by the 256-bit vector
/// is copied to the same position in the result.
/// \returns A 128-bit vector of [2 x double] containing the copied values.
#define _mm_blend_pd(V1, V2, M) \
- (__m128d)__builtin_shufflevector((__v2df)(__m128d)(V1), \
- (__v2df)(__m128d)(V2), \
- (((M) & 0x01) ? 2 : 0), \
- (((M) & 0x02) ? 3 : 1))
+ (__m128d)__builtin_ia32_blendpd((__v2df)(__m128d)(V1), \
+ (__v2df)(__m128d)(V2), (int)(M))
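+/* Example (illustrative): _mm_blend_pd(a, b, 0x2) yields { a[0], b[1] };
+   each of the two low mask bits selects between the first operand (0) and
+   the second operand (1). */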
/// Returns a 128-bit vector of [4 x float] where the values are selected
/// from either the first or second operand as specified by the third
/// is copied to the same position in the result.
/// \returns A 128-bit vector of [4 x float] containing the copied values.
#define _mm_blend_ps(V1, V2, M) \
- (__m128)__builtin_shufflevector((__v4sf)(__m128)(V1), (__v4sf)(__m128)(V2), \
- (((M) & 0x01) ? 4 : 0), \
- (((M) & 0x02) ? 5 : 1), \
- (((M) & 0x04) ? 6 : 2), \
- (((M) & 0x08) ? 7 : 3))
+ (__m128)__builtin_ia32_blendps((__v4sf)(__m128)(V1), \
+ (__v4sf)(__m128)(V2), (int)(M))
/// Returns a 128-bit vector of [2 x double] where the values are
/// selected from either the first or second operand as specified by the
/// is copied to the same position in the result.
/// \returns A 128-bit vector of [8 x i16] containing the copied values.
#define _mm_blend_epi16(V1, V2, M) \
- (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(V1), \
- (__v8hi)(__m128i)(V2), \
- (((M) & 0x01) ? 8 : 0), \
- (((M) & 0x02) ? 9 : 1), \
- (((M) & 0x04) ? 10 : 2), \
- (((M) & 0x08) ? 11 : 3), \
- (((M) & 0x10) ? 12 : 4), \
- (((M) & 0x20) ? 13 : 5), \
- (((M) & 0x40) ? 14 : 6), \
- (((M) & 0x80) ? 15 : 7))
+ (__m128i)__builtin_ia32_pblendw128((__v8hi)(__m128i)(V1), \
+ (__v8hi)(__m128i)(V2), (int)(M))
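+/* Note: M must be an integer constant expression; the builtin's immediate
+   operand (the "Ii" in its prototype) is range-checked at compile time. */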
/* SSE4 Dword Multiply Instructions. */
/// Multiplies corresponding elements of two 128-bit vectors of [4 x i32]
i = 1; l = 0; u = 7;
break;
case X86::BI__builtin_ia32_sha1rnds4:
+ case X86::BI__builtin_ia32_blendpd:
case X86::BI__builtin_ia32_vec_set_v4hi:
case X86::BI__builtin_ia32_vec_set_v4si:
case X86::BI__builtin_ia32_vec_set_v4di:
case X86::BI__builtin_ia32_vec_ext_v16hi:
i = 1; l = 0; u = 15;
break;
+ case X86::BI__builtin_ia32_pblendd128:
+ case X86::BI__builtin_ia32_blendps:
+ case X86::BI__builtin_ia32_blendpd256:
case X86::BI__builtin_ia32_roundss:
case X86::BI__builtin_ia32_roundsd:
case X86::BI__builtin_ia32_rangepd128_mask:
case X86::BI__builtin_ia32_fpclassss_mask:
i = 1; l = 0; u = 255;
break;
+ case X86::BI__builtin_ia32_pblendw128:
+ case X86::BI__builtin_ia32_pblendw256:
+ case X86::BI__builtin_ia32_blendps256:
+ case X86::BI__builtin_ia32_pblendd256:
case X86::BI__builtin_ia32_palignr128:
case X86::BI__builtin_ia32_palignr256:
case X86::BI__builtin_ia32_palignr512:
__m256d test_mm256_blend_pd(__m256d A, __m256d B) {
// CHECK-LABEL: test_mm256_blend_pd
// CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
- return _mm256_blend_pd(A, B, 0x35);
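+ // 0x05 stays within the 4-bit blend mask for [4 x double]; the low four
+ // bits of 0x35 are 0x5, so the expected shuffle indices are unchanged.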
+ return _mm256_blend_pd(A, B, 0x05);
}
__m256 test_mm256_blend_ps(__m256 A, __m256 B) {
// CHECK-LABEL: test_mm_blend_epi32
// CHECK-NOT: @llvm.x86.avx2.pblendd.128
// CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
- return _mm_blend_epi32(a, b, 0x35);
+ return _mm_blend_epi32(a, b, 0x05);
}
__m256i test_mm256_blend_epi32(__m256i a, __m256i b) {