/// Bit[4]=1 indicates that bits[127:64] of operand \a Y are used.
/// \returns The 128-bit integer vector containing the result of the carry-less
/// multiplication of the selected 64-bit values.
-#define _mm_clmulepi64_si128(__X, __Y, __I) \
- ((__m128i)__builtin_ia32_pclmulqdq128((__v2di)(__m128i)(__X), \
- (__v2di)(__m128i)(__Y), (char)(__I)))
+#define _mm_clmulepi64_si128(X, Y, I) \
+ ((__m128i)__builtin_ia32_pclmulqdq128((__v2di)(__m128i)(X), \
+ (__v2di)(__m128i)(Y), (char)(I)))
#endif /* __WMMINTRIN_PCLMUL_H */
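/* A minimal usage sketch for _mm_clmulepi64_si128, assuming a PCLMUL-capable
   target (compile with -mpclmul) and an x86-64 host for _mm_cvtsi128_si64.
   Immediate 0x00 selects bits[63:0] of both operands. */
#include <wmmintrin.h>
#include <stdio.h>

int main(void) {
  __m128i x = _mm_set_epi64x(0, 0x3);  /* GF(2) polynomial x + 1   */
  __m128i y = _mm_set_epi64x(0, 0x5);  /* GF(2) polynomial x^2 + 1 */
  __m128i r = _mm_clmulepi64_si128(x, y, 0x00);
  /* Carry-less product: (x+1)(x^2+1) = x^3 + x^2 + x + 1 = 0xF */
  printf("0x%llx\n", (unsigned long long)_mm_cvtsi128_si64(r));
  return 0;
}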
#define _mm512_mask_extractf32x8_ps(W, U, A, imm) \
(__m256)__builtin_ia32_selectps_256((__mmask8)(U), \
(__v8sf)_mm512_extractf32x8_ps((A), (imm)), \
- (__v8sf)(W))
+ (__v8sf)(__m256)(W))
#define _mm512_maskz_extractf32x8_ps(U, A, imm) \
(__m256)__builtin_ia32_selectps_256((__mmask8)(U), \
#define _mm512_mask_extractf64x2_pd(W, U, A, imm) \
(__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \
(__v2df)_mm512_extractf64x2_pd((A), (imm)), \
- (__v2df)(W))
+ (__v2df)(__m128d)(W))
#define _mm512_maskz_extractf64x2_pd(U, A, imm) \
(__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \
#define _mm512_mask_extracti32x8_epi32(W, U, A, imm) \
(__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
(__v8si)_mm512_extracti32x8_epi32((A), (imm)), \
- (__v8si)(W))
+ (__v8si)(__m256i)(W))
#define _mm512_maskz_extracti32x8_epi32(U, A, imm) \
(__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
#define _mm512_mask_extracti64x2_epi64(W, U, A, imm) \
(__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
(__v2di)_mm512_extracti64x2_epi64((A), (imm)), \
- (__v2di)(W))
+ (__v2di)(__m128i)(W))
#define _mm512_maskz_extracti64x2_epi64(U, A, imm) \
(__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
#define _mm512_mask_insertf32x8(W, U, A, B, imm) \
(__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
(__v16sf)_mm512_insertf32x8((A), (B), (imm)), \
- (__v16sf)(W))
+ (__v16sf)(__m512)(W))
#define _mm512_maskz_insertf32x8(U, A, B, imm) \
(__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
#define _mm512_mask_insertf64x2(W, U, A, B, imm) \
(__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
(__v8df)_mm512_insertf64x2((A), (B), (imm)), \
- (__v8df)(W))
+ (__v8df)(__m512d)(W))
#define _mm512_maskz_insertf64x2(U, A, B, imm) \
(__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
#define _mm512_mask_inserti32x8(W, U, A, B, imm) \
(__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
(__v16si)_mm512_inserti32x8((A), (B), (imm)), \
- (__v16si)(W))
+ (__v16si)(__m512i)(W))
#define _mm512_maskz_inserti32x8(U, A, B, imm) \
(__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
#define _mm512_mask_inserti64x2(W, U, A, B, imm) \
(__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
(__v8di)_mm512_inserti64x2((A), (B), (imm)), \
- (__v8di)(W))
+ (__v8di)(__m512i)(W))
#define _mm512_maskz_inserti64x2(U, A, B, imm) \
(__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_andnot_si512 (__m512i __A, __m512i __B)
{
- return (__m512i)(~(__v8du)(__A) & (__v8du)__B);
+ return (__m512i)(~(__v8du)__A & (__v8du)__B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_andnot_epi32 (__m512i __A, __m512i __B)
{
- return (__m512i)(~(__v16su)(__A) & (__v16su)__B);
+ return (__m512i)(~(__v16su)__A & (__v16su)__B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_andnot_epi64(__m512i __A, __m512i __B)
{
- return (__m512i)(~(__v8du)(__A) & (__v8du)__B);
+ return (__m512i)(~(__v8du)__A & (__v8du)__B);
}
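/* A small sketch of the andnot semantics above (~__A & __B per bit),
   assuming an AVX-512F target (compile with -mavx512f). */
#include <immintrin.h>
#include <stdio.h>

int main(void) {
  __m512i a = _mm512_set1_epi64(0x0F0F0F0F0F0F0F0FLL);
  __m512i b = _mm512_set1_epi64(0x3333333333333333LL);
  __m512i r = _mm512_andnot_si512(a, b);  /* ~a & b in every bit position */
  long long out[8];
  _mm512_storeu_si512(out, r);
  printf("0x%llx\n", (unsigned long long)out[0]);  /* 0x3030303030303030 */
  return 0;
}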
static __inline__ __m512i __DEFAULT_FN_ATTRS
#define _mm512_mask_extractf64x4_pd(W, U, A, imm) \
(__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
(__v4df)_mm512_extractf64x4_pd((A), (imm)), \
- (__v4df)(W))
+ (__v4df)(__m256d)(W))
#define _mm512_maskz_extractf64x4_pd(U, A, imm) \
(__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
#define _mm512_mask_extractf32x4_ps(W, U, A, imm) \
(__m128)__builtin_ia32_selectps_128((__mmask8)(U), \
(__v4sf)_mm512_extractf32x4_ps((A), (imm)), \
- (__v4sf)(W))
+ (__v4sf)(__m128)(W))
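/* In all of these masked macros, W supplies the merge source and U the
   write-mask: result lanes whose mask bit is 0 keep W's value. A minimal
   sketch using _mm512_mask_extractf32x4_ps (assumes -mavx512f). */
#include <immintrin.h>
#include <stdio.h>

int main(void) {
  __m512 a = _mm512_set1_ps(2.0f);
  __m128 w = _mm_set1_ps(-1.0f);
  /* Mask 0x5 = 0b0101: lanes 0 and 2 come from the extract, 1 and 3 from w. */
  __m128 r = _mm512_mask_extractf32x4_ps(w, 0x5, a, 1);
  float out[4];
  _mm_storeu_ps(out, r);
  printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  /* 2 -1 2 -1 */
  return 0;
}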
#define _mm512_maskz_extractf32x4_ps(U, A, imm) \
(__m128)__builtin_ia32_selectps_128((__mmask8)(U), \
#define _mm512_mask_cvt_roundpd_epu32(W, U, A, R) \
(__m256i)__builtin_ia32_cvtpd2udq512_mask((__v8df)(__m512d)(A), \
- (__v8si)(W), \
+ (__v8si)(__m256i)(W), \
(__mmask8)(U), (int)(R))
#define _mm512_maskz_cvt_roundpd_epu32(U, A, R) \
#define _mm512_mask_extracti32x4_epi32(W, U, A, imm) \
(__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
(__v4si)_mm512_extracti32x4_epi32((A), (imm)), \
- (__v4si)(W))
+ (__v4si)(__m128i)(W))
#define _mm512_maskz_extracti32x4_epi32(U, A, imm) \
(__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
#define _mm512_mask_extracti64x4_epi64(W, U, A, imm) \
(__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
(__v4di)_mm512_extracti64x4_epi64((A), (imm)), \
- (__v4di)(W))
+ (__v4di)(__m256i)(W))
#define _mm512_maskz_extracti64x4_epi64(U, A, imm) \
(__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
#define _mm512_mask_insertf64x4(W, U, A, B, imm) \
(__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
(__v8df)_mm512_insertf64x4((A), (B), (imm)), \
- (__v8df)(W))
+ (__v8df)(__m512d)(W))
#define _mm512_maskz_insertf64x4(U, A, B, imm) \
(__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
#define _mm512_mask_inserti64x4(W, U, A, B, imm) \
(__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
(__v8di)_mm512_inserti64x4((A), (B), (imm)), \
- (__v8di)(W))
+ (__v8di)(__m512i)(W))
#define _mm512_maskz_inserti64x4(U, A, B, imm) \
(__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
#define _mm512_mask_insertf32x4(W, U, A, B, imm) \
(__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
(__v16sf)_mm512_insertf32x4((A), (B), (imm)), \
- (__v16sf)(W))
+ (__v16sf)(__m512)(W))
#define _mm512_maskz_insertf32x4(U, A, B, imm) \
(__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
#define _mm512_mask_inserti32x4(W, U, A, B, imm) \
(__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
(__v16si)_mm512_inserti32x4((A), (B), (imm)), \
- (__v16si)(W))
+ (__v16si)(__m512i)(W))
#define _mm512_maskz_inserti32x4(U, A, B, imm) \
(__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_mask_cvtsd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128d __B)
{
- return __builtin_ia32_cvtsd2ss_round_mask ((__v4sf)(__A),
- (__v2df)(__B),
- (__v4sf)(__W),
- (__mmask8)(__U), _MM_FROUND_CUR_DIRECTION);
+ return __builtin_ia32_cvtsd2ss_round_mask ((__v4sf)__A,
+ (__v2df)__B,
+ (__v4sf)__W,
+ (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_maskz_cvtsd_ss (__mmask8 __U, __m128 __A, __m128d __B)
{
- return __builtin_ia32_cvtsd2ss_round_mask ((__v4sf)(__A),
- (__v2df)(__B),
+ return __builtin_ia32_cvtsd2ss_round_mask ((__v4sf)__A,
+ (__v2df)__B,
(__v4sf)_mm_setzero_ps(),
- (__mmask8)(__U), _MM_FROUND_CUR_DIRECTION);
+ (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
}
#define _mm_cvtss_i32 _mm_cvtss_si32
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_mask_cvtss_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128 __B)
{
- return __builtin_ia32_cvtss2sd_round_mask((__v2df)(__A),
- (__v4sf)(__B),
- (__v2df)(__W),
- (__mmask8)(__U), _MM_FROUND_CUR_DIRECTION);
+ return __builtin_ia32_cvtss2sd_round_mask((__v2df)__A,
+ (__v4sf)__B,
+ (__v2df)__W,
+ (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_maskz_cvtss_sd (__mmask8 __U, __m128d __A, __m128 __B)
{
- return __builtin_ia32_cvtss2sd_round_mask((__v2df)(__A),
- (__v4sf)(__B),
- (__v2df)_mm_setzero_pd(),
- (__mmask8)(__U), _MM_FROUND_CUR_DIRECTION);
+ return __builtin_ia32_cvtss2sd_round_mask((__v2df)__A,
+ (__v4sf)__B,
+ (__v2df)_mm_setzero_pd(),
+ (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
}
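/* A merge/zero-masking sketch for the scalar conversions above: bit 0 of
   __U selects between the converted low element of __B and either __W
   (mask form) or zero (maskz form); the upper lanes always come from __A.
   Assumes -mavx512f. */
#include <immintrin.h>
#include <stdio.h>

int main(void) {
  __m128  a = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);
  __m128d b = _mm_set_sd(9.0);
  __m128  w = _mm_set1_ps(-1.0f);

  __m128 merged = _mm_mask_cvtsd_ss(w, 0x1, a, b);  /* low lane: (float)9.0 */
  __m128 kept   = _mm_mask_cvtsd_ss(w, 0x0, a, b);  /* low lane: w[0] = -1  */
  __m128 zeroed = _mm_maskz_cvtsd_ss(0x0, a, b);    /* low lane: 0.0        */

  printf("%g %g %g\n", _mm_cvtss_f32(merged), _mm_cvtss_f32(kept),
         _mm_cvtss_f32(zeroed));  /* 9 -1 0 */
  return 0;
}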
static __inline__ __m128d __DEFAULT_FN_ATTRS
}
#define _mm512_mask_shldi_epi64(S, U, A, B, I) \
- (__m512i)__builtin_ia32_vpshldq512_mask((__v8di)(A), \
- (__v8di)(B), \
+ (__m512i)__builtin_ia32_vpshldq512_mask((__v8di)(__m512i)(A), \
+ (__v8di)(__m512i)(B), \
(int)(I), \
- (__v8di)(S), \
+ (__v8di)(__m512i)(S), \
(__mmask8)(U))
#define _mm512_maskz_shldi_epi64(U, A, B, I) \
_mm512_mask_shldi_epi64(_mm512_setzero_si512(), (U), (A), (B), (I))
#define _mm512_mask_shldi_epi32(S, U, A, B, I) \
- (__m512i)__builtin_ia32_vpshldd512_mask((__v16si)(A), \
- (__v16si)(B), \
+ (__m512i)__builtin_ia32_vpshldd512_mask((__v16si)(__m512i)(A), \
+ (__v16si)(__m512i)(B), \
(int)(I), \
- (__v16si)(S), \
+ (__v16si)(__m512i)(S), \
(__mmask16)(U))
#define _mm512_maskz_shldi_epi32(U, A, B, I) \
_mm512_mask_shldi_epi32(_mm512_setzero_si512(), (U), (A), (B), (I))
#define _mm512_mask_shldi_epi16(S, U, A, B, I) \
- (__m512i)__builtin_ia32_vpshldw512_mask((__v32hi)(A), \
- (__v32hi)(B), \
+ (__m512i)__builtin_ia32_vpshldw512_mask((__v32hi)(__m512i)(A), \
+ (__v32hi)(__m512i)(B), \
(int)(I), \
- (__v32hi)(S), \
+ (__v32hi)(__m512i)(S), \
(__mmask32)(U))
#define _mm512_maskz_shldi_epi16(U, A, B, I) \
_mm512_mask_shldi_epi16(_mm512_setzero_si512(), (U), (A), (B), (I))
#define _mm512_mask_shrdi_epi64(S, U, A, B, I) \
- (__m512i)__builtin_ia32_vpshrdq512_mask((__v8di)(A), \
- (__v8di)(B), \
+ (__m512i)__builtin_ia32_vpshrdq512_mask((__v8di)(__m512i)(A), \
+ (__v8di)(__m512i)(B), \
(int)(I), \
- (__v8di)(S), \
+ (__v8di)(__m512i)(S), \
(__mmask8)(U))
#define _mm512_maskz_shrdi_epi64(U, A, B, I) \
_mm512_mask_shrdi_epi64(_mm512_setzero_si512(), (U), (A), (B), (I))
#define _mm512_mask_shrdi_epi32(S, U, A, B, I) \
- (__m512i)__builtin_ia32_vpshrdd512_mask((__v16si)(A), \
- (__v16si)(B), \
+ (__m512i)__builtin_ia32_vpshrdd512_mask((__v16si)(__m512i)(A), \
+ (__v16si)(__m512i)(B), \
(int)(I), \
- (__v16si)(S), \
+ (__v16si)(__m512i)(S), \
(__mmask16)(U))
#define _mm512_maskz_shrdi_epi32(U, A, B, I) \
_mm512_mask_shrdi_epi32(_mm512_setzero_si512(), (U), (A), (B), (I))
#define _mm512_mask_shrdi_epi16(S, U, A, B, I) \
- (__m512i)__builtin_ia32_vpshrdw512_mask((__v32hi)(A), \
- (__v32hi)(B), \
+ (__m512i)__builtin_ia32_vpshrdw512_mask((__v32hi)(__m512i)(A), \
+ (__v32hi)(__m512i)(B), \
(int)(I), \
- (__v32hi)(S), \
+ (__v32hi)(__m512i)(S), \
(__mmask32)(U))
#define _mm512_maskz_shrdi_epi16(U, A, B, I) \
_mm512_mask_shrdi_epi16(_mm512_setzero_si512(), (U), (A), (B), (I))
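/* The vpshld/vpshrd builtins above implement funnel shifts: each lane of A
   is concatenated with the matching lane of B, the double-width value is
   shifted by I, and one half is kept. A minimal sketch with the left form
   (assumes an AVX512VBMI2 target, -mavx512vbmi2). */
#include <immintrin.h>
#include <stdio.h>

int main(void) {
  __m512i a = _mm512_set1_epi64(0x1);                              /* high half */
  __m512i b = _mm512_set1_epi64((long long)0x8000000000000000ULL); /* low half  */
  /* Upper 64 bits of (a:b) << 4, i.e. (a << 4) | (b >> 60) = 0x18 per lane. */
  __m512i r = _mm512_shldi_epi64(a, b, 4);
  long long out[8];
  _mm512_storeu_si512(out, r);
  printf("0x%llx\n", (unsigned long long)out[0]);  /* 0x18 */
  return 0;
}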
#define _mm256_mask_extractf64x2_pd(W, U, A, imm) \
(__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \
(__v2df)_mm256_extractf64x2_pd((A), (imm)), \
- (__v2df)(W))
+ (__v2df)(__m128d)(W))
#define _mm256_maskz_extractf64x2_pd(U, A, imm) \
(__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \
#define _mm256_mask_extracti64x2_epi64(W, U, A, imm) \
(__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
(__v2di)_mm256_extracti64x2_epi64((A), (imm)), \
- (__v2di)(W))
+ (__v2di)(__m128i)(W))
#define _mm256_maskz_extracti64x2_epi64(U, A, imm) \
(__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
(__v2di)_mm_setzero_si128())
#define _mm256_insertf64x2(A, B, imm) \
- (__m256d)__builtin_shufflevector((__v4df)(A), \
+ (__m256d)__builtin_shufflevector((__v4df)(__m256d)(A), \
(__v4df)_mm256_castpd128_pd256((__m128d)(B)), \
((imm) & 0x1) ? 0 : 4, \
((imm) & 0x1) ? 1 : 5, \
#define _mm256_mask_insertf64x2(W, U, A, B, imm) \
(__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
(__v4df)_mm256_insertf64x2((A), (B), (imm)), \
- (__v4df)(W))
+ (__v4df)(__m256d)(W))
#define _mm256_maskz_insertf64x2(U, A, B, imm) \
(__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
(__v4df)_mm256_setzero_pd())
#define _mm256_inserti64x2(A, B, imm) \
- (__m256i)__builtin_shufflevector((__v4di)(A), \
+ (__m256i)__builtin_shufflevector((__v4di)(__m256i)(A), \
(__v4di)_mm256_castsi128_si256((__m128i)(B)), \
((imm) & 0x1) ? 0 : 4, \
((imm) & 0x1) ? 1 : 5, \
#define _mm256_mask_inserti64x2(W, U, A, B, imm) \
(__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
(__v4di)_mm256_inserti64x2((A), (B), (imm)), \
- (__v4di)(W))
+ (__v4di)(__m256i)(W))
#define _mm256_maskz_inserti64x2(U, A, B, imm) \
(__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
#define _mm256_mask_extractf32x4_ps(W, U, A, imm) \
(__m128)__builtin_ia32_selectps_128((__mmask8)(U), \
(__v4sf)_mm256_extractf32x4_ps((A), (imm)), \
- (__v4sf)(W))
+ (__v4sf)(__m128)(W))
#define _mm256_maskz_extractf32x4_ps(U, A, imm) \
(__m128)__builtin_ia32_selectps_128((__mmask8)(U), \
#define _mm256_mask_extracti32x4_epi32(W, U, A, imm) \
(__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
(__v4si)_mm256_extracti32x4_epi32((A), (imm)), \
- (__v4si)(W))
+ (__v4si)(__m128i)(W))
#define _mm256_maskz_extracti32x4_epi32(U, A, imm) \
(__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
(__v4si)_mm_setzero_si128())
#define _mm256_insertf32x4(A, B, imm) \
- (__m256)__builtin_shufflevector((__v8sf)(A), \
+ (__m256)__builtin_shufflevector((__v8sf)(__m256)(A), \
(__v8sf)_mm256_castps128_ps256((__m128)(B)), \
((imm) & 0x1) ? 0 : 8, \
((imm) & 0x1) ? 1 : 9, \
#define _mm256_mask_insertf32x4(W, U, A, B, imm) \
(__m256)__builtin_ia32_selectps_256((__mmask8)(U), \
(__v8sf)_mm256_insertf32x4((A), (B), (imm)), \
- (__v8sf)(W))
+ (__v8sf)(__m256)(W))
#define _mm256_maskz_insertf32x4(U, A, B, imm) \
(__m256)__builtin_ia32_selectps_256((__mmask8)(U), \
(__v8sf)_mm256_setzero_ps())
#define _mm256_inserti32x4(A, B, imm) \
- (__m256i)__builtin_shufflevector((__v8si)(A), \
+ (__m256i)__builtin_shufflevector((__v8si)(__m256i)(A), \
(__v8si)_mm256_castsi128_si256((__m128i)(B)), \
((imm) & 0x1) ? 0 : 8, \
((imm) & 0x1) ? 1 : 9, \
#define _mm256_mask_inserti32x4(W, U, A, B, imm) \
(__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
(__v8si)_mm256_inserti32x4((A), (B), (imm)), \
- (__v8si)(W))
+ (__v8si)(__m256i)(W))
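/* The 256-bit insert macros above use imm's low bit to pick which 128-bit
   half of A is replaced by B, as the shufflevector index pattern encodes.
   A quick sketch (assumes -mavx512f -mavx512vl). */
#include <immintrin.h>
#include <stdio.h>

int main(void) {
  __m256 a = _mm256_set1_ps(1.0f);
  __m128 b = _mm_set1_ps(7.0f);
  __m256 r = _mm256_insertf32x4(a, b, 1);  /* replace the upper 128 bits */
  float out[8];
  _mm256_storeu_ps(out, r);
  printf("%g %g\n", out[0], out[4]);  /* 1 7 */
  return 0;
}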
#define _mm256_maskz_inserti32x4(U, A, B, imm) \
(__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
}
#define _mm256_mask_shldi_epi64(S, U, A, B, I) \
- (__m256i)__builtin_ia32_vpshldq256_mask((__v4di)(A), \
- (__v4di)(B), \
+ (__m256i)__builtin_ia32_vpshldq256_mask((__v4di)(__m256i)(A), \
+ (__v4di)(__m256i)(B), \
(int)(I), \
- (__v4di)(S), \
+ (__v4di)(__m256i)(S), \
(__mmask8)(U))
#define _mm256_maskz_shldi_epi64(U, A, B, I) \
_mm256_mask_shldi_epi64(_mm256_setzero_si256(), (U), (A), (B), (I))
#define _mm_mask_shldi_epi64(S, U, A, B, I) \
- (__m128i)__builtin_ia32_vpshldq128_mask((__v2di)(A), \
- (__v2di)(B), \
+ (__m128i)__builtin_ia32_vpshldq128_mask((__v2di)(__m128i)(A), \
+ (__v2di)(__m128i)(B), \
(int)(I), \
- (__v2di)(S), \
+ (__v2di)(__m128i)(S), \
(__mmask8)(U))
#define _mm_maskz_shldi_epi64(U, A, B, I) \
_mm_mask_shldi_epi64(_mm_setzero_si128(), (U), (A), (B), (I))
#define _mm256_mask_shldi_epi32(S, U, A, B, I) \
- (__m256i)__builtin_ia32_vpshldd256_mask((__v8si)(A), \
- (__v8si)(B), \
+ (__m256i)__builtin_ia32_vpshldd256_mask((__v8si)(__m256i)(A), \
+ (__v8si)(__m256i)(B), \
(int)(I), \
- (__v8si)(S), \
+ (__v8si)(__m256i)(S), \
(__mmask8)(U))
#define _mm256_maskz_shldi_epi32(U, A, B, I) \
_mm256_mask_shldi_epi32(_mm256_setzero_si256(), (U), (A), (B), (I))
#define _mm_mask_shldi_epi32(S, U, A, B, I) \
- (__m128i)__builtin_ia32_vpshldd128_mask((__v4si)(A), \
- (__v4si)(B), \
+ (__m128i)__builtin_ia32_vpshldd128_mask((__v4si)(__m128i)(A), \
+ (__v4si)(__m128i)(B), \
(int)(I), \
- (__v4si)(S), \
+ (__v4si)(__m128i)(S), \
(__mmask8)(U))
#define _mm_maskz_shldi_epi32(U, A, B, I) \
_mm_mask_shldi_epi32(_mm_setzero_si128(), (U), (A), (B), (I))
#define _mm256_mask_shldi_epi16(S, U, A, B, I) \
- (__m256i)__builtin_ia32_vpshldw256_mask((__v16hi)(A), \
- (__v16hi)(B), \
+ (__m256i)__builtin_ia32_vpshldw256_mask((__v16hi)(__m256i)(A), \
+ (__v16hi)(__m256i)(B), \
(int)(I), \
- (__v16hi)(S), \
+ (__v16hi)(__m256i)(S), \
(__mmask16)(U))
#define _mm256_maskz_shldi_epi16(U, A, B, I) \
_mm256_mask_shldi_epi16(_mm256_setzero_si256(), (U), (A), (B), (I))
#define _mm_mask_shldi_epi16(S, U, A, B, I) \
- (__m128i)__builtin_ia32_vpshldw128_mask((__v8hi)(A), \
- (__v8hi)(B), \
+ (__m128i)__builtin_ia32_vpshldw128_mask((__v8hi)(__m128i)(A), \
+ (__v8hi)(__m128i)(B), \
(int)(I), \
- (__v8hi)(S), \
+ (__v8hi)(__m128i)(S), \
(__mmask8)(U))
#define _mm_maskz_shldi_epi16(U, A, B, I) \
_mm_mask_shldi_epi16(_mm_setzero_si128(), (U), (A), (B), (I))
#define _mm256_mask_shrdi_epi64(S, U, A, B, I) \
- (__m256i)__builtin_ia32_vpshrdq256_mask((__v4di)(A), \
- (__v4di)(B), \
+ (__m256i)__builtin_ia32_vpshrdq256_mask((__v4di)(__m256i)(A), \
+ (__v4di)(__m256i)(B), \
(int)(I), \
- (__v4di)(S), \
+ (__v4di)(__m256i)(S), \
(__mmask8)(U))
#define _mm256_maskz_shrdi_epi64(U, A, B, I) \
_mm256_mask_shrdi_epi64(_mm256_setzero_si256(), (U), (A), (B), (I))
#define _mm_mask_shrdi_epi64(S, U, A, B, I) \
- (__m128i)__builtin_ia32_vpshrdq128_mask((__v2di)(A), \
- (__v2di)(B), \
+ (__m128i)__builtin_ia32_vpshrdq128_mask((__v2di)(__m128i)(A), \
+ (__v2di)(__m128i)(B), \
(int)(I), \
- (__v2di)(S), \
+ (__v2di)(__m128i)(S), \
(__mmask8)(U))
#define _mm_maskz_shrdi_epi64(U, A, B, I) \
_mm_mask_shrdi_epi64(_mm_setzero_si128(), (U), (A), (B), (I))
#define _mm256_mask_shrdi_epi32(S, U, A, B, I) \
- (__m256i)__builtin_ia32_vpshrdd256_mask((__v8si)(A), \
- (__v8si)(B), \
+ (__m256i)__builtin_ia32_vpshrdd256_mask((__v8si)(__m256i)(A), \
+ (__v8si)(__m256i)(B), \
(int)(I), \
- (__v8si)(S), \
+ (__v8si)(__m256i)(S), \
(__mmask8)(U))
#define _mm256_maskz_shrdi_epi32(U, A, B, I) \
_mm256_mask_shrdi_epi32(_mm256_setzero_si256(), (U), (A), (B), (I))
#define _mm_mask_shrdi_epi32(S, U, A, B, I) \
- (__m128i)__builtin_ia32_vpshrdd128_mask((__v4si)(A), \
- (__v4si)(B), \
+ (__m128i)__builtin_ia32_vpshrdd128_mask((__v4si)(__m128i)(A), \
+ (__v4si)(__m128i)(B), \
(int)(I), \
- (__v4si)(S), \
+ (__v4si)(__m128i)(S), \
(__mmask8)(U))
#define _mm_maskz_shrdi_epi32(U, A, B, I) \
_mm_mask_shrdi_epi32(_mm_setzero_si128(), (U), (A), (B), (I))
#define _mm256_mask_shrdi_epi16(S, U, A, B, I) \
- (__m256i)__builtin_ia32_vpshrdw256_mask((__v16hi)(A), \
- (__v16hi)(B), \
+ (__m256i)__builtin_ia32_vpshrdw256_mask((__v16hi)(__m256i)(A), \
+ (__v16hi)(__m256i)(B), \
(int)(I), \
- (__v16hi)(S), \
+ (__v16hi)(__m256i)(S), \
(__mmask16)(U))
#define _mm256_maskz_shrdi_epi16(U, A, B, I) \
_mm256_mask_shrdi_epi16(_mm256_setzero_si256(), (U), (A), (B), (I))
#define _mm_mask_shrdi_epi16(S, U, A, B, I) \
- (__m128i)__builtin_ia32_vpshrdw128_mask((__v8hi)(A), \
- (__v8hi)(B), \
+ (__m128i)__builtin_ia32_vpshrdw128_mask((__v8hi)(__m128i)(A), \
+ (__v8hi)(__m128i)(B), \
(int)(I), \
- (__v8hi)(S), \
+ (__v8hi)(__m128i)(S), \
(__mmask8)(U))
#define _mm_maskz_shrdi_epi16(U, A, B, I) \