(__m512)__builtin_ia32_subps512_mask((__v16sf)(__m512)(A), \
(__v16sf)(__m512)(B), \
(__v16sf)(__m512)(W), (__mmask16)(U), \
- (int)(R));
+ (int)(R))
/* _mm512_maskz_sub_round_ps(U, A, B, R): expands to one call to
 * __builtin_ia32_subps512_mask. A and B are cast to __v16sf, the result of
 * _mm512_setzero_ps() is supplied as the third vector operand, U is cast to
 * __mmask16 and R to int; the call's result is cast to __m512.
 * Patch note: the '-' line is the previous final line of the macro, which
 * ended in ';'. The '+' line removes that semicolon so the expansion is a
 * bare expression and can be used as an operand or function argument. */
#define _mm512_maskz_sub_round_ps(U, A, B, R) \
(__m512)__builtin_ia32_subps512_mask((__v16sf)(__m512)(A), \
(__v16sf)(__m512)(B), \
(__v16sf)_mm512_setzero_ps(), \
- (__mmask16)(U), (int)(R));
+ (__mmask16)(U), (int)(R))
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_mask_mul_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
(__m512)__builtin_ia32_mulps512_mask((__v16sf)(__m512)(A), \
(__v16sf)(__m512)(B), \
(__v16sf)(__m512)(W), (__mmask16)(U), \
- (int)(R));
+ (int)(R))
/* _mm512_maskz_mul_round_ps(U, A, B, R): expands to one call to
 * __builtin_ia32_mulps512_mask. A and B are cast to __v16sf, the result of
 * _mm512_setzero_ps() is supplied as the third vector operand, U is cast to
 * __mmask16 and R to int; the call's result is cast to __m512.
 * Patch note: the '-' line is the previous final line of the macro, which
 * ended in ';'. The '+' line removes that semicolon so the expansion is a
 * bare expression and can be used as an operand or function argument. */
#define _mm512_maskz_mul_round_ps(U, A, B, R) \
(__m512)__builtin_ia32_mulps512_mask((__v16sf)(__m512)(A), \
(__v16sf)(__m512)(B), \
(__v16sf)_mm512_setzero_ps(), \
- (__mmask16)(U), (int)(R));
+ (__mmask16)(U), (int)(R))
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_mask_div_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
(__m512)__builtin_ia32_divps512_mask((__v16sf)(__m512)(A), \
(__v16sf)(__m512)(B), \
(__v16sf)(__m512)(W), (__mmask16)(U), \
- (int)(R));
+ (int)(R))
/* _mm512_maskz_div_round_ps(U, A, B, R): expands to one call to
 * __builtin_ia32_divps512_mask. A and B are cast to __v16sf, the result of
 * _mm512_setzero_ps() is supplied as the third vector operand, U is cast to
 * __mmask16 and R to int; the call's result is cast to __m512.
 * Patch note: the '-' line is the previous final line of the macro, which
 * ended in ';'. The '+' line removes that semicolon so the expansion is a
 * bare expression and can be used as an operand or function argument. */
#define _mm512_maskz_div_round_ps(U, A, B, R) \
(__m512)__builtin_ia32_divps512_mask((__v16sf)(__m512)(A), \
(__v16sf)(__m512)(B), \
(__v16sf)_mm512_setzero_ps(), \
- (__mmask16)(U), (int)(R));
+ (__mmask16)(U), (int)(R))
#define _mm512_roundscale_ps(A, B) \
(__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(A), (int)(B), \
/// \returns The converted 16-bit half-precision float value.
/* _cvtss_sh(a, imm): builds the __v4sf compound literal {a, 0, 0, 0}, passes
 * it together with (imm) to __builtin_ia32_vcvtps2ph, reinterprets the result
 * as __v8hi, selects element [0] and casts that element to unsigned short.
 * Patch note: the '-' line is the previous final line, ending in ';'. The
 * '+' line drops the semicolon so the macro expands to a bare expression. */
#define _cvtss_sh(a, imm) \
(unsigned short)(((__v8hi)__builtin_ia32_vcvtps2ph((__v4sf){a, 0, 0, 0}, \
- (imm)))[0]);
+ (imm)))[0])
/// Converts a 128-bit vector containing 32-bit float values into a
/// 128-bit vector containing 16-bit half-precision float values.
/// values. The lower 64 bits are used to store the converted 16-bit
/// half-precision floating-point values.
/* _mm_cvtps_ph(a, imm): casts a through __m128 to __v4sf, passes it with
 * (imm) to __builtin_ia32_vcvtps2ph and casts the result to __m128i.
 * Patch note: the '-' line is the previous expansion, ending in ';'. The
 * '+' line drops the semicolon so the macro expands to a bare expression. */
#define _mm_cvtps_ph(a, imm) \
- (__m128i)__builtin_ia32_vcvtps2ph((__v4sf)(__m128)(a), (imm));
+ (__m128i)__builtin_ia32_vcvtps2ph((__v4sf)(__m128)(a), (imm))
/// Converts a 128-bit vector containing 16-bit half-precision float
/// values into a 128-bit vector containing 32-bit float values.
/// \returns A 128-bit vector containing the converted 16-bit half-precision
/// float values.
/* _mm256_cvtps_ph(a, imm): casts a through __m256 to __v8sf, passes it with
 * (imm) to __builtin_ia32_vcvtps2ph256 and casts the result to __m128i.
 * Patch note: the '-' line is the previous expansion, ending in ';'. The
 * '+' line drops the semicolon so the macro expands to a bare expression. */
#define _mm256_cvtps_ph(a, imm) \
- (__m128i)__builtin_ia32_vcvtps2ph256((__v8sf)(__m256)(a), (imm));
+ (__m128i)__builtin_ia32_vcvtps2ph256((__v8sf)(__m256)(a), (imm))
/// Converts a 128-bit vector containing 16-bit half-precision float
/// values into a 256-bit vector of [8 x float].
/* _mm256_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I): forwards to
 * _mm256_mask_gf2p8affineinv_epi64_epi8, supplying the result of
 * _mm256_setzero_si256() as the first argument followed by U, A, B and I;
 * the result is cast to __m256i.
 * NOTE(review): U, A, B and I are forwarded without enclosing parentheses;
 * presumably the callee parenthesizes or casts them - confirm.
 * Patch note: the '-' line is the previous final line, ending in ';'. The
 * '+' line drops the semicolon so the macro expands to a bare expression. */
#define _mm256_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) \
(__m256i)_mm256_mask_gf2p8affineinv_epi64_epi8((__m256i)_mm256_setzero_si256(), \
- U, A, B, I);
+ U, A, B, I)
#define _mm512_gf2p8affineinv_epi64_epi8(A, B, I) \
/* Attribute list substituted wherever __DEFAULT_FN_ATTRS is written in a
 * declaration: __always_inline__, __nodebug__ and __target__("sha"). */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sha")))
/* _mm_sha1rnds4_epu32(V1, V2, M): casts V1 and V2 through __m128i to __v4si
 * and passes them with (M) to __builtin_ia32_sha1rnds4. No cast is applied to
 * the builtin's result.
 * Patch note: the '-' line is the previous expansion, ending in ';'. The
 * '+' line drops the semicolon so the macro expands to a bare expression. */
#define _mm_sha1rnds4_epu32(V1, V2, M) \
- __builtin_ia32_sha1rnds4((__v4si)(__m128i)(V1), (__v4si)(__m128i)(V2), (M));
+ __builtin_ia32_sha1rnds4((__v4si)(__m128i)(V1), (__v4si)(__m128i)(V2), (M))
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sha1nexte_epu32(__m128i __X, __m128i __Y)
/* _mm256_clmulepi64_epi128(A, B, I): casts A and B through __m256i to
 * __v4di, casts I to char, passes all three to __builtin_ia32_pclmulqdq256
 * and casts the result to __m256i.
 * Patch note: the '-' line is the previous final line, ending in ';'. The
 * '+' line drops the semicolon so the macro expands to a bare expression. */
#define _mm256_clmulepi64_epi128(A, B, I) \
(__m256i)__builtin_ia32_pclmulqdq256((__v4di)(__m256i)(A), \
(__v4di)(__m256i)(B), \
- (char)(I));
+ (char)(I))
/* _mm512_clmulepi64_epi128(A, B, I): casts A and B through __m512i to
 * __v8di, casts I to char, passes all three to __builtin_ia32_pclmulqdq512
 * and casts the result to __m512i.
 * Patch note: the '-' line is the previous final line, ending in ';'. The
 * '+' line drops the semicolon so the macro expands to a bare expression. */
#define _mm512_clmulepi64_epi128(A, B, I) \
(__m512i)__builtin_ia32_pclmulqdq512((__v8di)(__m512i)(A), \
(__v8di)(__m512i)(B), \
- (char)(I));
+ (char)(I))
#endif /* __VPCLMULQDQINTRIN_H */