*/
#define _mm256_insertf128_ps(V1, V2, M) __extension__ ({ \
(__m256)__builtin_shufflevector( \
- (__v8sf)(V1), \
+ (__v8sf)(__m256)(V1), \
(__v8sf)_mm256_castps128_ps256((__m128)(V2)), \
(((M) & 1) ? 0 : 8), \
(((M) & 1) ? 1 : 9), \
#define _mm256_insertf128_pd(V1, V2, M) __extension__ ({ \
(__m256d)__builtin_shufflevector( \
- (__v4df)(V1), \
+ (__v4df)(__m256d)(V1), \
(__v4df)_mm256_castpd128_pd256((__m128d)(V2)), \
(((M) & 1) ? 0 : 4), \
(((M) & 1) ? 1 : 5), \
#define _mm256_insertf128_si256(V1, V2, M) __extension__ ({ \
(__m256i)__builtin_shufflevector( \
- (__v4di)(V1), \
+ (__v4di)(__m256i)(V1), \
(__v4di)_mm256_castsi128_si256((__m128i)(V2)), \
(((M) & 1) ? 0 : 4), \
(((M) & 1) ? 1 : 5), \
*/
#define _mm256_extractf128_ps(V, M) __extension__ ({ \
(__m128)__builtin_shufflevector( \
- (__v8sf)(V), \
+ (__v8sf)(__m256)(V), \
(__v8sf)(_mm256_setzero_ps()), \
(((M) & 1) ? 4 : 0), \
(((M) & 1) ? 5 : 1), \
#define _mm256_extractf128_pd(V, M) __extension__ ({ \
(__m128d)__builtin_shufflevector( \
- (__v4df)(V), \
+ (__v4df)(__m256d)(V), \
(__v4df)(_mm256_setzero_pd()), \
(((M) & 1) ? 2 : 0), \
(((M) & 1) ? 3 : 1) );})
#define _mm256_extractf128_si256(V, M) __extension__ ({ \
(__m128i)__builtin_shufflevector( \
- (__v4di)(V), \
+ (__v4di)(__m256i)(V), \
(__v4di)(_mm256_setzero_si256()), \
(((M) & 1) ? 2 : 0), \
(((M) & 1) ? 3 : 1) );})
#define _mm_insert_ps(X, Y, N) __builtin_ia32_insertps128((X), (Y), (N))
#define _mm_extract_ps(X, N) (__extension__ \
({ union { int __i; float __f; } __t; \
- __v4sf __a = (__v4sf)(X); \
+ __v4sf __a = (__v4sf)(__m128)(X); \
__t.__f = __a[(N) & 3]; \
__t.__i;}))
_MM_MK_INSERTPS_NDX((N), 0, 0x0e))
/* Insert int into packed integer array at index. */
-#define _mm_insert_epi8(X, I, N) (__extension__ ({ __v16qi __a = (__v16qi)(X); \
- __a[(N) & 15] = (I); \
- __a;}))
-#define _mm_insert_epi32(X, I, N) (__extension__ ({ __v4si __a = (__v4si)(X); \
- __a[(N) & 3] = (I); \
- __a;}))
+#define _mm_insert_epi8(X, I, N) (__extension__ \
+ ({ __v16qi __a = (__v16qi)(__m128i)(X); \
+ __a[(N) & 15] = (I); \
+ __a;}))
+#define _mm_insert_epi32(X, I, N) (__extension__ \
+ ({ __v4si __a = (__v4si)(__m128i)(X); \
+ __a[(N) & 3] = (I); \
+ __a;}))
#ifdef __x86_64__
-#define _mm_insert_epi64(X, I, N) (__extension__ ({ __v2di __a = (__v2di)(X); \
- __a[(N) & 1] = (I); \
- __a;}))
+#define _mm_insert_epi64(X, I, N) (__extension__ \
+ ({ __v2di __a = (__v2di)(__m128i)(X); \
+ __a[(N) & 1] = (I); \
+ __a;}))
#endif /* __x86_64__ */
/* Extract int from packed integer array at index. This returns the element
* as a zero extended value, so it is unsigned.
*/
-#define _mm_extract_epi8(X, N) (__extension__ ({ __v16qi __a = (__v16qi)(X); \
- (int)(unsigned char) \
- __a[(N) & 15];}))
-#define _mm_extract_epi32(X, N) (__extension__ ({ __v4si __a = (__v4si)(X); \
- __a[(N) & 3];}))
+#define _mm_extract_epi8(X, N) (__extension__ \
+ ({ __v16qi __a = (__v16qi)(__m128i)(X); \
+ (int)(unsigned char) __a[(N) & 15];}))
+#define _mm_extract_epi32(X, N) (__extension__ \
+ ({ __v4si __a = (__v4si)(__m128i)(X); \
+ (int)__a[(N) & 3];}))
#ifdef __x86_64__
-#define _mm_extract_epi64(X, N) (__extension__ ({ __v2di __a = (__v2di)(X); \
- __a[(N) & 1];}))
+#define _mm_extract_epi64(X, N) (__extension__ \
+ ({ __v2di __a = (__v2di)(__m128i)(X); \
+ (long long)__a[(N) & 1];}))
#endif /* __x86_64 */
/* SSE4 128-bit Packed Integer Comparisons. */
#define _SIDD_UNIT_MASK 0x40
/* SSE4.2 Packed Comparison Intrinsics. */
-#define _mm_cmpistrm(A, B, M) __builtin_ia32_pcmpistrm128((A), (B), (M))
-#define _mm_cmpistri(A, B, M) __builtin_ia32_pcmpistri128((A), (B), (M))
+#define _mm_cmpistrm(A, B, M) \
+ (__m128i)__builtin_ia32_pcmpistrm128((__v16qi)(__m128i)(A), \
+ (__v16qi)(__m128i)(B), (int)(M))
+#define _mm_cmpistri(A, B, M) \
+ (int)__builtin_ia32_pcmpistri128((__v16qi)(__m128i)(A), \
+ (__v16qi)(__m128i)(B), (int)(M))
#define _mm_cmpestrm(A, LA, B, LB, M) \
- __builtin_ia32_pcmpestrm128((A), (LA), (B), (LB), (M))
+ (__m128i)__builtin_ia32_pcmpestrm128((__v16qi)(__m128i)(A), (int)(LA), \
+ (__v16qi)(__m128i)(B), (int)(LB), \
+ (int)(M))
#define _mm_cmpestri(A, LA, B, LB, M) \
- __builtin_ia32_pcmpestri128((A), (LA), (B), (LB), (M))
+ (int)__builtin_ia32_pcmpestri128((__v16qi)(__m128i)(A), (int)(LA), \
+ (__v16qi)(__m128i)(B), (int)(LB), \
+ (int)(M))
/* SSE4.2 Packed Comparison Intrinsics and EFlag Reading. */
#define _mm_cmpistra(A, B, M) \
- __builtin_ia32_pcmpistria128((A), (B), (M))
+ (int)__builtin_ia32_pcmpistria128((__v16qi)(__m128i)(A), \
+ (__v16qi)(__m128i)(B), (int)(M))
#define _mm_cmpistrc(A, B, M) \
- __builtin_ia32_pcmpistric128((A), (B), (M))
+ (int)__builtin_ia32_pcmpistric128((__v16qi)(__m128i)(A), \
+ (__v16qi)(__m128i)(B), (int)(M))
#define _mm_cmpistro(A, B, M) \
- __builtin_ia32_pcmpistrio128((A), (B), (M))
+ (int)__builtin_ia32_pcmpistrio128((__v16qi)(__m128i)(A), \
+ (__v16qi)(__m128i)(B), (int)(M))
#define _mm_cmpistrs(A, B, M) \
- __builtin_ia32_pcmpistris128((A), (B), (M))
+ (int)__builtin_ia32_pcmpistris128((__v16qi)(__m128i)(A), \
+ (__v16qi)(__m128i)(B), (int)(M))
#define _mm_cmpistrz(A, B, M) \
- __builtin_ia32_pcmpistriz128((A), (B), (M))
+ (int)__builtin_ia32_pcmpistriz128((__v16qi)(__m128i)(A), \
+ (__v16qi)(__m128i)(B), (int)(M))
#define _mm_cmpestra(A, LA, B, LB, M) \
- __builtin_ia32_pcmpestria128((A), (LA), (B), (LB), (M))
+ (int)__builtin_ia32_pcmpestria128((__v16qi)(__m128i)(A), (int)(LA), \
+ (__v16qi)(__m128i)(B), (int)(LB), \
+ (int)(M))
#define _mm_cmpestrc(A, LA, B, LB, M) \
- __builtin_ia32_pcmpestric128((A), (LA), (B), (LB), (M))
+ (int)__builtin_ia32_pcmpestric128((__v16qi)(__m128i)(A), (int)(LA), \
+ (__v16qi)(__m128i)(B), (int)(LB), \
+ (int)(M))
#define _mm_cmpestro(A, LA, B, LB, M) \
- __builtin_ia32_pcmpestrio128((A), (LA), (B), (LB), (M))
+ (int)__builtin_ia32_pcmpestrio128((__v16qi)(__m128i)(A), (int)(LA), \
+ (__v16qi)(__m128i)(B), (int)(LB), \
+ (int)(M))
#define _mm_cmpestrs(A, LA, B, LB, M) \
- __builtin_ia32_pcmpestris128((A), (LA), (B), (LB), (M))
+ (int)__builtin_ia32_pcmpestris128((__v16qi)(__m128i)(A), (int)(LA), \
+ (__v16qi)(__m128i)(B), (int)(LB), \
+ (int)(M))
#define _mm_cmpestrz(A, LA, B, LB, M) \
- __builtin_ia32_pcmpestriz128((A), (LA), (B), (LB), (M))
+ (int)__builtin_ia32_pcmpestriz128((__v16qi)(__m128i)(A), (int)(LA), \
+ (__v16qi)(__m128i)(B), (int)(LB), \
+ (int)(M))
/* SSE4.2 Compare Packed Data -- Greater Than. */
static __inline__ __m128i __DEFAULT_FN_ATTRS