/* 16-byte vector of short elements (eight lanes when short is 2 bytes). */
typedef short __v8hi __attribute__((__vector_size__(16)));
/* 16-byte vector of char elements (sixteen lanes). */
typedef char __v16qi __attribute__((__vector_size__(16)));
/* Scalar double add: returns __builtin_ia32_addsd(a, b) unchanged. */
-static inline __m128d __attribute__((__always_inline__))
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_add_sd(__m128d a, __m128d b)
{
return __builtin_ia32_addsd(a, b);
}
/* Packed double add: element-wise a + b using the vector '+' operator. */
-static inline __m128d __attribute__((__always_inline__))
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_add_pd(__m128d a, __m128d b)
{
return a + b;
}
/* Scalar double subtract: returns __builtin_ia32_subsd(a, b) unchanged. */
-static inline __m128d __attribute__((__always_inline__))
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_sub_sd(__m128d a, __m128d b)
{
return __builtin_ia32_subsd(a, b);
}
/* Packed double subtract: element-wise a - b using the vector '-' operator. */
-static inline __m128d __attribute__((__always_inline__))
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_sub_pd(__m128d a, __m128d b)
{
return a - b;
}
/* Scalar double multiply: returns __builtin_ia32_mulsd(a, b) unchanged. */
-static inline __m128d __attribute__((__always_inline__))
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_mul_sd(__m128d a, __m128d b)
{
return __builtin_ia32_mulsd(a, b);
}
/* Packed double multiply: element-wise a * b using the vector '*' operator. */
-static inline __m128d __attribute__((__always_inline__))
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_mul_pd(__m128d a, __m128d b)
{
return a * b;
}
/* Scalar double divide: returns __builtin_ia32_divsd(a, b) unchanged. */
-static inline __m128d __attribute__((__always_inline__))
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_div_sd(__m128d a, __m128d b)
{
return __builtin_ia32_divsd(a, b);
}
/* Packed double divide: element-wise a / b using the vector '/' operator. */
-static inline __m128d __attribute__((__always_inline__))
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_div_pd(__m128d a, __m128d b)
{
return a / b;
}
/*
 * Scalar double square root.
 * b is passed through __builtin_ia32_sqrtsd; the result is built from
 * element 0 of that value and element 1 of a.
 */
-static inline __m128d __attribute__((__always_inline__))
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_sqrt_sd(__m128d a, __m128d b)
{
__m128d c = __builtin_ia32_sqrtsd(b);
/* Lower element from the builtin result, upper element kept from a. */
return (__m128d) { c[0], a[1] };
}
/* Packed double square root: returns __builtin_ia32_sqrtpd(a). */
-static inline __m128d __attribute__((__always_inline__))
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_sqrt_pd(__m128d a)
{
return __builtin_ia32_sqrtpd(a);
}
/* Scalar double minimum: returns __builtin_ia32_minsd(a, b). */
-static inline __m128d __attribute__((__always_inline__))
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_min_sd(__m128d a, __m128d b)
{
return __builtin_ia32_minsd(a, b);
}
/* Packed double minimum: returns __builtin_ia32_minpd(a, b). */
-static inline __m128d __attribute__((__always_inline__))
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_min_pd(__m128d a, __m128d b)
{
return __builtin_ia32_minpd(a, b);
}
/* Scalar double maximum: returns __builtin_ia32_maxsd(a, b). */
-static inline __m128d __attribute__((__always_inline__))
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_max_sd(__m128d a, __m128d b)
{
return __builtin_ia32_maxsd(a, b);
}
/* Packed double maximum: returns __builtin_ia32_maxpd(a, b). */
-static inline __m128d __attribute__((__always_inline__))
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_max_pd(__m128d a, __m128d b)
{
return __builtin_ia32_maxpd(a, b);
}
/* Bitwise AND of two double vectors: returns __builtin_ia32_andpd(a, b). */
-static inline __m128d __attribute__((__always_inline__))
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_and_pd(__m128d a, __m128d b)
{
return __builtin_ia32_andpd(a, b);
}
/*
 * Bitwise AND-NOT of two double vectors: returns __builtin_ia32_andnpd(a, b).
 * NOTE(review): presumably the first operand (a) is the one complemented, as
 * with the ANDNPD instruction - confirm against the builtin.
 */
-static inline __m128d __attribute__((__always_inline__))
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_andnot_pd(__m128d a, __m128d b)
{
return __builtin_ia32_andnpd(a, b);
}
/* Bitwise OR of two double vectors: returns __builtin_ia32_orpd(a, b). */
-static inline __m128d __attribute__((__always_inline__))
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_or_pd(__m128d a, __m128d b)
{
return __builtin_ia32_orpd(a, b);
}
/* Bitwise XOR of two double vectors: returns __builtin_ia32_xorpd(a, b). */
-static inline __m128d __attribute__((__always_inline__))
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_xor_pd(__m128d a, __m128d b)
{
return __builtin_ia32_xorpd(a, b);
}
/* Packed double compare-equal; the builtin's result is cast to __m128d. */
-static inline __m128d __attribute__((__always_inline__))
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_pd(__m128d a, __m128d b)
{
return (__m128d)__builtin_ia32_cmpeqpd(a, b);
}
/* Packed double compare less-than; the builtin's result is cast to __m128d. */
-static inline __m128d __attribute__((__always_inline__))
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_pd(__m128d a, __m128d b)
{
return (__m128d)__builtin_ia32_cmpltpd(a, b);
}
/* Packed double compare less-or-equal; the builtin's result is cast to __m128d. */
-static inline __m128d __attribute__((__always_inline__))
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmple_pd(__m128d a, __m128d b)
{
return (__m128d)__builtin_ia32_cmplepd(a, b);
}
/* Packed double compare greater-than, expressed as less-than with operands swapped (b < a). */
-static inline __m128d __attribute__((__always_inline__))
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_pd(__m128d a, __m128d b)
{
return (__m128d)__builtin_ia32_cmpltpd(b, a);
}
/* Packed double compare greater-or-equal, expressed as less-or-equal with operands swapped (b <= a). */
-static inline __m128d __attribute__((__always_inline__))
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpge_pd(__m128d a, __m128d b)
{
return (__m128d)__builtin_ia32_cmplepd(b, a);
}
/* Packed double "ordered" compare via __builtin_ia32_cmpordpd; result cast to __m128d. */
-static inline __m128d __attribute__((__always_inline__))
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpord_pd(__m128d a, __m128d b)
{
return (__m128d)__builtin_ia32_cmpordpd(a, b);
}
/* Packed double "unordered" compare via __builtin_ia32_cmpunordpd; result cast to __m128d. */
-static inline __m128d __attribute__((__always_inline__))
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpunord_pd(__m128d a, __m128d b)
{
return (__m128d)__builtin_ia32_cmpunordpd(a, b);
}
/* Packed double compare not-equal via __builtin_ia32_cmpneqpd; result cast to __m128d. */
-static inline __m128d __attribute__((__always_inline__))
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpneq_pd(__m128d a, __m128d b)
{
return (__m128d)__builtin_ia32_cmpneqpd(a, b);
}
/* Packed double compare not-less-than via __builtin_ia32_cmpnltpd; result cast to __m128d. */
-static inline __m128d __attribute__((__always_inline__))
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpnlt_pd(__m128d a, __m128d b)
{
return (__m128d)__builtin_ia32_cmpnltpd(a, b);
}
/* Packed double compare not-less-or-equal via __builtin_ia32_cmpnlepd; result cast to __m128d. */
-static inline __m128d __attribute__((__always_inline__))
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpnle_pd(__m128d a, __m128d b)
{
return (__m128d)__builtin_ia32_cmpnlepd(a, b);
}
/* Packed double compare not-greater-than, expressed as not-less-than with operands swapped. */
-static inline __m128d __attribute__((__always_inline__))
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpngt_pd(__m128d a, __m128d b)
{
return (__m128d)__builtin_ia32_cmpnltpd(b, a);
}
/* Packed double compare not-greater-or-equal, expressed as not-less-or-equal with operands swapped. */
-static inline __m128d __attribute__((__always_inline__))
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpnge_pd(__m128d a, __m128d b)
{
return (__m128d)__builtin_ia32_cmpnlepd(b, a);
}
/* Scalar double compare-equal via __builtin_ia32_cmpeqsd(a, b); result cast to __m128d. */
-static inline __m128d __attribute__((__always_inline__))
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_sd(__m128d a, __m128d b)
{
return (__m128d)__builtin_ia32_cmpeqsd(a, b);
}
/* Scalar double compare less-than via __builtin_ia32_cmpltsd(a, b); result cast to __m128d. */
-static inline __m128d __attribute__((__always_inline__))
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_sd(__m128d a, __m128d b)
{
return (__m128d)__builtin_ia32_cmpltsd(a, b);
}
/* Scalar double compare less-or-equal via __builtin_ia32_cmplesd(a, b); result cast to __m128d. */
-static inline __m128d __attribute__((__always_inline__))
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmple_sd(__m128d a, __m128d b)
{
return (__m128d)__builtin_ia32_cmplesd(a, b);
}
/*
 * Scalar double-precision greater-than compare.
 *
 * a, b: operands; only element 0 of each takes part in the comparison.
 * Returns a vector whose element 0 is the compare mask for a[0] > b[0] and
 * whose element 1 is copied from a.
 *
 * The test is expressed as "b < a" by swapping the operands of the CMPLTSD
 * builtin. The scalar compare passes through the upper element of its first
 * operand, which after the swap is b, so the upper element is taken from a
 * again explicitly.
 */
-static inline __m128d __attribute__((__always_inline__))
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_sd(__m128d a, __m128d b)
{
__m128d c = (__m128d)__builtin_ia32_cmpltsd(b, a);
/* Lower element: compare mask; upper element: preserved from a. */
return (__m128d) { c[0], a[1] };
}
/*
 * Scalar double-precision greater-or-equal compare.
 *
 * a, b: operands; only element 0 of each takes part in the comparison.
 * Returns a vector whose element 0 is the compare mask for a[0] >= b[0] and
 * whose element 1 is copied from a.
 *
 * The test is expressed as "b <= a" by swapping the operands of the CMPLESD
 * builtin. The scalar compare passes through the upper element of its first
 * operand, which after the swap is b, so the upper element is taken from a
 * again explicitly.
 */
-static inline __m128d __attribute__((__always_inline__))
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpge_sd(__m128d a, __m128d b)
{
__m128d c = (__m128d)__builtin_ia32_cmplesd(b, a);
/* Lower element: compare mask; upper element: preserved from a. */
return (__m128d) { c[0], a[1] };
}
/* Scalar double "ordered" compare via __builtin_ia32_cmpordsd(a, b); result cast to __m128d. */
-static inline __m128d __attribute__((__always_inline__))
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpord_sd(__m128d a, __m128d b)
{
return (__m128d)__builtin_ia32_cmpordsd(a, b);
}
/* Scalar double "unordered" compare via __builtin_ia32_cmpunordsd(a, b); result cast to __m128d. */
-static inline __m128d __attribute__((__always_inline__))
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpunord_sd(__m128d a, __m128d b)
{
return (__m128d)__builtin_ia32_cmpunordsd(a, b);
}
/* Scalar double compare not-equal via __builtin_ia32_cmpneqsd(a, b); result cast to __m128d. */
-static inline __m128d __attribute__((__always_inline__))
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpneq_sd(__m128d a, __m128d b)
{
return (__m128d)__builtin_ia32_cmpneqsd(a, b);
}
/* Scalar double compare not-less-than via __builtin_ia32_cmpnltsd(a, b); result cast to __m128d. */
-static inline __m128d __attribute__((__always_inline__))
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpnlt_sd(__m128d a, __m128d b)
{
return (__m128d)__builtin_ia32_cmpnltsd(a, b);
}
/* Scalar double compare not-less-or-equal via __builtin_ia32_cmpnlesd(a, b); result cast to __m128d. */
-static inline __m128d __attribute__((__always_inline__))
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpnle_sd(__m128d a, __m128d b)
{
return (__m128d)__builtin_ia32_cmpnlesd(a, b);
}
/*
 * Scalar double-precision not-greater-than compare.
 *
 * a, b: operands; only element 0 of each takes part in the comparison.
 * Returns a vector whose element 0 is the compare mask for !(a[0] > b[0])
 * and whose element 1 is copied from a.
 *
 * The test is expressed as "!(b < a)" by swapping the operands of the
 * CMPNLTSD builtin. The scalar compare passes through the upper element of
 * its first operand, which after the swap is b, so the upper element is
 * taken from a again explicitly.
 */
-static inline __m128d __attribute__((__always_inline__))
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpngt_sd(__m128d a, __m128d b)
{
__m128d c = (__m128d)__builtin_ia32_cmpnltsd(b, a);
/* Lower element: compare mask; upper element: preserved from a. */
return (__m128d) { c[0], a[1] };
}
/*
 * Scalar double-precision not-greater-or-equal compare.
 *
 * a, b: operands; only element 0 of each takes part in the comparison.
 * Returns a vector whose element 0 is the compare mask for !(a[0] >= b[0])
 * and whose element 1 is copied from a.
 *
 * The test is expressed as "!(b <= a)" by swapping the operands of the
 * CMPNLESD builtin. The scalar compare passes through the upper element of
 * its first operand, which after the swap is b, so the upper element is
 * taken from a again explicitly.
 */
-static inline __m128d __attribute__((__always_inline__))
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cmpnge_sd(__m128d a, __m128d b)
{
__m128d c = (__m128d)__builtin_ia32_cmpnlesd(b, a);
/* Lower element: compare mask; upper element: preserved from a. */
return (__m128d) { c[0], a[1] };
}
/* Scalar double equality test returning int: forwards to __builtin_ia32_comisdeq(a, b). */
-static inline int __attribute__((__always_inline__))
+static inline int __attribute__((__always_inline__, __nodebug__))
_mm_comieq_sd(__m128d a, __m128d b)
{
return __builtin_ia32_comisdeq(a, b);
}
/* Scalar double less-than test returning int: forwards to __builtin_ia32_comisdlt(a, b). */
-static inline int __attribute__((__always_inline__))
+static inline int __attribute__((__always_inline__, __nodebug__))
_mm_comilt_sd(__m128d a, __m128d b)
{
return __builtin_ia32_comisdlt(a, b);
}
/* Scalar double less-or-equal test returning int: forwards to __builtin_ia32_comisdle(a, b). */
-static inline int __attribute__((__always_inline__))
+static inline int __attribute__((__always_inline__, __nodebug__))
_mm_comile_sd(__m128d a, __m128d b)
{
return __builtin_ia32_comisdle(a, b);
}
/* Scalar double greater-than test returning int: forwards to __builtin_ia32_comisdgt(a, b). */
-static inline int __attribute__((__always_inline__))
+static inline int __attribute__((__always_inline__, __nodebug__))
_mm_comigt_sd(__m128d a, __m128d b)
{
return __builtin_ia32_comisdgt(a, b);
}
/* Scalar double not-equal test returning int: forwards to __builtin_ia32_comisdneq(a, b). */
-static inline int __attribute__((__always_inline__))
+static inline int __attribute__((__always_inline__, __nodebug__))
_mm_comineq_sd(__m128d a, __m128d b)
{
return __builtin_ia32_comisdneq(a, b);
}
/* UCOMISD-based scalar double equality test returning int: forwards to __builtin_ia32_ucomisdeq(a, b). */
-static inline int __attribute__((__always_inline__))
+static inline int __attribute__((__always_inline__, __nodebug__))
_mm_ucomieq_sd(__m128d a, __m128d b)
{
return __builtin_ia32_ucomisdeq(a, b);
}
/* UCOMISD-based scalar double less-than test returning int: forwards to __builtin_ia32_ucomisdlt(a, b). */
-static inline int __attribute__((__always_inline__))
+static inline int __attribute__((__always_inline__, __nodebug__))
_mm_ucomilt_sd(__m128d a, __m128d b)
{
return __builtin_ia32_ucomisdlt(a, b);
}
/* UCOMISD-based scalar double less-or-equal test returning int: forwards to __builtin_ia32_ucomisdle(a, b). */
-static inline int __attribute__((__always_inline__))
+static inline int __attribute__((__always_inline__, __nodebug__))
_mm_ucomile_sd(__m128d a, __m128d b)
{
return __builtin_ia32_ucomisdle(a, b);
}
/* UCOMISD-based scalar double greater-than test returning int: forwards to __builtin_ia32_ucomisdgt(a, b). */
-static inline int __attribute__((__always_inline__))
+static inline int __attribute__((__always_inline__, __nodebug__))
_mm_ucomigt_sd(__m128d a, __m128d b)
{
return __builtin_ia32_ucomisdgt(a, b);
}
/* UCOMISD-based scalar double not-equal test returning int: forwards to __builtin_ia32_ucomisdneq(a, b). */
-static inline int __attribute__((__always_inline__))
+static inline int __attribute__((__always_inline__, __nodebug__))
_mm_ucomineq_sd(__m128d a, __m128d b)
{
return __builtin_ia32_ucomisdneq(a, b);
}
/* Converts a double vector to a float vector via __builtin_ia32_cvtpd2ps. */
-static inline __m128 __attribute__((__always_inline__))
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpd_ps(__m128d a)
{
return __builtin_ia32_cvtpd2ps(a);
}
/* Converts a float vector to a double vector via __builtin_ia32_cvtps2pd. */
-static inline __m128d __attribute__((__always_inline__))
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cvtps_pd(__m128 a)
{
return __builtin_ia32_cvtps2pd(a);
}
/* Converts 32-bit integers to doubles: a is viewed as __v4si and passed to __builtin_ia32_cvtdq2pd. */
-static inline __m128d __attribute__((__always_inline__))
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cvtepi32_pd(__m128i a)
{
return __builtin_ia32_cvtdq2pd((__v4si)a);
}
/*
 * Converts packed doubles to 32-bit integers via __builtin_ia32_cvtpd2dq.
 * NOTE(review): unlike _mm_cvttpd_epi32, the builtin result is returned
 * without an explicit (__m128i) cast - confirm the builtin's return type.
 */
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtpd_epi32(__m128d a)
{
return __builtin_ia32_cvtpd2dq(a);
}
/* Converts the scalar double in a to int via __builtin_ia32_cvtsd2si. */
-static inline int __attribute__((__always_inline__))
+static inline int __attribute__((__always_inline__, __nodebug__))
_mm_cvtsd_si32(__m128d a)
{
return __builtin_ia32_cvtsd2si(a);
}
/* Scalar double-to-float conversion: returns __builtin_ia32_cvtsd2ss(a, b), with a the float vector and b the double vector. */
-static inline __m128 __attribute__((__always_inline__))
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtsd_ss(__m128 a, __m128d b)
{
return __builtin_ia32_cvtsd2ss(a, b);
}
/* Scalar int-to-double conversion: returns __builtin_ia32_cvtsi2sd(a, b), with b the int to convert. */
-static inline __m128d __attribute__((__always_inline__))
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi32_sd(__m128d a, int b)
{
return __builtin_ia32_cvtsi2sd(a, b);
}
/* Scalar float-to-double conversion: returns __builtin_ia32_cvtss2sd(a, b), with a the double vector and b the float vector. */
-static inline __m128d __attribute__((__always_inline__))
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cvtss_sd(__m128d a, __m128 b)
{
return __builtin_ia32_cvtss2sd(a, b);
}
/* Converts packed doubles to 32-bit integers via __builtin_ia32_cvttpd2dq (the "cvtt" variant); result cast to __m128i. */
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvttpd_epi32(__m128d a)
{
return (__m128i)__builtin_ia32_cvttpd2dq(a);
}
/* Converts the scalar double in a to int via __builtin_ia32_cvttsd2si (the "cvtt" variant). */
-static inline int __attribute__((__always_inline__))
+static inline int __attribute__((__always_inline__, __nodebug__))
_mm_cvttsd_si32(__m128d a)
{
return __builtin_ia32_cvttsd2si(a);
}
/* Converts packed doubles to an __m64 via __builtin_ia32_cvtpd2pi; result cast to __m64. */
-static inline __m64 __attribute__((__always_inline__))
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpd_pi32(__m128d a)
{
return (__m64)__builtin_ia32_cvtpd2pi(a);
}
/* Converts packed doubles to an __m64 via __builtin_ia32_cvttpd2pi (the "cvtt" variant); result cast to __m64. */
-static inline __m64 __attribute__((__always_inline__))
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvttpd_pi32(__m128d a)
{
return (__m64)__builtin_ia32_cvttpd2pi(a);
}
/* Converts an __m64 (viewed as __v2si) to packed doubles via __builtin_ia32_cvtpi2pd. */
-static inline __m128d __attribute__((__always_inline__))
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cvtpi32_pd(__m64 a)
{
return __builtin_ia32_cvtpi2pd((__v2si)a);
}
/* Returns element 0 of a as a plain double. */
-static inline double __attribute__((__always_inline__))
+static inline double __attribute__((__always_inline__, __nodebug__))
_mm_cvtsd_f64(__m128d a)
{
return a[0];
}
/*
 * Loads a whole __m128d from dp by dereferencing it as a vector pointer.
 * The cast drops the const qualifier; the access is a plain vector load, so
 * dp presumably has to satisfy __m128d alignment - confirm with callers.
 */
-static inline __m128d __attribute__((__always_inline__))
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_load_pd(double const *dp)
{
return *(__m128d*)dp;
}
/* Reads the single double dp[0] and places it in both elements of the result. */
-static inline __m128d __attribute__((__always_inline__))
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_load1_pd(double const *dp)
{
return (__m128d){ dp[0], dp[0] };
}
/* Loads two doubles in reversed order: element 0 is dp[1], element 1 is dp[0]. */
-static inline __m128d __attribute__((__always_inline__))
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_loadr_pd(double const *dp)
{
return (__m128d){ dp[1], dp[0] };
}
/* Loads a double vector from dp through __builtin_ia32_loadupd (the unaligned-load builtin, per its name). */
-static inline __m128d __attribute__((__always_inline__))
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_loadu_pd(double const *dp)
{
return __builtin_ia32_loadupd(dp);
}
/* Loads one double: element 0 is *dp, element 1 is 0.0. */
-static inline __m128d __attribute__((__always_inline__))
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_load_sd(double const *dp)
{
return (__m128d){ *dp, 0.0 };
}
/*
 * Replace the upper element of a with the double stored at dp.
 *
 * a:  vector whose element 0 is kept.
 * dp: pointer to one double; it is read but not written.
 * Returns { a[0], *dp }.
 *
 * Only the single double at dp is read. Dereferencing dp as a whole __m128d
 * would touch 16 bytes, assume vector alignment and cast away const, while
 * callers only have to provide one double.
 */
-static inline __m128d __attribute__((__always_inline__))
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_loadh_pd(__m128d a, double const *dp)
{
return (__m128d){ a[0], dp[0] };
}
/*
 * Replace the lower element of a with the double stored at dp.
 *
 * a:  vector whose element 1 is kept.
 * dp: pointer to one double; it is read but not written.
 * Returns { *dp, a[1] }.
 *
 * Only the single double at dp is read. Dereferencing dp as a whole __m128d
 * would touch 16 bytes, assume vector alignment and cast away const, while
 * callers only have to provide one double.
 */
-static inline __m128d __attribute__((__always_inline__))
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_loadl_pd(__m128d a, double const *dp)
{
return (__m128d){ dp[0], a[1] };
}
/* Builds a vector with element 0 set to w and element 1 set to 0. */
-static inline __m128d __attribute__((__always_inline__))
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_set_sd(double w)
{
return (__m128d){ w, 0 };
}
/* Builds a vector with both elements set to w. */
-static inline __m128d __attribute__((__always_inline__))
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_set1_pd(double w)
{
return (__m128d){ w, w };
}
/*
 * Build a vector from two doubles given from high element to low element.
 *
 * w: value for element 1 (upper half).
 * x: value for element 0 (lower half).
 *
 * Intel defines _mm_set_pd with its arguments listed most-significant
 * first; the vector initializer is written lowest element first, so the
 * arguments appear in it as { x, w }.
 */
-static inline __m128d __attribute__((__always_inline__))
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_set_pd(double w, double x)
{
return (__m128d){ x, w };
}
/*
 * Build a vector from two doubles given in "reversed" (memory) order, i.e.
 * from low element to high element.
 *
 * w: value for element 0 (lower half).
 * x: value for element 1 (upper half).
 *
 * Intel defines _mm_setr_pd with its arguments listed least-significant
 * first, which is also the order of the vector initializer: { w, x }.
 */
-static inline __m128d __attribute__((__always_inline__))
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_setr_pd(double w, double x)
{
return (__m128d){ w, x };
}
/* Returns a double vector with both elements set to 0. */
-static inline __m128d __attribute__((__always_inline__))
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_setzero_pd(void)
{
return (__m128d){ 0, 0 };
}
/* Combines two vectors: element 0 comes from b, element 1 comes from a. */
-static inline __m128d __attribute__((__always_inline__))
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_move_sd(__m128d a, __m128d b)
{
return (__m128d){ b[0], a[1] };
}
/* Stores element 0 of a to dp[0]; nothing else is written. */
-static inline void __attribute__((__always_inline__))
+static inline void __attribute__((__always_inline__, __nodebug__))
_mm_store_sd(double *dp, __m128d a)
{
dp[0] = a[0];
}
/* Stores element 0 of a to both dp[0] and dp[1], using two scalar stores. */
-static inline void __attribute__((__always_inline__))
+static inline void __attribute__((__always_inline__, __nodebug__))
_mm_store1_pd(double *dp, __m128d a)
{
dp[0] = a[0];
dp[1] = a[0];
}
/*
 * Stores the whole vector a at dp by writing through dp as a __m128d pointer.
 * This is a plain vector store, so dp presumably has to satisfy __m128d
 * alignment - confirm with callers.
 */
-static inline void __attribute__((__always_inline__))
+static inline void __attribute__((__always_inline__, __nodebug__))
_mm_store_pd(double *dp, __m128d a)
{
*(__m128d *)dp = a;
}
/* Stores the vector a at dp through __builtin_ia32_storeupd (the unaligned-store builtin, per its name). */
-static inline void __attribute__((__always_inline__))
+static inline void __attribute__((__always_inline__, __nodebug__))
_mm_storeu_pd(double *dp, __m128d a)
{
__builtin_ia32_storeupd(dp, a);
}
/* Stores both elements of a in reversed order: dp[0] gets a[1], dp[1] gets a[0]. */
-static inline void __attribute__((__always_inline__))
+static inline void __attribute__((__always_inline__, __nodebug__))
_mm_storer_pd(double *dp, __m128d a)
{
dp[0] = a[1];
dp[1] = a[0];
}
/* Stores the upper element of a (a[1]) to dp[0]. */
-static inline void __attribute__((__always_inline__))
+static inline void __attribute__((__always_inline__, __nodebug__))
_mm_storeh_pd(double *dp, __m128d a)
{
dp[0] = a[1];
}
/* Stores the lower element of a (a[0]) to dp[0]. */
-static inline void __attribute__((__always_inline__))
+static inline void __attribute__((__always_inline__, __nodebug__))
_mm_storel_pd(double *dp, __m128d a)
{
dp[0] = a[0];
}
/* Element-wise add with both operands viewed as __v16qi (char lanes); the sum is cast back to __m128i. */
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_add_epi8(__m128i a, __m128i b)
{
return (__m128i)((__v16qi)a + (__v16qi)b);
}
/* Element-wise add with both operands viewed as __v8hi (short lanes); the sum is cast back to __m128i. */
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_add_epi16(__m128i a, __m128i b)
{
return (__m128i)((__v8hi)a + (__v8hi)b);
}
/* Element-wise add with both operands viewed as __v4si; the sum is cast back to __m128i. */
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_add_epi32(__m128i a, __m128i b)
{
return (__m128i)((__v4si)a + (__v4si)b);
}
/* Adds two __m64 values with the '+' operator applied directly to the __m64 type. */
-static inline __m64 __attribute__((__always_inline__))
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
_mm_add_si64(__m64 a, __m64 b)
{
return a + b;
}
/* Adds a and b with the '+' operator applied directly to __m128i (no lane cast). */
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_add_epi64(__m128i a, __m128i b)
{
return a + b;
}
/* Byte-lane add through __builtin_ia32_paddsb128 (the PADDSB builtin) on __v16qi views; result cast to __m128i. */
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_adds_epi8(__m128i a, __m128i b)
{
return (__m128i)__builtin_ia32_paddsb128((__v16qi)a, (__v16qi)b);
}
/* 16-bit-lane add through __builtin_ia32_paddsw128 (the PADDSW builtin) on __v8hi views; result cast to __m128i. */
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_adds_epi16(__m128i a, __m128i b)
{
return (__m128i)__builtin_ia32_paddsw128((__v8hi)a, (__v8hi)b);
}
/* Byte-lane add through __builtin_ia32_paddusb128 (the PADDUSB builtin) on __v16qi views; result cast to __m128i. */
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_adds_epu8(__m128i a, __m128i b)
{
return (__m128i)__builtin_ia32_paddusb128((__v16qi)a, (__v16qi)b);
}
/* 16-bit-lane add through __builtin_ia32_paddusw128 (the PADDUSW builtin) on __v8hi views; result cast to __m128i. */
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_adds_epu16(__m128i a, __m128i b)
{
return (__m128i)__builtin_ia32_paddusw128((__v8hi)a, (__v8hi)b);
}
/* Byte-lane average through __builtin_ia32_pavgb128 on __v16qi views; result cast to __m128i. */
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_avg_epu8(__m128i a, __m128i b)
{
return (__m128i)__builtin_ia32_pavgb128((__v16qi)a, (__v16qi)b);
}
/* 16-bit-lane average through __builtin_ia32_pavgw128 on __v8hi views; result cast to __m128i. */
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_avg_epu16(__m128i a, __m128i b)
{
return (__m128i)__builtin_ia32_pavgw128((__v8hi)a, (__v8hi)b);
}
/* Multiply-add of 16-bit lanes through __builtin_ia32_pmaddwd128 on __v8hi views; result cast to __m128i. */
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_madd_epi16(__m128i a, __m128i b)
{
return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)a, (__v8hi)b);
}
/* 16-bit-lane maximum through __builtin_ia32_pmaxsw128 on __v8hi views; result cast to __m128i. */
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_max_epi16(__m128i a, __m128i b)
{
return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)a, (__v8hi)b);
}
/* Byte-lane maximum through __builtin_ia32_pmaxub128 on __v16qi views; result cast to __m128i. */
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_max_epu8(__m128i a, __m128i b)
{
return (__m128i)__builtin_ia32_pmaxub128((__v16qi)a, (__v16qi)b);
}
/* 16-bit-lane minimum through __builtin_ia32_pminsw128 on __v8hi views; result cast to __m128i. */
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_min_epi16(__m128i a, __m128i b)
{
return (__m128i)__builtin_ia32_pminsw128((__v8hi)a, (__v8hi)b);
}
/* Byte-lane minimum through __builtin_ia32_pminub128 on __v16qi views; result cast to __m128i. */
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_min_epu8(__m128i a, __m128i b)
{
return (__m128i)__builtin_ia32_pminub128((__v16qi)a, (__v16qi)b);
}
/* 16-bit-lane multiply through __builtin_ia32_pmulhw128 (the PMULHW builtin) on __v8hi views; result cast to __m128i. */
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_mulhi_epi16(__m128i a, __m128i b)
{
return (__m128i)__builtin_ia32_pmulhw128((__v8hi)a, (__v8hi)b);
}
/* 16-bit-lane multiply through __builtin_ia32_pmulhuw128 (the PMULHUW builtin) on __v8hi views; result cast to __m128i. */
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_mulhi_epu16(__m128i a, __m128i b)
{
return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)a, (__v8hi)b);
}
/* 16-bit-lane multiply through __builtin_ia32_pmullw128 (the PMULLW builtin) on __v8hi views; result cast to __m128i. */
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_mullo_epi16(__m128i a, __m128i b)
{
return (__m128i)__builtin_ia32_pmullw128((__v8hi)a, (__v8hi)b);
}
/* Multiplies two __m64 values viewed as __v2si through __builtin_ia32_pmuludq; the builtin result is returned without a cast. */
-static inline __m64 __attribute__((__always_inline__))
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
_mm_mul_su32(__m64 a, __m64 b)
{
return __builtin_ia32_pmuludq((__v2si)a, (__v2si)b);
}
/* Multiplies two __m128i values viewed as __v4si through __builtin_ia32_pmuludq128; the builtin result is returned without a cast. */
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_mul_epu32(__m128i a, __m128i b)
{
return __builtin_ia32_pmuludq128((__v4si)a, (__v4si)b);
}
/*
 * Forwards a and b, viewed as __v16qi, to __builtin_ia32_psadbw128 (the
 * PSADBW builtin) and returns its result.
 * NOTE(review): Intel documents this intrinsic as _mm_sad_epu8; the name
 * here lacks the trailing '8' - confirm whether that is intentional before
 * renaming, since callers may depend on the current spelling.
 */
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sad_epu(__m128i a, __m128i b)
{
return __builtin_ia32_psadbw128((__v16qi)a, (__v16qi)b);
}
/* Element-wise subtract with both operands viewed as __v16qi (char lanes); the difference is cast back to __m128i. */
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sub_epi8(__m128i a, __m128i b)
{
return (__m128i)((__v16qi)a - (__v16qi)b);
}
/* Element-wise subtract with both operands viewed as __v8hi (short lanes); the difference is cast back to __m128i. */
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sub_epi16(__m128i a, __m128i b)
{
return (__m128i)((__v8hi)a - (__v8hi)b);
}
/* Element-wise subtract with both operands viewed as __v4si; the difference is cast back to __m128i. */
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sub_epi32(__m128i a, __m128i b)
{
return (__m128i)((__v4si)a - (__v4si)b);
}
/* Subtracts two __m64 values with the '-' operator applied directly to the __m64 type. */
-static inline __m64 __attribute__((__always_inline__))
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
_mm_sub_si64(__m64 a, __m64 b)
{
return a - b;
}
/* Subtracts b from a with the '-' operator applied directly to __m128i (no lane cast). */
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sub_epi64(__m128i a, __m128i b)
{
return a - b;
}
/* Byte-lane subtract through __builtin_ia32_psubsb128 (the PSUBSB builtin) on __v16qi views; result cast to __m128i. */
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_subs_epi8(__m128i a, __m128i b)
{
return (__m128i)__builtin_ia32_psubsb128((__v16qi)a, (__v16qi)b);
}
/* 16-bit-lane subtract through __builtin_ia32_psubsw128 (the PSUBSW builtin) on __v8hi views; result cast to __m128i. */
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_subs_epi16(__m128i a, __m128i b)
{
return (__m128i)__builtin_ia32_psubsw128((__v8hi)a, (__v8hi)b);
}
/* Byte-lane subtract through __builtin_ia32_psubusb128 (the PSUBUSB builtin) on __v16qi views; result cast to __m128i. */
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_subs_epu8(__m128i a, __m128i b)
{
return (__m128i)__builtin_ia32_psubusb128((__v16qi)a, (__v16qi)b);
}
/* 16-bit-lane subtract through __builtin_ia32_psubusw128 (the PSUBUSW builtin) on __v8hi views; result cast to __m128i. */
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_subs_epu16(__m128i a, __m128i b)
{
return (__m128i)__builtin_ia32_psubusw128((__v8hi)a, (__v8hi)b);
}
/* Bitwise AND of two 128-bit integer vectors: returns __builtin_ia32_pand128(a, b). */
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_and_si128(__m128i a, __m128i b)
{
return __builtin_ia32_pand128(a, b);
}
/*
 * Bitwise AND-NOT of two 128-bit integer vectors: returns
 * __builtin_ia32_pandn128(a, b).
 * NOTE(review): presumably the first operand (a) is the one complemented, as
 * with the PANDN instruction - confirm against the builtin.
 */
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_andnot_si128(__m128i a, __m128i b)
{
return __builtin_ia32_pandn128(a, b);
}
/* Bitwise OR of two 128-bit integer vectors: returns __builtin_ia32_por128(a, b). */
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_or_si128(__m128i a, __m128i b)
{
return __builtin_ia32_por128(a, b);
}
/* Bitwise XOR of two 128-bit integer vectors: returns __builtin_ia32_pxor128(a, b). */
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_xor_si128(__m128i a, __m128i b)
{
return __builtin_ia32_pxor128(a, b);
}
/*
 * Whole-register left shift through __builtin_ia32_pslldqi128.
 * imm is multiplied by 8 before being handed to the builtin (presumably a
 * byte count converted to a bit count - confirm against the builtin).
 */
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_slli_si128(__m128i a, int imm)
{
return __builtin_ia32_pslldqi128(a, imm * 8);
}
/* Left shift of 16-bit lanes by the int count, through __builtin_ia32_psllwi128 on a __v8hi view; result cast to __m128i. */
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_slli_epi16(__m128i a, int count)
{
return (__m128i)__builtin_ia32_psllwi128((__v8hi)a, count);
}
/* Left shift of 16-bit lanes with the count supplied as a vector, through __builtin_ia32_psllw128; result cast to __m128i. */
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sll_epi16(__m128i a, __m128i count)
{
return (__m128i)__builtin_ia32_psllw128((__v8hi)a, (__v8hi)count);
}
/* Left shift of 32-bit lanes by the int count, through __builtin_ia32_pslldi128 on a __v4si view; result cast to __m128i. */
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_slli_epi32(__m128i a, int count)
{
return (__m128i)__builtin_ia32_pslldi128((__v4si)a, count);
}
/* Left shift of 32-bit lanes with the count supplied as a vector, through __builtin_ia32_pslld128; result cast to __m128i. */
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sll_epi32(__m128i a, __m128i count)
{
return (__m128i)__builtin_ia32_pslld128((__v4si)a, (__v4si)count);
}
/* Left shift of 64-bit lanes by the int count: returns __builtin_ia32_psllqi128(a, count). */
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_slli_epi64(__m128i a, int count)
{
return __builtin_ia32_psllqi128(a, count);
}
/* Left shift of 64-bit lanes with the count supplied as a vector: returns __builtin_ia32_psllq128(a, count). */
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sll_epi64(__m128i a, __m128i count)
{
return __builtin_ia32_psllq128(a, count);
}
/* Right shift of 16-bit lanes by the int count, through __builtin_ia32_psrawi128 (the PSRAW builtin); result cast to __m128i. */
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srai_epi16(__m128i a, int count)
{
return (__m128i)__builtin_ia32_psrawi128((__v8hi)a, count);
}
/* Right shift of 16-bit lanes with a vector count, through __builtin_ia32_psraw128 (the PSRAW builtin); result cast to __m128i. */
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sra_epi16(__m128i a, __m128i count)
{
return (__m128i)__builtin_ia32_psraw128((__v8hi)a, (__v8hi)count);
}
/* Right shift of 32-bit lanes by the int count, through __builtin_ia32_psradi128 (the PSRAD builtin); result cast to __m128i. */
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srai_epi32(__m128i a, int count)
{
return (__m128i)__builtin_ia32_psradi128((__v4si)a, count);
}
/* Right shift of 32-bit lanes with a vector count, through __builtin_ia32_psrad128 (the PSRAD builtin); result cast to __m128i. */
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_sra_epi32(__m128i a, __m128i count)
{
return (__m128i)__builtin_ia32_psrad128((__v4si)a, (__v4si)count);
}
/*
 * Whole-register right shift through __builtin_ia32_psrldqi128.
 * imm is multiplied by 8 before being handed to the builtin (presumably a
 * byte count converted to a bit count - confirm against the builtin).
 */
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srli_si128(__m128i a, int imm)
{
return __builtin_ia32_psrldqi128(a, imm * 8);
}
/* Right shift of 16-bit lanes by the int count, through __builtin_ia32_psrlwi128 (the PSRLW builtin); result cast to __m128i. */
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srli_epi16(__m128i a, int count)
{
return (__m128i)__builtin_ia32_psrlwi128((__v8hi)a, count);
}
/* Right shift of 16-bit lanes with a vector count, through __builtin_ia32_psrlw128 (the PSRLW builtin); result cast to __m128i. */
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srl_epi16(__m128i a, __m128i count)
{
return (__m128i)__builtin_ia32_psrlw128((__v8hi)a, (__v8hi)count);
}
/* Right shift of 32-bit lanes by the int count, through __builtin_ia32_psrldi128 (the PSRLD builtin); result cast to __m128i. */
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srli_epi32(__m128i a, int count)
{
return (__m128i)__builtin_ia32_psrldi128((__v4si)a, count);
}
/* Right shift of 32-bit lanes with a vector count, through __builtin_ia32_psrld128 (the PSRLD builtin); result cast to __m128i. */
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srl_epi32(__m128i a, __m128i count)
{
return (__m128i)__builtin_ia32_psrld128((__v4si)a, (__v4si)count);
}
/* Right shift of 64-bit lanes by the int count: returns __builtin_ia32_psrlqi128(a, count). */
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srli_epi64(__m128i a, int count)
{
return __builtin_ia32_psrlqi128(a, count);
}
/* Right shift of 64-bit lanes with a vector count: returns __builtin_ia32_psrlq128(a, count). */
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_srl_epi64(__m128i a, __m128i count)
{
return __builtin_ia32_psrlq128(a, count);
}
/* Byte-lane equality compare through __builtin_ia32_pcmpeqb128 on __v16qi views; result cast to __m128i. */
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_epi8(__m128i a, __m128i b)
{
return (__m128i)__builtin_ia32_pcmpeqb128((__v16qi)a, (__v16qi)b);
}
/* 16-bit-lane equality compare through __builtin_ia32_pcmpeqw128 on __v8hi views; result cast to __m128i. */
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_epi16(__m128i a, __m128i b)
{
return (__m128i)__builtin_ia32_pcmpeqw128((__v8hi)a, (__v8hi)b);
}
/* 32-bit-lane equality compare through __builtin_ia32_pcmpeqd128 on __v4si views; result cast to __m128i. */
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_epi32(__m128i a, __m128i b)
{
return (__m128i)__builtin_ia32_pcmpeqd128((__v4si)a, (__v4si)b);
}
/* Byte-lane greater-than compare (a > b) through __builtin_ia32_pcmpgtb128 on __v16qi views; result cast to __m128i. */
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_epi8(__m128i a, __m128i b)
{
return (__m128i)__builtin_ia32_pcmpgtb128((__v16qi)a, (__v16qi)b);
}
/* 16-bit-lane greater-than compare (a > b) through __builtin_ia32_pcmpgtw128 on __v8hi views; result cast to __m128i. */
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_epi16(__m128i a, __m128i b)
{
return (__m128i)__builtin_ia32_pcmpgtw128((__v8hi)a, (__v8hi)b);
}
/* 32-bit-lane greater-than compare (a > b) through __builtin_ia32_pcmpgtd128 on __v4si views; result cast to __m128i. */
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_epi32(__m128i a, __m128i b)
{
return (__m128i)__builtin_ia32_pcmpgtd128((__v4si)a, (__v4si)b);
}
/* Byte-lane less-than compare, expressed as greater-than with the operands swapped (b > a). */
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_epi8(__m128i a, __m128i b)
{
return (__m128i)__builtin_ia32_pcmpgtb128((__v16qi)b, (__v16qi)a);
}
/* 16-bit-lane less-than compare, expressed as greater-than with the operands swapped (b > a). */
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_epi16(__m128i a, __m128i b)
{
return (__m128i)__builtin_ia32_pcmpgtw128((__v8hi)b, (__v8hi)a);
}
/* 32-bit-lane less-than compare, expressed as greater-than with the operands swapped (b > a). */
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_epi32(__m128i a, __m128i b)
{
return (__m128i)__builtin_ia32_pcmpgtd128((__v4si)b, (__v4si)a);
}
#ifdef __x86_64__
/* Scalar 64-bit-integer-to-double conversion: returns __builtin_ia32_cvtsi642sd(a, b). Only compiled under __x86_64__. */
-static inline __m128d __attribute__((__always_inline__))
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi64_sd(__m128d a, long long b)
{
return __builtin_ia32_cvtsi642sd(a, b);
}
/* Converts the scalar double in a to long long via __builtin_ia32_cvtsd2si64. Only compiled under __x86_64__. */
-static inline long long __attribute__((__always_inline__))
+static inline long long __attribute__((__always_inline__, __nodebug__))
_mm_cvtsd_si64(__m128d a)
{
return __builtin_ia32_cvtsd2si64(a);
}
/* Converts the scalar double in a to long long via __builtin_ia32_cvttsd2si64 (the "cvtt" variant). Only compiled under __x86_64__. */
-static inline long long __attribute__((__always_inline__))
+static inline long long __attribute__((__always_inline__, __nodebug__))
_mm_cvttsd_si64(__m128d a)
{
return __builtin_ia32_cvttsd2si64(a);
}
#endif
-static inline __m128 __attribute__((__always_inline__))
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtepi32_ps(__m128i a)
{
return __builtin_ia32_cvtdq2ps((__v4si)a);
}
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtps_epi32(__m128 a)
{
return (__m128i)__builtin_ia32_cvtps2dq(a);
}
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvttps_epi32(__m128 a)
{
return (__m128i)__builtin_ia32_cvttps2dq(a);
}
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi32_si128(int a)
{
return (__m128i)(__v4si){ a, 0, 0, 0 };
}
#ifdef __x86_64__
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi64_si128(long long a)
{
return (__m128i){ a, 0 };
}
#endif
-static inline int __attribute__((__always_inline__))
+static inline int __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi128_si32(__m128i a)
{
__v4si b = (__v4si)a;
}
#ifdef __x86_64__
-static inline long long __attribute__((__always_inline__))
+static inline long long __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi128_si64(__m128i a)
{
return a[0];
}
#endif
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_load_si128(__m128i const *p)
{
return *p;
}
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_loadu_si128(__m128i const *p)
{
return (__m128i)__builtin_ia32_loaddqu((char const *)p);
}
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_loadl_epi64(__m128i const *p)
{
return (__m128i)__builtin_ia32_loadlv4si((__v2si *)p);
}
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set_epi64(__m64 q1, __m64 q0)
{
return (__m128i){ (long long)q0, (long long)q1 };
}
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set_epi32(int i3, int i2, int i1, int i0)
{
return (__m128i)(__v4si){ i0, i1, i2, i3};
}
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set_epi16(short w7, short w6, short w5, short w4, short w3, short w2, short w1, short w0)
{
return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 };
}
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set_epi8(char b15, char b14, char b13, char b12, char b11, char b10, char b9, char b8, char b7, char b6, char b5, char b4, char b3, char b2, char b1, char b0)
{
return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 };
}
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set1_epi64(__m64 q)
{
return (__m128i){ (long long)q, (long long)q };
}
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set1_epi32(int i)
{
return (__m128i)(__v4si){ i, i, i, i };
}
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set1_epi16(short w)
{
return (__m128i)(__v8hi){ w, w, w, w, w, w, w, w };
}
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_set1_epi8(char b)
{
return (__m128i)(__v16qi){ b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b };
}
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_setr_epi64(__m64 q0, __m64 q1)
{
return (__m128i){ (long long)q0, (long long)q1 };
}
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_setr_epi32(int i0, int i1, int i2, int i3)
{
return (__m128i)(__v4si){ i0, i1, i2, i3};
}
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_setr_epi16(short w0, short w1, short w2, short w3, short w4, short w5, short w6, short w7)
{
return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 };
}
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_setr_epi8(char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7, char b8, char b9, char b10, char b11, char b12, char b13, char b14, char b15)
{
return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 };
}
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_setzero_si128(void)
{
return (__m128i){ 0LL, 0LL };
}
-static inline void __attribute__((__always_inline__))
+static inline void __attribute__((__always_inline__, __nodebug__))
_mm_store_si128(__m128i *p, __m128i b)
{
*p = b;
}
-static inline void __attribute__((__always_inline__))
+static inline void __attribute__((__always_inline__, __nodebug__))
_mm_storeu_si128(__m128i *p, __m128i b)
{
__builtin_ia32_storedqu((char *)p, (__v16qi)b);
}
-static inline void __attribute__((__always_inline__))
+static inline void __attribute__((__always_inline__, __nodebug__))
_mm_maskmoveu_si128(__m128i d, __m128i n, char *p)
{
__builtin_ia32_maskmovdqu((__v16qi)d, (__v16qi)n, p);
}
-static inline void __attribute__((__always_inline__))
+static inline void __attribute__((__always_inline__, __nodebug__))
_mm_storel_epi64(__m128i *p, __m128i a)
{
__builtin_ia32_storelv4si((__v2si *)p, a);
}
-static inline void __attribute__((__always_inline__))
+static inline void __attribute__((__always_inline__, __nodebug__))
_mm_stream_pd(double *p, __m128d a)
{
__builtin_ia32_movntpd(p, a);
}
-static inline void __attribute__((__always_inline__))
+static inline void __attribute__((__always_inline__, __nodebug__))
_mm_stream_si128(__m128i *p, __m128i a)
{
__builtin_ia32_movntdq(p, a);
}
-static inline void __attribute__((__always_inline__))
+static inline void __attribute__((__always_inline__, __nodebug__))
_mm_stream_si32(int *p, int a)
{
__builtin_ia32_movnti(p, a);
}
-static inline void __attribute__((__always_inline__))
+static inline void __attribute__((__always_inline__, __nodebug__))
_mm_clflush(void const *p)
{
__builtin_ia32_clflush(p);
}
-static inline void __attribute__((__always_inline__))
+static inline void __attribute__((__always_inline__, __nodebug__))
_mm_lfence(void)
{
__builtin_ia32_lfence();
}
-static inline void __attribute__((__always_inline__))
+static inline void __attribute__((__always_inline__, __nodebug__))
_mm_mfence(void)
{
__builtin_ia32_mfence();
}
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_packs_epi16(__m128i a, __m128i b)
{
return (__m128i)__builtin_ia32_packsswb128((__v8hi)a, (__v8hi)b);
}
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_packs_epi32(__m128i a, __m128i b)
{
return (__m128i)__builtin_ia32_packssdw128((__v4si)a, (__v4si)b);
}
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_packus_epi16(__m128i a, __m128i b)
{
return (__m128i)__builtin_ia32_packuswb128((__v8hi)a, (__v8hi)b);
}
-static inline int __attribute__((__always_inline__))
+static inline int __attribute__((__always_inline__, __nodebug__))
_mm_extract_epi16(__m128i a, int imm)
{
__v8hi b = (__v8hi)a;
return b[imm];
}
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_insert_epi16(__m128i a, int b, int imm)
{
return (__m128i)__builtin_ia32_vec_set_v8hi((__v8hi)a, b, imm);
}
-static inline int __attribute__((__always_inline__))
+static inline int __attribute__((__always_inline__, __nodebug__))
_mm_movemask_epi8(__m128i a)
{
return __builtin_ia32_pmovmskb128((__v16qi)a);
#define _mm_shufflehi_epi16(a, imm) ((__m128i)__builtin_ia32_pshufhw((__v8hi)(a), (imm)))
#define _mm_shufflelo_epi16(a, imm) ((__m128i)__builtin_ia32_pshuflw((__v8hi)(a), (imm)))
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_unpackhi_epi8(__m128i a, __m128i b)
{
return (__m128i)__builtin_shufflevector((__v16qi)a, (__v16qi)b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
}
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_unpackhi_epi16(__m128i a, __m128i b)
{
return (__m128i)__builtin_shufflevector((__v8hi)a, (__v8hi)b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
}
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_unpackhi_epi32(__m128i a, __m128i b)
{
return (__m128i)__builtin_shufflevector((__v4si)a, (__v4si)b, 2, 4+2, 3, 4+3);
}
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_unpackhi_epi64(__m128i a, __m128i b)
{
return (__m128i)__builtin_shufflevector(a, b, 1, 2+1);
}
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_unpacklo_epi8(__m128i a, __m128i b)
{
return (__m128i)__builtin_shufflevector((__v16qi)a, (__v16qi)b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7);
}
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_unpacklo_epi16(__m128i a, __m128i b)
{
return (__m128i)__builtin_shufflevector((__v8hi)a, (__v8hi)b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
}
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_unpacklo_epi32(__m128i a, __m128i b)
{
return (__m128i)__builtin_shufflevector((__v4si)a, (__v4si)b, 0, 4+0, 1, 4+1);
}
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_unpacklo_epi64(__m128i a, __m128i b)
{
return (__m128i)__builtin_shufflevector(a, b, 0, 2+0);
}
-static inline __m64 __attribute__((__always_inline__))
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
_mm_movepi64_pi64(__m128i a)
{
return (__m64)a[0];
}
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_movpi64_pi64(__m64 a)
{
return (__m128i){ (long long)a, 0 };
}
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_move_epi64(__m128i a)
{
return (__m128i){ a[0], 0 };
}
-static inline __m128d __attribute__((__always_inline__))
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_unpackhi_pd(__m128d a, __m128d b)
{
return __builtin_shufflevector(a, b, 1, 2+1);
}
-static inline __m128d __attribute__((__always_inline__))
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_unpacklo_pd(__m128d a, __m128d b)
{
return __builtin_shufflevector(a, b, 0, 2+0);
}
-static inline int __attribute__((__always_inline__))
+static inline int __attribute__((__always_inline__, __nodebug__))
_mm_movemask_pd(__m128d a)
{
return __builtin_ia32_movmskpd(a);
#define _mm_shuffle_pd(a, b, i) (__builtin_ia32_shufpd((a), (b), (i)))
-static inline __m128 __attribute__((__always_inline__))
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_castpd_ps(__m128d in)
{
return (__m128)in;
}
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_castpd_si128(__m128d in)
{
return (__m128i)in;
}
-static inline __m128d __attribute__((__always_inline__))
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_castps_pd(__m128 in)
{
return (__m128d)in;
}
-static inline __m128i __attribute__((__always_inline__))
+static inline __m128i __attribute__((__always_inline__, __nodebug__))
_mm_castps_si128(__m128 in)
{
return (__m128i)in;
}
-static inline __m128 __attribute__((__always_inline__))
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_castsi128_ps(__m128i in)
{
return (__m128)in;
}
-static inline __m128d __attribute__((__always_inline__))
+static inline __m128d __attribute__((__always_inline__, __nodebug__))
_mm_castsi128_pd(__m128i in)
{
return (__m128d)in;
}
-static inline void __attribute__((__always_inline__))
+static inline void __attribute__((__always_inline__, __nodebug__))
_mm_pause(void)
{
__asm__ volatile ("pause");
typedef short __v4hi __attribute__((__vector_size__(8)));
typedef char __v8qi __attribute__((__vector_size__(8)));
-static inline void __attribute__((__always_inline__)) _mm_empty(void)
+static inline void __attribute__((__always_inline__, __nodebug__)) _mm_empty(void)
{
__builtin_ia32_emms();
}
-static inline __m64 __attribute__((__always_inline__)) _mm_cvtsi32_si64(int __i)
+static inline __m64 __attribute__((__always_inline__, __nodebug__)) _mm_cvtsi32_si64(int __i)
{
return (__m64)(__v2si){__i, 0};
}
-static inline int __attribute__((__always_inline__)) _mm_cvtsi64_si32(__m64 __m)
+static inline int __attribute__((__always_inline__, __nodebug__)) _mm_cvtsi64_si32(__m64 __m)
{
__v2si __mmx_var2 = (__v2si)__m;
return __mmx_var2[0];
}
-static inline __m64 __attribute__((__always_inline__)) _mm_cvtsi64_m64(long long __i)
+static inline __m64 __attribute__((__always_inline__, __nodebug__)) _mm_cvtsi64_m64(long long __i)
{
return (__m64)__i;
}
-static inline long long __attribute__((__always_inline__)) _mm_cvtm64_si64(__m64 __m)
+static inline long long __attribute__((__always_inline__, __nodebug__)) _mm_cvtm64_si64(__m64 __m)
{
return (long long)__m;
}
-static inline __m64 __attribute__((__always_inline__)) _mm_packs_pi16(__m64 __m1, __m64 __m2)
+static inline __m64 __attribute__((__always_inline__, __nodebug__)) _mm_packs_pi16(__m64 __m1, __m64 __m2)
{
return (__m64)__builtin_ia32_packsswb((__v4hi)__m1, (__v4hi)__m2);
}
-static inline __m64 __attribute__((__always_inline__)) _mm_packs_pi32(__m64 __m1, __m64 __m2)
+static inline __m64 __attribute__((__always_inline__, __nodebug__)) _mm_packs_pi32(__m64 __m1, __m64 __m2)
{
return (__m64)__builtin_ia32_packssdw((__v2si)__m1, (__v2si)__m2);
}
-static inline __m64 __attribute__((__always_inline__)) _mm_packs_pu16(__m64 __m1, __m64 __m2)
+static inline __m64 __attribute__((__always_inline__, __nodebug__)) _mm_packs_pu16(__m64 __m1, __m64 __m2)
{
return (__m64)__builtin_ia32_packuswb((__v4hi)__m1, (__v4hi)__m2);
}
-static inline __m64 __attribute__((__always_inline__)) _mm_unpackhi_pi8(__m64 __m1, __m64 __m2)
+static inline __m64 __attribute__((__always_inline__, __nodebug__)) _mm_unpackhi_pi8(__m64 __m1, __m64 __m2)
{
return (__m64)__builtin_shufflevector((__v8qi)__m1, (__v8qi)__m2, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
}
-static inline __m64 __attribute__((__always_inline__)) _mm_unpackhi_pi16(__m64 __m1, __m64 __m2)
+static inline __m64 __attribute__((__always_inline__, __nodebug__)) _mm_unpackhi_pi16(__m64 __m1, __m64 __m2)
{
return (__m64)__builtin_shufflevector((__v4hi)__m1, (__v4hi)__m2, 2, 4+2, 3, 4+3);
}
-static inline __m64 __attribute__((__always_inline__)) _mm_unpackhi_pi32(__m64 __m1, __m64 __m2)
+static inline __m64 __attribute__((__always_inline__, __nodebug__)) _mm_unpackhi_pi32(__m64 __m1, __m64 __m2)
{
return (__m64)__builtin_shufflevector((__v2si)__m1, (__v2si)__m2, 1, 2+1);
}
-static inline __m64 __attribute__((__always_inline__)) _mm_unpacklo_pi8(__m64 __m1, __m64 __m2)
+static inline __m64 __attribute__((__always_inline__, __nodebug__)) _mm_unpacklo_pi8(__m64 __m1, __m64 __m2)
{
return (__m64)__builtin_shufflevector((__v8qi)__m1, (__v8qi)__m2, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
}
-static inline __m64 __attribute__((__always_inline__)) _mm_unpacklo_pi16(__m64 __m1, __m64 __m2)
+static inline __m64 __attribute__((__always_inline__, __nodebug__)) _mm_unpacklo_pi16(__m64 __m1, __m64 __m2)
{
return (__m64)__builtin_shufflevector((__v4hi)__m1, (__v4hi)__m2, 0, 4+0, 1, 4+1);
}
-static inline __m64 __attribute__((__always_inline__)) _mm_unpacklo_pi32(__m64 __m1, __m64 __m2)
+static inline __m64 __attribute__((__always_inline__, __nodebug__)) _mm_unpacklo_pi32(__m64 __m1, __m64 __m2)
{
return (__m64)__builtin_shufflevector((__v2si)__m1, (__v2si)__m2, 0, 2+0);
}
-static inline __m64 __attribute__((__always_inline__)) _mm_add_pi8(__m64 __m1, __m64 __m2)
+static inline __m64 __attribute__((__always_inline__, __nodebug__)) _mm_add_pi8(__m64 __m1, __m64 __m2)
{
return (__m64)((__v8qi)__m1 + (__v8qi)__m2);
}
-static inline __m64 __attribute__((__always_inline__)) _mm_add_pi16(__m64 __m1, __m64 __m2)
+static inline __m64 __attribute__((__always_inline__, __nodebug__)) _mm_add_pi16(__m64 __m1, __m64 __m2)
{
return (__m64)((__v4hi)__m1 + (__v4hi)__m2);
}
-static inline __m64 __attribute__((__always_inline__)) _mm_add_pi32(__m64 __m1, __m64 __m2)
+static inline __m64 __attribute__((__always_inline__, __nodebug__)) _mm_add_pi32(__m64 __m1, __m64 __m2)
{
return (__m64)((__v2si)__m1 + (__v2si)__m2);
}
-static inline __m64 __attribute__((__always_inline__)) _mm_adds_pi8(__m64 __m1, __m64 __m2)
+static inline __m64 __attribute__((__always_inline__, __nodebug__)) _mm_adds_pi8(__m64 __m1, __m64 __m2)
{
return (__m64)__builtin_ia32_paddsb((__v8qi)__m1, (__v8qi)__m2);
}
-static inline __m64 __attribute__((__always_inline__)) _mm_adds_pi16(__m64 __m1, __m64 __m2)
+static inline __m64 __attribute__((__always_inline__, __nodebug__)) _mm_adds_pi16(__m64 __m1, __m64 __m2)
{
return (__m64)__builtin_ia32_paddsw((__v4hi)__m1, (__v4hi)__m2);
}
-static inline __m64 __attribute__((__always_inline__)) _mm_adds_pu8(__m64 __m1, __m64 __m2)
+static inline __m64 __attribute__((__always_inline__, __nodebug__)) _mm_adds_pu8(__m64 __m1, __m64 __m2)
{
return (__m64)__builtin_ia32_paddusb((__v8qi)__m1, (__v8qi)__m2);
}
-static inline __m64 __attribute__((__always_inline__)) _mm_adds_pu16(__m64 __m1, __m64 __m2)
+static inline __m64 __attribute__((__always_inline__, __nodebug__)) _mm_adds_pu16(__m64 __m1, __m64 __m2)
{
return (__m64)__builtin_ia32_paddusw((__v4hi)__m1, (__v4hi)__m2);
}
-static inline __m64 __attribute__((__always_inline__)) _mm_sub_pi8(__m64 __m1, __m64 __m2)
+static inline __m64 __attribute__((__always_inline__, __nodebug__)) _mm_sub_pi8(__m64 __m1, __m64 __m2)
{
return (__m64)((__v8qi)__m1 - (__v8qi)__m2);
}
-static inline __m64 __attribute__((__always_inline__)) _mm_sub_pi16(__m64 __m1, __m64 __m2)
+static inline __m64 __attribute__((__always_inline__, __nodebug__)) _mm_sub_pi16(__m64 __m1, __m64 __m2)
{
return (__m64)((__v4hi)__m1 - (__v4hi)__m2);
}
-static inline __m64 __attribute__((__always_inline__)) _mm_sub_pi32(__m64 __m1, __m64 __m2)
+static inline __m64 __attribute__((__always_inline__, __nodebug__)) _mm_sub_pi32(__m64 __m1, __m64 __m2)
{
return (__m64)((__v2si)__m1 - (__v2si)__m2);
}
-static inline __m64 __attribute__((__always_inline__)) _mm_subs_pi8(__m64 __m1, __m64 __m2)
+static inline __m64 __attribute__((__always_inline__, __nodebug__)) _mm_subs_pi8(__m64 __m1, __m64 __m2)
{
return (__m64)__builtin_ia32_psubsb((__v8qi)__m1, (__v8qi)__m2);
}
-static inline __m64 __attribute__((__always_inline__)) _mm_subs_pi16(__m64 __m1, __m64 __m2)
+static inline __m64 __attribute__((__always_inline__, __nodebug__)) _mm_subs_pi16(__m64 __m1, __m64 __m2)
{
return (__m64)__builtin_ia32_psubsw((__v4hi)__m1, (__v4hi)__m2);
}
-static inline __m64 __attribute__((__always_inline__)) _mm_subs_pu8(__m64 __m1, __m64 __m2)
+static inline __m64 __attribute__((__always_inline__, __nodebug__)) _mm_subs_pu8(__m64 __m1, __m64 __m2)
{
return (__m64)__builtin_ia32_psubusb((__v8qi)__m1, (__v8qi)__m2);
}
-static inline __m64 __attribute__((__always_inline__)) _mm_subs_pu16(__m64 __m1, __m64 __m2)
+static inline __m64 __attribute__((__always_inline__, __nodebug__)) _mm_subs_pu16(__m64 __m1, __m64 __m2)
{
return (__m64)__builtin_ia32_psubusw((__v4hi)__m1, (__v4hi)__m2);
}
-static inline __m64 __attribute__((__always_inline__)) _mm_madd_pi16(__m64 __m1, __m64 __m2)
+static inline __m64 __attribute__((__always_inline__, __nodebug__)) _mm_madd_pi16(__m64 __m1, __m64 __m2)
{
return (__m64)__builtin_ia32_pmaddwd((__v4hi)__m1, (__v4hi)__m2);
}
-static inline __m64 __attribute__((__always_inline__)) _mm_mulhi_pi16(__m64 __m1, __m64 __m2)
+static inline __m64 __attribute__((__always_inline__, __nodebug__)) _mm_mulhi_pi16(__m64 __m1, __m64 __m2)
{
return (__m64)__builtin_ia32_pmulhw((__v4hi)__m1, (__v4hi)__m2);
}
-static inline __m64 __attribute__((__always_inline__)) _mm_mullo_pi16(__m64 __m1, __m64 __m2)
+static inline __m64 __attribute__((__always_inline__, __nodebug__)) _mm_mullo_pi16(__m64 __m1, __m64 __m2)
{
return (__m64)((__v4hi)__m1 * (__v4hi)__m2);
}
-static inline __m64 __attribute__((__always_inline__)) _mm_sll_pi16(__m64 __m, __m64 __count)
+static inline __m64 __attribute__((__always_inline__, __nodebug__)) _mm_sll_pi16(__m64 __m, __m64 __count)
{
return (__m64)__builtin_ia32_psllw((__v4hi)__m, __count);
}
-static inline __m64 __attribute__((__always_inline__)) _mm_slli_pi16(__m64 __m, int __count)
+static inline __m64 __attribute__((__always_inline__, __nodebug__)) _mm_slli_pi16(__m64 __m, int __count)
{
return (__m64)__builtin_ia32_psllwi((__v4hi)__m, __count);
}
-static inline __m64 __attribute__((__always_inline__)) _mm_sll_pi32(__m64 __m, __m64 __count)
+static inline __m64 __attribute__((__always_inline__, __nodebug__)) _mm_sll_pi32(__m64 __m, __m64 __count)
{
return (__m64)__builtin_ia32_pslld((__v2si)__m, __count);
}
-static inline __m64 __attribute__((__always_inline__)) _mm_slli_pi32(__m64 __m, int __count)
+static inline __m64 __attribute__((__always_inline__, __nodebug__)) _mm_slli_pi32(__m64 __m, int __count)
{
return (__m64)__builtin_ia32_pslldi((__v2si)__m, __count);
}
-static inline __m64 __attribute__((__always_inline__)) _mm_sll_si64(__m64 __m, __m64 __count)
+static inline __m64 __attribute__((__always_inline__, __nodebug__)) _mm_sll_si64(__m64 __m, __m64 __count)
{
return __builtin_ia32_psllq(__m, __count);
}
-static inline __m64 __attribute__((__always_inline__)) _mm_slli_si64(__m64 __m, int __count)
+static inline __m64 __attribute__((__always_inline__, __nodebug__)) _mm_slli_si64(__m64 __m, int __count)
{
return __builtin_ia32_psllqi(__m, __count);
}
-static inline __m64 __attribute__((__always_inline__)) _mm_sra_pi16(__m64 __m, __m64 __count)
+static inline __m64 __attribute__((__always_inline__, __nodebug__)) _mm_sra_pi16(__m64 __m, __m64 __count)
{
return (__m64)__builtin_ia32_psraw((__v4hi)__m, __count);
}
-static inline __m64 __attribute__((__always_inline__)) _mm_srai_pi16(__m64 __m, int __count)
+static inline __m64 __attribute__((__always_inline__, __nodebug__)) _mm_srai_pi16(__m64 __m, int __count)
{
return (__m64)__builtin_ia32_psrawi((__v4hi)__m, __count);
}
-static inline __m64 __attribute__((__always_inline__)) _mm_sra_pi32(__m64 __m, __m64 __count)
+static inline __m64 __attribute__((__always_inline__, __nodebug__)) _mm_sra_pi32(__m64 __m, __m64 __count)
{
return (__m64)__builtin_ia32_psrad((__v2si)__m, __count);
}
-static inline __m64 __attribute__((__always_inline__)) _mm_srai_pi32(__m64 __m, int __count)
+static inline __m64 __attribute__((__always_inline__, __nodebug__)) _mm_srai_pi32(__m64 __m, int __count)
{
return (__m64)__builtin_ia32_psradi((__v2si)__m, __count);
}
-static inline __m64 __attribute__((__always_inline__)) _mm_srl_pi16(__m64 __m, __m64 __count)
+static inline __m64 __attribute__((__always_inline__, __nodebug__)) _mm_srl_pi16(__m64 __m, __m64 __count)
{
return (__m64)__builtin_ia32_psrlw((__v4hi)__m, __count);
}
-static inline __m64 __attribute__((__always_inline__)) _mm_srli_pi16(__m64 __m, int __count)
+static inline __m64 __attribute__((__always_inline__, __nodebug__)) _mm_srli_pi16(__m64 __m, int __count)
{
return (__m64)__builtin_ia32_psrlwi((__v4hi)__m, __count);
}
-static inline __m64 __attribute__((__always_inline__)) _mm_srl_pi32(__m64 __m, __m64 __count)
+static inline __m64 __attribute__((__always_inline__, __nodebug__)) _mm_srl_pi32(__m64 __m, __m64 __count)
{
return (__m64)__builtin_ia32_psrld((__v2si)__m, __count);
}
-static inline __m64 __attribute__((__always_inline__)) _mm_srli_pi32(__m64 __m, int __count)
+static inline __m64 __attribute__((__always_inline__, __nodebug__)) _mm_srli_pi32(__m64 __m, int __count)
{
return (__m64)__builtin_ia32_psrldi((__v2si)__m, __count);
}
-static inline __m64 __attribute__((__always_inline__)) _mm_srl_si64(__m64 __m, __m64 __count)
+static inline __m64 __attribute__((__always_inline__, __nodebug__)) _mm_srl_si64(__m64 __m, __m64 __count)
{
return (__m64)__builtin_ia32_psrlq(__m, __count);
}
-static inline __m64 __attribute__((__always_inline__)) _mm_srli_si64(__m64 __m, int __count)
+static inline __m64 __attribute__((__always_inline__, __nodebug__)) _mm_srli_si64(__m64 __m, int __count)
{
return __builtin_ia32_psrlqi(__m, __count);
}
-static inline __m64 __attribute__((__always_inline__)) _mm_and_si64(__m64 __m1, __m64 __m2)
+static inline __m64 __attribute__((__always_inline__, __nodebug__)) _mm_and_si64(__m64 __m1, __m64 __m2)
{
return __m1 & __m2;
}
-static inline __m64 __attribute__((__always_inline__)) _mm_andnot_si64(__m64 __m1, __m64 __m2)
+static inline __m64 __attribute__((__always_inline__, __nodebug__)) _mm_andnot_si64(__m64 __m1, __m64 __m2)
{
return ~__m1 & __m2;
}
-static inline __m64 __attribute__((__always_inline__)) _mm_or_si64(__m64 __m1, __m64 __m2)
+static inline __m64 __attribute__((__always_inline__, __nodebug__)) _mm_or_si64(__m64 __m1, __m64 __m2)
{
return __m1 | __m2;
}
-static inline __m64 __attribute__((__always_inline__)) _mm_xor_si64(__m64 __m1, __m64 __m2)
+static inline __m64 __attribute__((__always_inline__, __nodebug__)) _mm_xor_si64(__m64 __m1, __m64 __m2)
{
return __m1 ^ __m2;
}
-static inline __m64 __attribute__((__always_inline__)) _mm_cmpeq_pi8(__m64 __m1, __m64 __m2)
+static inline __m64 __attribute__((__always_inline__, __nodebug__)) _mm_cmpeq_pi8(__m64 __m1, __m64 __m2)
{
return (__m64)__builtin_ia32_pcmpeqb((__v8qi)__m1, (__v8qi)__m2);
}
-static inline __m64 __attribute__((__always_inline__)) _mm_cmpeq_pi16(__m64 __m1, __m64 __m2)
+static inline __m64 __attribute__((__always_inline__, __nodebug__)) _mm_cmpeq_pi16(__m64 __m1, __m64 __m2)
{
return (__m64)__builtin_ia32_pcmpeqw((__v4hi)__m1, (__v4hi)__m2);
}
-static inline __m64 __attribute__((__always_inline__)) _mm_cmpeq_pi32(__m64 __m1, __m64 __m2)
+static inline __m64 __attribute__((__always_inline__, __nodebug__)) _mm_cmpeq_pi32(__m64 __m1, __m64 __m2)
{
return (__m64)__builtin_ia32_pcmpeqd((__v2si)__m1, (__v2si)__m2);
}
-static inline __m64 __attribute__((__always_inline__)) _mm_cmpgt_pi8(__m64 __m1, __m64 __m2)
+static inline __m64 __attribute__((__always_inline__, __nodebug__)) _mm_cmpgt_pi8(__m64 __m1, __m64 __m2)
{
return (__m64)__builtin_ia32_pcmpgtb((__v8qi)__m1, (__v8qi)__m2);
}
-static inline __m64 __attribute__((__always_inline__)) _mm_cmpgt_pi16(__m64 __m1, __m64 __m2)
+static inline __m64 __attribute__((__always_inline__, __nodebug__)) _mm_cmpgt_pi16(__m64 __m1, __m64 __m2)
{
return (__m64)__builtin_ia32_pcmpgtw((__v4hi)__m1, (__v4hi)__m2);
}
-static inline __m64 __attribute__((__always_inline__)) _mm_cmpgt_pi32(__m64 __m1, __m64 __m2)
+static inline __m64 __attribute__((__always_inline__, __nodebug__)) _mm_cmpgt_pi32(__m64 __m1, __m64 __m2)
{
return (__m64)__builtin_ia32_pcmpgtd((__v2si)__m1, (__v2si)__m2);
}
-static inline __m64 __attribute__((__always_inline__)) _mm_setzero_si64(void)
+static inline __m64 __attribute__((__always_inline__, __nodebug__)) _mm_setzero_si64(void)
{
return (__m64){ 0LL };
}
-static inline __m64 __attribute__((__always_inline__)) _mm_set_pi32(int __i1, int __i0)
+static inline __m64 __attribute__((__always_inline__, __nodebug__)) _mm_set_pi32(int __i1, int __i0)
{
return (__m64)(__v2si){ __i0, __i1 };
}
-static inline __m64 __attribute__((__always_inline__)) _mm_set_pi16(short __s3, short __s2, short __s1, short __s0)
+static inline __m64 __attribute__((__always_inline__, __nodebug__)) _mm_set_pi16(short __s3, short __s2, short __s1, short __s0)
{
return (__m64)(__v4hi){ __s0, __s1, __s2, __s3 };
}
-static inline __m64 __attribute__((__always_inline__)) _mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3, char __b2, char __b1, char __b0)
+static inline __m64 __attribute__((__always_inline__, __nodebug__)) _mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3, char __b2, char __b1, char __b0)
{
return (__m64)(__v8qi){ __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7 };
}
-static inline __m64 __attribute__((__always_inline__)) _mm_set1_pi32(int __i)
+static inline __m64 __attribute__((__always_inline__, __nodebug__)) _mm_set1_pi32(int __i)
{
return (__m64)(__v2si){ __i, __i };
}
-static inline __m64 __attribute__((__always_inline__)) _mm_set1_pi16(short __s)
+static inline __m64 __attribute__((__always_inline__, __nodebug__)) _mm_set1_pi16(short __s)
{
return (__m64)(__v4hi){ __s };
}
-static inline __m64 __attribute__((__always_inline__)) _mm_set1_pi8(char __b)
+static inline __m64 __attribute__((__always_inline__, __nodebug__)) _mm_set1_pi8(char __b)
{
return (__m64)(__v8qi){ __b };
}
-static inline __m64 __attribute__((__always_inline__)) _mm_setr_pi32(int __i1, int __i0)
+static inline __m64 __attribute__((__always_inline__, __nodebug__)) _mm_setr_pi32(int __i1, int __i0)
{
return (__m64)(__v2si){ __i1, __i0 };
}
-static inline __m64 __attribute__((__always_inline__)) _mm_setr_pi16(short __s3, short __s2, short __s1, short __s0)
+static inline __m64 __attribute__((__always_inline__, __nodebug__)) _mm_setr_pi16(short __s3, short __s2, short __s1, short __s0)
{
return (__m64)(__v4hi){ __s3, __s2, __s1, __s0 };
}
-static inline __m64 __attribute__((__always_inline__)) _mm_setr_pi8(char __b7, char __b6, char __b5, char __b4, char __b3, char __b2, char __b1, char __b0)
+static inline __m64 __attribute__((__always_inline__, __nodebug__)) _mm_setr_pi8(char __b7, char __b6, char __b5, char __b4, char __b3, char __b2, char __b1, char __b0)
{
return (__m64)(__v8qi){ __b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0 };
}
#include <mm_malloc.h>
-static inline __m128 __attribute__((__always_inline__))
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_add_ss(__m128 a, __m128 b)
{
return __builtin_ia32_addss(a, b);
}
-static inline __m128 __attribute__((__always_inline__))
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_add_ps(__m128 a, __m128 b)
{
return a + b;
}
-static inline __m128 __attribute__((__always_inline__))
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_sub_ss(__m128 a, __m128 b)
{
return __builtin_ia32_subss(a, b);
}
-static inline __m128 __attribute__((__always_inline__))
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_sub_ps(__m128 a, __m128 b)
{
return a - b;
}
-static inline __m128 __attribute__((__always_inline__))
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_mul_ss(__m128 a, __m128 b)
{
return __builtin_ia32_mulss(a, b);
}
-static inline __m128 __attribute__((__always_inline__))
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_mul_ps(__m128 a, __m128 b)
{
return a * b;
}
-static inline __m128 __attribute__((__always_inline__))
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_div_ss(__m128 a, __m128 b)
{
return __builtin_ia32_divss(a, b);
}
-static inline __m128 __attribute__((__always_inline__))
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_div_ps(__m128 a, __m128 b)
{
return a / b;
}
-static inline __m128 __attribute__((__always_inline__))
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_sqrt_ss(__m128 a)
{
return __builtin_ia32_sqrtss(a);
}
-static inline __m128 __attribute__((__always_inline__))
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_sqrt_ps(__m128 a)
{
return __builtin_ia32_sqrtps(a);
}
-static inline __m128 __attribute__((__always_inline__))
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_rcp_ss(__m128 a)
{
return __builtin_ia32_rcpss(a);
}
-static inline __m128 __attribute__((__always_inline__))
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_rcp_ps(__m128 a)
{
return __builtin_ia32_rcpps(a);
}
-static inline __m128 __attribute__((__always_inline__))
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_rsqrt_ss(__m128 a)
{
return __builtin_ia32_rsqrtss(a);
}
-static inline __m128 __attribute__((__always_inline__))
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_rsqrt_ps(__m128 a)
{
return __builtin_ia32_rsqrtps(a);
}
-static inline __m128 __attribute__((__always_inline__))
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_min_ss(__m128 a, __m128 b)
{
return __builtin_ia32_minss(a, b);
}
-static inline __m128 __attribute__((__always_inline__))
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_min_ps(__m128 a, __m128 b)
{
return __builtin_ia32_minps(a, b);
}
-static inline __m128 __attribute__((__always_inline__))
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_max_ss(__m128 a, __m128 b)
{
return __builtin_ia32_maxss(a, b);
}
-static inline __m128 __attribute__((__always_inline__))
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_max_ps(__m128 a, __m128 b)
{
return __builtin_ia32_maxps(a, b);
}
-static inline __m128 __attribute__((__always_inline__))
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_and_ps(__m128 a, __m128 b)
{
return __builtin_ia32_andps(a, b);
}
-static inline __m128 __attribute__((__always_inline__))
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_andnot_ps(__m128 a, __m128 b)
{
return __builtin_ia32_andnps(a, b);
}
-static inline __m128 __attribute__((__always_inline__))
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_or_ps(__m128 a, __m128 b)
{
return __builtin_ia32_orps(a, b);
}
-static inline __m128 __attribute__((__always_inline__))
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_xor_ps(__m128 a, __m128 b)
{
return __builtin_ia32_xorps(a, b);
}
-static inline __m128 __attribute__((__always_inline__))
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_ss(__m128 a, __m128 b)
{
return (__m128)__builtin_ia32_cmpeqss(a, b);
}
-static inline __m128 __attribute__((__always_inline__))
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpeq_ps(__m128 a, __m128 b)
{
return (__m128)__builtin_ia32_cmpeqps(a, b);
}
-static inline __m128 __attribute__((__always_inline__))
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_ss(__m128 a, __m128 b)
{
return (__m128)__builtin_ia32_cmpltss(a, b);
}
-static inline __m128 __attribute__((__always_inline__))
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmplt_ps(__m128 a, __m128 b)
{
return (__m128)__builtin_ia32_cmpltps(a, b);
}
-static inline __m128 __attribute__((__always_inline__))
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmple_ss(__m128 a, __m128 b)
{
return (__m128)__builtin_ia32_cmpless(a, b);
}
-static inline __m128 __attribute__((__always_inline__))
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmple_ps(__m128 a, __m128 b)
{
return (__m128)__builtin_ia32_cmpleps(a, b);
}
-static inline __m128 __attribute__((__always_inline__))
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_ss(__m128 a, __m128 b)
{
return (__m128)__builtin_ia32_cmpltss(b, a);
}
-static inline __m128 __attribute__((__always_inline__))
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpgt_ps(__m128 a, __m128 b)
{
return (__m128)__builtin_ia32_cmpltps(b, a);
}
-static inline __m128 __attribute__((__always_inline__))
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpge_ss(__m128 a, __m128 b)
{
return (__m128)__builtin_ia32_cmpless(b, a);
}
-static inline __m128 __attribute__((__always_inline__))
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpge_ps(__m128 a, __m128 b)
{
return (__m128)__builtin_ia32_cmpleps(b, a);
}
-static inline __m128 __attribute__((__always_inline__))
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpneq_ss(__m128 a, __m128 b)
{
return (__m128)__builtin_ia32_cmpneqss(a, b);
}
-static inline __m128 __attribute__((__always_inline__))
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpneq_ps(__m128 a, __m128 b)
{
return (__m128)__builtin_ia32_cmpneqps(a, b);
}
-static inline __m128 __attribute__((__always_inline__))
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnlt_ss(__m128 a, __m128 b)
{
return (__m128)__builtin_ia32_cmpnltss(a, b);
}
-static inline __m128 __attribute__((__always_inline__))
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnlt_ps(__m128 a, __m128 b)
{
return (__m128)__builtin_ia32_cmpnltps(a, b);
}
-static inline __m128 __attribute__((__always_inline__))
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnle_ss(__m128 a, __m128 b)
{
return (__m128)__builtin_ia32_cmpnless(a, b);
}
-static inline __m128 __attribute__((__always_inline__))
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnle_ps(__m128 a, __m128 b)
{
return (__m128)__builtin_ia32_cmpnleps(a, b);
}
-static inline __m128 __attribute__((__always_inline__))
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpngt_ss(__m128 a, __m128 b)
{
return (__m128)__builtin_ia32_cmpnltss(b, a);
}
-static inline __m128 __attribute__((__always_inline__))
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpngt_ps(__m128 a, __m128 b)
{
return (__m128)__builtin_ia32_cmpnltps(b, a);
}
-static inline __m128 __attribute__((__always_inline__))
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnge_ss(__m128 a, __m128 b)
{
return (__m128)__builtin_ia32_cmpnless(b, a);
}
-static inline __m128 __attribute__((__always_inline__))
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpnge_ps(__m128 a, __m128 b)
{
return (__m128)__builtin_ia32_cmpnleps(b, a);
}
-static inline __m128 __attribute__((__always_inline__))
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpord_ss(__m128 a, __m128 b)
{
return (__m128)__builtin_ia32_cmpordss(a, b);
}
-static inline __m128 __attribute__((__always_inline__))
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpord_ps(__m128 a, __m128 b)
{
return (__m128)__builtin_ia32_cmpordps(a, b);
}
-static inline __m128 __attribute__((__always_inline__))
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpunord_ss(__m128 a, __m128 b)
{
return (__m128)__builtin_ia32_cmpunordss(a, b);
}
-static inline __m128 __attribute__((__always_inline__))
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cmpunord_ps(__m128 a, __m128 b)
{
return (__m128)__builtin_ia32_cmpunordps(a, b);
}
-static inline int __attribute__((__always_inline__))
+static inline int __attribute__((__always_inline__, __nodebug__))
_mm_comieq_ss(__m128 a, __m128 b)
{
return __builtin_ia32_comieq(a, b);
}
-static inline int __attribute__((__always_inline__))
+static inline int __attribute__((__always_inline__, __nodebug__))
_mm_comilt_ss(__m128 a, __m128 b)
{
return __builtin_ia32_comilt(a, b);
}
-static inline int __attribute__((__always_inline__))
+static inline int __attribute__((__always_inline__, __nodebug__))
_mm_comile_ss(__m128 a, __m128 b)
{
return __builtin_ia32_comile(a, b);
}
-static inline int __attribute__((__always_inline__))
+static inline int __attribute__((__always_inline__, __nodebug__))
_mm_comigt_ss(__m128 a, __m128 b)
{
return __builtin_ia32_comigt(a, b);
}
-static inline int __attribute__((__always_inline__))
+static inline int __attribute__((__always_inline__, __nodebug__))
_mm_comige_ss(__m128 a, __m128 b)
{
return __builtin_ia32_comige(a, b);
}
-static inline int __attribute__((__always_inline__))
+static inline int __attribute__((__always_inline__, __nodebug__))
_mm_comineq_ss(__m128 a, __m128 b)
{
return __builtin_ia32_comineq(a, b);
}
-static inline int __attribute__((__always_inline__))
+static inline int __attribute__((__always_inline__, __nodebug__))
_mm_ucomieq_ss(__m128 a, __m128 b)
{
return __builtin_ia32_ucomieq(a, b);
}
-static inline int __attribute__((__always_inline__))
+static inline int __attribute__((__always_inline__, __nodebug__))
_mm_ucomilt_ss(__m128 a, __m128 b)
{
return __builtin_ia32_ucomilt(a, b);
}
-static inline int __attribute__((__always_inline__))
+static inline int __attribute__((__always_inline__, __nodebug__))
_mm_ucomile_ss(__m128 a, __m128 b)
{
return __builtin_ia32_ucomile(a, b);
}
-static inline int __attribute__((__always_inline__))
+static inline int __attribute__((__always_inline__, __nodebug__))
_mm_ucomigt_ss(__m128 a, __m128 b)
{
return __builtin_ia32_ucomigt(a, b);
}
-static inline int __attribute__((__always_inline__))
+static inline int __attribute__((__always_inline__, __nodebug__))
_mm_ucomige_ss(__m128 a, __m128 b)
{
return __builtin_ia32_ucomige(a, b);
}
-static inline int __attribute__((__always_inline__))
+static inline int __attribute__((__always_inline__, __nodebug__))
_mm_ucomineq_ss(__m128 a, __m128 b)
{
return __builtin_ia32_ucomineq(a, b);
}
-static inline int __attribute__((__always_inline__))
+static inline int __attribute__((__always_inline__, __nodebug__))
_mm_cvtss_si32(__m128 a)
{
return __builtin_ia32_cvtss2si(a);
}
-static inline long long __attribute__((__always_inline__))
+static inline long long __attribute__((__always_inline__, __nodebug__))
_mm_cvtss_si64(__m128 a)
{
return __builtin_ia32_cvtss2si64(a);
}
-static inline __m64 __attribute__((__always_inline__))
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvtps_pi32(__m128 a)
{
return (__m64)__builtin_ia32_cvtps2pi(a);
}
-static inline int __attribute__((__always_inline__))
+static inline int __attribute__((__always_inline__, __nodebug__))
_mm_cvttss_si32(__m128 a)
{
return __builtin_ia32_cvttss2si(a);
}
-static inline long long __attribute__((__always_inline__))
+static inline long long __attribute__((__always_inline__, __nodebug__))
_mm_cvttss_si64(__m128 a)
{
return __builtin_ia32_cvttss2si64(a);
}
-static inline __m64 __attribute__((__always_inline__))
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvttps_pi32(__m128 a)
{
return (__m64)__builtin_ia32_cvttps2pi(a);
}
-static inline __m128 __attribute__((__always_inline__))
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi32_ss(__m128 a, int b)
{
return __builtin_ia32_cvtsi2ss(a, b);
#ifdef __x86_64__
-static inline __m128 __attribute__((__always_inline__))
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtsi64_ss(__m128 a, long long b)
{
return __builtin_ia32_cvtsi642ss(a, b);
#endif
-static inline __m128 __attribute__((__always_inline__))
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpi32_ps(__m128 a, __m64 b)
{
return __builtin_ia32_cvtpi2ps(a, (__v2si)b);
}
-static inline float __attribute__((__always_inline__))
+static inline float __attribute__((__always_inline__, __nodebug__))
_mm_cvtss_f32(__m128 a)
{
return a[0];
}
-static inline __m128 __attribute__((__always_inline__))
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_loadh_pi(__m128 a, __m64 const *p)
{
return __builtin_ia32_loadhps(a, (__v2si *)p);
}
-static inline __m128 __attribute__((__always_inline__))
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_loadl_pi(__m128 a, __m64 const *p)
{
return __builtin_ia32_loadlps(a, (__v2si *)p);
}
-static inline __m128 __attribute__((__always_inline__))
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_load_ss(float *p)
{
return (__m128){ *p, 0, 0, 0 };
}
-static inline __m128 __attribute__((__always_inline__))
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_load1_ps(float *p)
{
return (__m128){ *p, *p, *p, *p };
}
-static inline __m128 __attribute__((__always_inline__))
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_load_ps(float *p)
{
return *(__m128*)p;
}
-static inline __m128 __attribute__((__always_inline__))
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_loadu_ps(float *p)
{
return __builtin_ia32_loadups(p);
}
-static inline __m128 __attribute__((__always_inline__))
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_loadr_ps(float *p)
{
__m128 a = _mm_load_ps(p);
return __builtin_shufflevector(a, a, 3, 2, 1, 0);
}
-static inline __m128 __attribute__((__always_inline__))
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_set_ss(float w)
{
return (__m128){ w, 0, 0, 0 };
}
-static inline __m128 __attribute__((__always_inline__))
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_set1_ps(float w)
{
return (__m128){ w, w, w, w };
}
// Microsoft specific.
-static inline __m128 __attribute__((__always_inline__))
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_set_ps1(float w)
{
return _mm_set1_ps(w);
}
-static inline __m128 __attribute__((__always_inline__))
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_set_ps(float z, float y, float x, float w)
{
return (__m128){ w, x, y, z };
}
-static inline __m128 __attribute__((__always_inline__))
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_setr_ps(float z, float y, float x, float w)
{
return (__m128){ z, y, x, w };
*p = a[0];
}
-static inline void __attribute__((__always_inline__))
+static inline void __attribute__((__always_inline__, __nodebug__))
_mm_storeu_ps(float *p, __m128 a)
{
__builtin_ia32_storeups(p, a);
}
-static inline void __attribute__((__always_inline__))
+static inline void __attribute__((__always_inline__, __nodebug__))
_mm_store1_ps(float *p, __m128 a)
{
a = __builtin_shufflevector(a, a, 0, 0, 0, 0);
_mm_storeu_ps(p, a);
}
-static inline void __attribute__((__always_inline__))
+static inline void __attribute__((__always_inline__, __nodebug__))
_mm_store_ps(float *p, __m128 a)
{
*(__m128 *)p = a;
}
-static inline void __attribute__((__always_inline__))
+static inline void __attribute__((__always_inline__, __nodebug__))
_mm_storer_ps(float *p, __m128 a)
{
a = __builtin_shufflevector(a, a, 3, 2, 1, 0);
#define _mm_prefetch(a, sel) (__builtin_prefetch((void *)a, 0, sel))
-static inline void __attribute__((__always_inline__))
+static inline void __attribute__((__always_inline__, __nodebug__))
_mm_stream_pi(__m64 *p, __m64 a)
{
__builtin_ia32_movntq(p, a);
}
-static inline void __attribute__((__always_inline__))
+static inline void __attribute__((__always_inline__, __nodebug__))
_mm_stream_ps(float *p, __m128 a)
{
__builtin_ia32_movntps(p, a);
}
-static inline void __attribute__((__always_inline__))
+static inline void __attribute__((__always_inline__, __nodebug__))
_mm_sfence(void)
{
__builtin_ia32_sfence();
}
-static inline int __attribute__((__always_inline__))
+static inline int __attribute__((__always_inline__, __nodebug__))
_mm_extract_pi16(__m64 a, int n)
{
/* FIXME:
the already existing __builtin_shufflevector.
*/
/*
-static inline __m64 __attribute__((__always_inline__))
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
_mm_insert_pi16(__m64 a, int d, int n)
{
return (__m64){ 0LL };
}
*/
-static inline __m64 __attribute__((__always_inline__))
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
_mm_max_pi16(__m64 a, __m64 b)
{
return (__m64)__builtin_ia32_pmaxsw((__v4hi)a, (__v4hi)b);
}
-static inline __m64 __attribute__((__always_inline__))
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
_mm_max_pu8(__m64 a, __m64 b)
{
return (__m64)__builtin_ia32_pmaxub((__v8qi)a, (__v8qi)b);
}
-static inline __m64 __attribute__((__always_inline__))
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
_mm_min_pi16(__m64 a, __m64 b)
{
return (__m64)__builtin_ia32_pminsw((__v4hi)a, (__v4hi)b);
}
-static inline __m64 __attribute__((__always_inline__))
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
_mm_min_pu8(__m64 a, __m64 b)
{
return (__m64)__builtin_ia32_pminub((__v8qi)a, (__v8qi)b);
}
-static inline int __attribute__((__always_inline__))
+static inline int __attribute__((__always_inline__, __nodebug__))
_mm_movemask_pi8(__m64 a)
{
return __builtin_ia32_pmovmskb((__v8qi)a);
}
-static inline __m64 __attribute__((__always_inline__))
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
_mm_mulhi_pu16(__m64 a, __m64 b)
{
return (__m64)__builtin_ia32_pmulhuw((__v4hi)a, (__v4hi)b);
#define _mm_shuffle_pi16(a, n) ((__m64)__builtin_ia32_pshufw((__v4hi)a, n))
-static inline void __attribute__((__always_inline__))
+static inline void __attribute__((__always_inline__, __nodebug__))
_mm_maskmove_si64(__m64 d, __m64 n, char *p)
{
__builtin_ia32_maskmovq((__v8qi)d, (__v8qi)n, p);
}
-static inline __m64 __attribute__((__always_inline__))
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
_mm_avg_pu8(__m64 a, __m64 b)
{
return (__m64)__builtin_ia32_pavgb((__v8qi)a, (__v8qi)b);
}
-static inline __m64 __attribute__((__always_inline__))
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
_mm_avg_pu16(__m64 a, __m64 b)
{
return (__m64)__builtin_ia32_pavgw((__v4hi)a, (__v4hi)b);
}
-static inline __m64 __attribute__((__always_inline___))
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
_mm_sad_pu8(__m64 a, __m64 b)
{
return (__m64)__builtin_ia32_psadbw((__v8qi)a, (__v8qi)b);
}
-static inline unsigned int __attribute__((__always_inline___))
+static inline unsigned int __attribute__((__always_inline__, __nodebug__))
_mm_getcsr(void)
{
return __builtin_ia32_stmxcsr();
}
-static inline void __attribute__((__always_inline__))
+static inline void __attribute__((__always_inline__, __nodebug__))
_mm_setcsr(unsigned int i)
{
__builtin_ia32_ldmxcsr(i);
#define _mm_shuffle_ps(a, b, mask) (__builtin_ia32_shufps(a, b, mask))
-static inline __m128 __attribute__((__always_inline__))
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_unpackhi_ps(__m128 a, __m128 b)
{
return __builtin_shufflevector(a, b, 2, 6, 3, 7);
}
-static inline __m128 __attribute__((__always_inline__))
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_unpacklo_ps(__m128 a, __m128 b)
{
return __builtin_shufflevector(a, b, 0, 4, 1, 5);
}
-static inline __m128 __attribute__((__always_inline__))
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_move_ss(__m128 a, __m128 b)
{
return __builtin_shufflevector(a, b, 4, 1, 2, 3);
}
-static inline __m128 __attribute__((__always_inline__))
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_movehl_ps(__m128 a, __m128 b)
{
return __builtin_shufflevector(a, b, 6, 7, 2, 3);
}
-static inline __m128 __attribute__((__always_inline__))
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_movelh_ps(__m128 a, __m128 b)
{
return __builtin_shufflevector(a, b, 0, 1, 4, 5);
}
-static inline __m128 __attribute__((__always_inline__))
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpi16_ps(__m64 a)
{
__m64 b, c;
return r;
}
-static inline __m128 __attribute__((__always_inline__))
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpu16_ps(__m64 a)
{
__m64 b, c;
return r;
}
-static inline __m128 __attribute__((__always_inline__))
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpi8_ps(__m64 a)
{
__m64 b;
return _mm_cvtpi16_ps(b);
}
-static inline __m128 __attribute__((__always_inline__))
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpu8_ps(__m64 a)
{
__m64 b;
return _mm_cvtpi16_ps(b);
}
-static inline __m128 __attribute__((__always_inline__))
+static inline __m128 __attribute__((__always_inline__, __nodebug__))
_mm_cvtpi32x2_ps(__m64 a, __m64 b)
{
__m128 c;
return _mm_cvtpi32_ps(c, a);
}
-static inline __m64 __attribute__((__always_inline__))
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvtps_pi16(__m128 a)
{
__m64 b, c;
return _mm_packs_pi16(b, c);
}
-static inline __m64 __attribute__((__always_inline__))
+static inline __m64 __attribute__((__always_inline__, __nodebug__))
_mm_cvtps_pi8(__m128 a)
{
__m64 b, c;
return _mm_packs_pi16(b, c);
}
-static inline int __attribute__((__always_inline__))
+static inline int __attribute__((__always_inline__, __nodebug__))
_mm_movemask_ps(__m128 a)
{
return __builtin_ia32_movmskps(a);