BUILTIN(__builtin_ia32_phsubw128, "V8sV8sV8s", "")
BUILTIN(__builtin_ia32_phsubd128, "V4iV4iV4i", "")
BUILTIN(__builtin_ia32_phsubsw128, "V8sV8sV8s", "")
-BUILTIN(__builtin_ia32_pmaddubsw128, "V16cV16cV16c", "")
+BUILTIN(__builtin_ia32_pmaddubsw128, "V8sV16cV16c", "")
BUILTIN(__builtin_ia32_pmulhrsw128, "V8sV8sV8s", "")
BUILTIN(__builtin_ia32_pshufb128, "V16cV16cV16c", "")
BUILTIN(__builtin_ia32_psignb128, "V16cV16cV16c", "")
BUILTIN(__builtin_ia32_psrlqi128, "V2LLiV2LLii", "")
BUILTIN(__builtin_ia32_psrawi128, "V8sV8si", "")
BUILTIN(__builtin_ia32_psradi128, "V4iV4ii", "")
-BUILTIN(__builtin_ia32_pmaddwd128, "V8sV8sV8s", "")
+BUILTIN(__builtin_ia32_pmaddwd128, "V4iV8sV8s", "")
BUILTIN(__builtin_ia32_monitor, "vv*UiUi", "")
BUILTIN(__builtin_ia32_mwait, "vUiUi", "")
BUILTIN(__builtin_ia32_lddqu, "V16ccC*", "")
BUILTIN(__builtin_ia32_phsubw256, "V16sV16sV16s", "")
BUILTIN(__builtin_ia32_phsubd256, "V8iV8iV8i", "")
BUILTIN(__builtin_ia32_phsubsw256, "V16sV16sV16s", "")
+BUILTIN(__builtin_ia32_pmaddubsw256, "V16sV32cV32c", "")
+BUILTIN(__builtin_ia32_pmaddwd256, "V8iV16sV16s", "")
#undef BUILTIN
return (__m256i)__builtin_ia32_phsubsw256((__v16hi)a, (__v16hi)b);
}
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_maddubs_epi16(__m256i a, __m256i b)
+{
+ return (__m256i)__builtin_ia32_pmaddubsw256((__v32qi)a, (__v32qi)b);
+}
+
+static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
+_mm256_madd_epi16(__m256i a, __m256i b)
+{
+ return (__m256i)__builtin_ia32_pmaddwd256((__v16hi)a, (__v16hi)b);
+}
+
static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
_mm256_or_si256(__m256i a, __m256i b)
{
// CHECK: @llvm.x86.avx2.phsub.sw
return _mm256_hsubs_epi16(a, b);
}
+
+__m256 test_mm256_maddubs_epi16(__m256 a, __m256 b) {
+ // CHECK: @llvm.x86.avx2.pmadd.ub.sw
+ return _mm256_maddubs_epi16(a, b);
+}
+
+__m256 test_mm256_madd_epi16(__m256 a, __m256 b) {
+ // CHECK: @llvm.x86.avx2.pmadd.wd
+ return _mm256_madd_epi16(a, b);
+}