From: Craig Topper Date: Tue, 20 Dec 2011 09:55:26 +0000 (+0000) Subject: Add AVX2 intrinsics for pavg, pblend, and pcmp instructions. Also remove unneeded... X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=4c07c5dfebd270b2f0660e86f056eeafdb26a4fb;p=clang Add AVX2 intrinsics for pavg, pblend, and pcmp instructions. Also remove unneeded builtins for SSE pcmp. Change SSE pcmpeqq and pcmpgtq to not use builtins and just use vector == and >. git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@146969 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/include/clang/Basic/BuiltinsX86.def b/include/clang/Basic/BuiltinsX86.def index 3dc7271469..e72d86bb7f 100644 --- a/include/clang/Basic/BuiltinsX86.def +++ b/include/clang/Basic/BuiltinsX86.def @@ -210,12 +210,6 @@ BUILTIN(__builtin_ia32_psubusw128, "V8sV8sV8s", "") BUILTIN(__builtin_ia32_pmulhw128, "V8sV8sV8s", "") BUILTIN(__builtin_ia32_pavgb128, "V16cV16cV16c", "") BUILTIN(__builtin_ia32_pavgw128, "V8sV8sV8s", "") -BUILTIN(__builtin_ia32_pcmpeqb128, "V16cV16cV16c", "") -BUILTIN(__builtin_ia32_pcmpeqw128, "V8sV8sV8s", "") -BUILTIN(__builtin_ia32_pcmpeqd128, "V4iV4iV4i", "") -BUILTIN(__builtin_ia32_pcmpgtb128, "V16cV16cV16c", "") -BUILTIN(__builtin_ia32_pcmpgtw128, "V8sV8sV8s", "") -BUILTIN(__builtin_ia32_pcmpgtd128, "V4iV4iV4i", "") BUILTIN(__builtin_ia32_pmaxub128, "V16cV16cV16c", "") BUILTIN(__builtin_ia32_pmaxsw128, "V8sV8sV8s", "") BUILTIN(__builtin_ia32_pminub128, "V16cV16cV16c", "") @@ -353,7 +347,6 @@ BUILTIN(__builtin_ia32_movntdqa, "V2LLiV2LLi*", "") BUILTIN(__builtin_ia32_ptestz128, "iV2LLiV2LLi", "") BUILTIN(__builtin_ia32_ptestc128, "iV2LLiV2LLi", "") BUILTIN(__builtin_ia32_ptestnzc128, "iV2LLiV2LLi", "") -BUILTIN(__builtin_ia32_pcmpeqq, "V2LLiV2LLiV2LLi", "") BUILTIN(__builtin_ia32_mpsadbw128, "V16cV16cV16ci", "") // SSE 4.2 @@ -374,8 +367,6 @@ BUILTIN(__builtin_ia32_pcmpestri128, "iV16ciV16ciIc","") //BUILTIN(__builtin_ia32_pcmpestris128, "iV16ciV16ciIc","") //BUILTIN(__builtin_ia32_pcmpestriz128, "iV16ciV16ciIc","") -BUILTIN(__builtin_ia32_pcmpgtq, "V2LLiV2LLiV2LLi", "") - BUILTIN(__builtin_ia32_crc32qi, "UiUiUc", "") BUILTIN(__builtin_ia32_crc32hi, "UiUiUs", "") BUILTIN(__builtin_ia32_crc32si, "UiUiUi", "") @@ -499,5 +490,9 @@ BUILTIN(__builtin_ia32_paddusw256, "V16sV16sV16s", "") BUILTIN(__builtin_ia32_psubusb256, "V32cV32cV32c", "") BUILTIN(__builtin_ia32_psubusw256, "V16sV16sV16s", "") BUILTIN(__builtin_ia32_palignr256, "V32cV32cV32cIc", "") +BUILTIN(__builtin_ia32_pavgb256, "V32cV32cV32c", "") +BUILTIN(__builtin_ia32_pavgw256, "V16sV16sV16s", "") +BUILTIN(__builtin_ia32_pblendvb256, "V32cV32cV32cV32c", "") +BUILTIN(__builtin_ia32_pblendw256, "V16sV16sV16sIi", "") #undef BUILTIN diff --git a/lib/Headers/avx2intrin.h b/lib/Headers/avx2intrin.h index df0450d7d1..813c602527 100644 --- a/lib/Headers/avx2intrin.h +++ b/lib/Headers/avx2intrin.h @@ -135,6 +135,78 @@ _mm256_andnot_si256(__m256i a, __m256i b) return ~a & b; } +static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) +_mm256_avg_epu8(__m256i a, __m256i b) +{ + return (__m256i)__builtin_ia32_pavgb256((__v32qi)a, (__v32qi)b); +} + +static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) +_mm256_avg_epu16(__m256i a, __m256i b) +{ + return (__m256i)__builtin_ia32_pavgw256((__v16hi)a, (__v16hi)b); +} + +static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) +_mm256_blendv_epi8(__m256i __V1, __m256i __V2, __m256i __M) +{ + return (__m256i)__builtin_ia32_pblendvb256((__v32qi)__V1, (__v32qi)__V2, + (__v32qi)__M); +} + +#define _mm256_blend_epi16(V1, V2, M) __extension__ ({ \ + __m256i __V1 = (V1); \ + __m256i __V2 = (V2); \ + (__m256i)__builtin_ia32_pblendw256((__v16hi)__V1, (__v16hi)__V2, M); }) + +static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) +_mm256_cmpeq_epi8(__m256i a, __m256i b) +{ + return (__m256i)((__v32qi)a == (__v32qi)b); +} + +static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) +_mm256_cmpeq_epi16(__m256i a, __m256i b) +{ + return (__m256i)((__v16hi)a == (__v16hi)b); +} + +static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) +_mm256_cmpeq_epi32(__m256i a, __m256i b) +{ + return (__m256i)((__v8si)a == (__v8si)b); +} + +static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) +_mm256_cmpeq_epi64(__m256i a, __m256i b) +{ + return (__m256i)((__v4di)a == (__v4di)b); +} + +static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) +_mm256_cmpgt_epi8(__m256i a, __m256i b) +{ + return (__m256i)((__v32qi)a > (__v32qi)b); +} + +static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) +_mm256_cmpgt_epi16(__m256i a, __m256i b) +{ + return (__m256i)((__v16hi)a > (__v16hi)b); +} + +static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) +_mm256_cmpgt_epi32(__m256i a, __m256i b) +{ + return (__m256i)((__v8si)a > (__v8si)b); +} + +static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) +_mm256_cmpgt_epi64(__m256i a, __m256i b) +{ + return (__m256i)((__v4di)a > (__v4di)b); +} + static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) _mm256_or_si256(__m256i a, __m256i b) { diff --git a/lib/Headers/smmintrin.h b/lib/Headers/smmintrin.h index a2bbd5da87..2e376ddb46 100644 --- a/lib/Headers/smmintrin.h +++ b/lib/Headers/smmintrin.h @@ -245,7 +245,7 @@ _mm_testnzc_si128(__m128i __M, __m128i __V) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) _mm_cmpeq_epi64(__m128i __V1, __m128i __V2) { - return (__m128i) __builtin_ia32_pcmpeqq((__v2di)__V1, (__v2di)__V2); + return (__m128i)((__v2di)__V1 == (__v2di)__V2); } /* SSE4 Packed Integer Sign-Extension. */ @@ -398,7 +398,7 @@ _mm_packus_epi32(__m128i __V1, __m128i __V2) static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) _mm_cmpgt_epi64(__m128i __V1, __m128i __V2) { - return __builtin_ia32_pcmpgtq((__v2di)__V1, (__v2di)__V2); + return (__m128i)((__v2di)__V1 > (__v2di)__V2); } /* SSE4.2 Accumulate CRC32. */ diff --git a/test/CodeGen/avx2-builtins.c b/test/CodeGen/avx2-builtins.c index 3aa374e448..aa13b3c268 100644 --- a/test/CodeGen/avx2-builtins.c +++ b/test/CodeGen/avx2-builtins.c @@ -155,3 +155,63 @@ __m256 test_mm256_xor_si256(__m256 a, __m256 b) { // CHECK: xor <4 x i64> return _mm256_xor_si256(a, b); } + +__m256 test_mm256_avg_epu8(__m256 a, __m256 b) { + // CHECK: @llvm.x86.avx2.pavg.b + return _mm256_avg_epu8(a, b); +} + +__m256 test_mm256_avg_epu16(__m256 a, __m256 b) { + // CHECK: @llvm.x86.avx2.pavg.w + return _mm256_avg_epu16(a, b); +} + +__m256 test_mm256_blendv_epi8(__m256 a, __m256 b, __m256 m) { + // CHECK: @llvm.x86.avx2.pblendvb + return _mm256_blendv_epi8(a, b, m); +} + +__m256 test_mm256_blend_epi16(__m256 a, __m256 b) { + // CHECK: @llvm.x86.avx2.pblendw(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}, i32 2) + return _mm256_blend_epi16(a, b, 2); +} + +__m256 test_mm256_cmpeq_epi8(__m256 a, __m256 b) { + // CHECK: icmp eq <32 x i8> + return _mm256_cmpeq_epi8(a, b); +} + +__m256 test_mm256_cmpeq_epi16(__m256 a, __m256 b) { + // CHECK: icmp eq <16 x i16> + return _mm256_cmpeq_epi16(a, b); +} + +__m256 test_mm256_cmpeq_epi32(__m256 a, __m256 b) { + // CHECK: icmp eq <8 x i32> + return _mm256_cmpeq_epi32(a, b); +} + +__m256 test_mm256_cmpeq_epi64(__m256 a, __m256 b) { + // CHECK: icmp eq <4 x i64> + return _mm256_cmpeq_epi64(a, b); +} + +__m256 test_mm256_cmpgt_epi8(__m256 a, __m256 b) { + // CHECK: icmp sgt <32 x i8> + return _mm256_cmpgt_epi8(a, b); +} + +__m256 test_mm256_cmpgt_epi16(__m256 a, __m256 b) { + // CHECK: icmp sgt <16 x i16> + return _mm256_cmpgt_epi16(a, b); +} + +__m256 test_mm256_cmpgt_epi32(__m256 a, __m256 b) { + // CHECK: icmp sgt <8 x i32> + return _mm256_cmpgt_epi32(a, b); +} + +__m256 test_mm256_cmpgt_epi64(__m256 a, __m256 b) { + // CHECK: icmp sgt <4 x i64> + return _mm256_cmpgt_epi64(a, b); +} diff --git a/test/CodeGen/builtins-x86.c b/test/CodeGen/builtins-x86.c index 7f028e5932..772ab105db 100644 --- a/test/CodeGen/builtins-x86.c +++ b/test/CodeGen/builtins-x86.c @@ -199,12 +199,6 @@ void f0() { tmp_V8s = __builtin_ia32_pmulhw128(tmp_V8s, tmp_V8s); tmp_V16c = __builtin_ia32_pavgb128(tmp_V16c, tmp_V16c); tmp_V8s = __builtin_ia32_pavgw128(tmp_V8s, tmp_V8s); - tmp_V16c = __builtin_ia32_pcmpeqb128(tmp_V16c, tmp_V16c); - tmp_V8s = __builtin_ia32_pcmpeqw128(tmp_V8s, tmp_V8s); - tmp_V4i = __builtin_ia32_pcmpeqd128(tmp_V4i, tmp_V4i); - tmp_V16c = __builtin_ia32_pcmpgtb128(tmp_V16c, tmp_V16c); - tmp_V8s = __builtin_ia32_pcmpgtw128(tmp_V8s, tmp_V8s); - tmp_V4i = __builtin_ia32_pcmpgtd128(tmp_V4i, tmp_V4i); tmp_V16c = __builtin_ia32_pmaxub128(tmp_V16c, tmp_V16c); tmp_V8s = __builtin_ia32_pmaxsw128(tmp_V8s, tmp_V8s); tmp_V16c = __builtin_ia32_pminub128(tmp_V16c, tmp_V16c);