From 2d3f7520fc0be56b082353cd044edbaff0e8cd25 Mon Sep 17 00:00:00 2001 From: Yael Tsafrir Date: Tue, 12 Sep 2017 07:46:32 +0000 Subject: [PATCH] [X86] Lower _mm[256|512]_[mask[z]]_avg_epu[8|16] intrinsics to native llvm IR Differential Revision: https://reviews.llvm.org/D37562 git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@313011 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/clang/Basic/BuiltinsX86.def | 6 --- lib/Headers/avx2intrin.h | 12 ++++- lib/Headers/avx512bwintrin.h | 46 ++++++++++--------- lib/Headers/emmintrin.h | 12 ++++- test/CodeGen/avx2-builtins.c | 16 ++++++- test/CodeGen/avx512bw-builtins.c | 54 ++++++++++++++++++++--- test/CodeGen/avx512vlbw-builtins.c | 68 +++++++++++++++++++++++++---- test/CodeGen/builtins-x86.c | 4 -- test/CodeGen/sse2-builtins.c | 16 ++++++- 9 files changed, 178 insertions(+), 56 deletions(-) diff --git a/include/clang/Basic/BuiltinsX86.def b/include/clang/Basic/BuiltinsX86.def index 46d71c3ecb..6bfb180409 100644 --- a/include/clang/Basic/BuiltinsX86.def +++ b/include/clang/Basic/BuiltinsX86.def @@ -266,8 +266,6 @@ TARGET_BUILTIN(__builtin_ia32_paddusw128, "V8sV8sV8s", "", "sse2") TARGET_BUILTIN(__builtin_ia32_psubusb128, "V16cV16cV16c", "", "sse2") TARGET_BUILTIN(__builtin_ia32_psubusw128, "V8sV8sV8s", "", "sse2") TARGET_BUILTIN(__builtin_ia32_pmulhw128, "V8sV8sV8s", "", "sse2") -TARGET_BUILTIN(__builtin_ia32_pavgb128, "V16cV16cV16c", "", "sse2") -TARGET_BUILTIN(__builtin_ia32_pavgw128, "V8sV8sV8s", "", "sse2") TARGET_BUILTIN(__builtin_ia32_pmaxub128, "V16cV16cV16c", "", "sse2") TARGET_BUILTIN(__builtin_ia32_pmaxsw128, "V8sV8sV8s", "", "sse2") TARGET_BUILTIN(__builtin_ia32_pminub128, "V16cV16cV16c", "", "sse2") @@ -522,8 +520,6 @@ TARGET_BUILTIN(__builtin_ia32_paddusw256, "V16sV16sV16s", "", "avx2") TARGET_BUILTIN(__builtin_ia32_psubusb256, "V32cV32cV32c", "", "avx2") TARGET_BUILTIN(__builtin_ia32_psubusw256, "V16sV16sV16s", "", "avx2") TARGET_BUILTIN(__builtin_ia32_palignr256, "V32cV32cV32cIi", "", "avx2") -TARGET_BUILTIN(__builtin_ia32_pavgb256, "V32cV32cV32c", "", "avx2") -TARGET_BUILTIN(__builtin_ia32_pavgw256, "V16sV16sV16s", "", "avx2") TARGET_BUILTIN(__builtin_ia32_pblendvb256, "V32cV32cV32cV32c", "", "avx2") TARGET_BUILTIN(__builtin_ia32_phaddw256, "V16sV16sV16s", "", "avx2") TARGET_BUILTIN(__builtin_ia32_phaddd256, "V8iV8iV8i", "", "avx2") @@ -1075,8 +1071,6 @@ TARGET_BUILTIN(__builtin_ia32_paddsb512_mask, "V64cV64cV64cV64cULLi", "", "avx51 TARGET_BUILTIN(__builtin_ia32_paddsw512_mask, "V32sV32sV32sV32sUi", "", "avx512bw") TARGET_BUILTIN(__builtin_ia32_paddusb512_mask, "V64cV64cV64cV64cULLi", "", "avx512bw") TARGET_BUILTIN(__builtin_ia32_paddusw512_mask, "V32sV32sV32sV32sUi", "", "avx512bw") -TARGET_BUILTIN(__builtin_ia32_pavgb512_mask, "V64cV64cV64cV64cULLi", "", "avx512bw") -TARGET_BUILTIN(__builtin_ia32_pavgw512_mask, "V32sV32sV32sV32sUi", "", "avx512bw") TARGET_BUILTIN(__builtin_ia32_pmaxsb512_mask, "V64cV64cV64cV64cULLi", "", "avx512bw") TARGET_BUILTIN(__builtin_ia32_pmaxsw512_mask, "V32sV32sV32sV32sUi", "", "avx512bw") TARGET_BUILTIN(__builtin_ia32_pmaxub512_mask, "V64cV64cV64cV64cULLi", "", "avx512bw") diff --git a/lib/Headers/avx2intrin.h b/lib/Headers/avx2intrin.h index 576f761b25..caf4ced920 100644 --- a/lib/Headers/avx2intrin.h +++ b/lib/Headers/avx2intrin.h @@ -145,13 +145,21 @@ _mm256_andnot_si256(__m256i __a, __m256i __b) static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_avg_epu8(__m256i __a, __m256i __b) { - return (__m256i)__builtin_ia32_pavgb256((__v32qi)__a, (__v32qi)__b); + typedef unsigned short __v32hu __attribute__((__vector_size__(64))); + return (__m256i)__builtin_convertvector( + ((__builtin_convertvector((__v32qu)__a, __v32hu) + + __builtin_convertvector((__v32qu)__b, __v32hu)) + 1) + >> 1, __v32qu); } static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_avg_epu16(__m256i __a, __m256i __b) { - return (__m256i)__builtin_ia32_pavgw256((__v16hi)__a, (__v16hi)__b); + typedef unsigned int __v16su __attribute__((__vector_size__(64))); + return (__m256i)__builtin_convertvector( + ((__builtin_convertvector((__v16hu)__a, __v16su) + + __builtin_convertvector((__v16hu)__b, __v16su)) + 1) + >> 1, __v16hu); } static __inline__ __m256i __DEFAULT_FN_ATTRS diff --git a/lib/Headers/avx512bwintrin.h b/lib/Headers/avx512bwintrin.h index 41958b7214..fa4fb20432 100644 --- a/lib/Headers/avx512bwintrin.h +++ b/lib/Headers/avx512bwintrin.h @@ -706,57 +706,55 @@ _mm512_maskz_adds_epu16 (__mmask32 __U, __m512i __A, __m512i __B) static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_avg_epu8 (__m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pavgb512_mask ((__v64qi) __A, - (__v64qi) __B, - (__v64qi) _mm512_setzero_qi(), - (__mmask64) -1); + typedef unsigned short __v64hu __attribute__((__vector_size__(128))); + return (__m512i)__builtin_convertvector( + ((__builtin_convertvector((__v64qu) __A, __v64hu) + + __builtin_convertvector((__v64qu) __B, __v64hu)) + 1) + >> 1, __v64qu); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_avg_epu8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pavgb512_mask ((__v64qi) __A, - (__v64qi) __B, - (__v64qi) __W, - (__mmask64) __U); + return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, + (__v64qi)_mm512_avg_epu8(__A, __B), + (__v64qi)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_avg_epu8 (__mmask64 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pavgb512_mask ((__v64qi) __A, - (__v64qi) __B, - (__v64qi) _mm512_setzero_qi(), - (__mmask64) __U); + return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, + (__v64qi)_mm512_avg_epu8(__A, __B), + (__v64qi)_mm512_setzero_qi()); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_avg_epu16 (__m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pavgw512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) _mm512_setzero_hi(), - (__mmask32) -1); + typedef unsigned int __v32su __attribute__((__vector_size__(128))); + return (__m512i)__builtin_convertvector( + ((__builtin_convertvector((__v32hu) __A, __v32su) + + __builtin_convertvector((__v32hu) __B, __v32su)) + 1) + >> 1, __v32hu); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_avg_epu16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pavgw512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) __W, - (__mmask32) __U); + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_avg_epu16(__A, __B), + (__v32hi)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_avg_epu16 (__mmask32 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pavgw512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) _mm512_setzero_hi(), - (__mmask32) __U); + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_avg_epu16(__A, __B), + (__v32hi) _mm512_setzero_hi()); } static __inline__ __m512i __DEFAULT_FN_ATTRS diff --git a/lib/Headers/emmintrin.h b/lib/Headers/emmintrin.h index 709815cbb4..3372508a7f 100644 --- a/lib/Headers/emmintrin.h +++ b/lib/Headers/emmintrin.h @@ -2258,7 +2258,11 @@ _mm_adds_epu16(__m128i __a, __m128i __b) static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu8(__m128i __a, __m128i __b) { - return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b); + typedef unsigned short __v16hu __attribute__ ((__vector_size__ (32))); + return (__m128i)__builtin_convertvector( + ((__builtin_convertvector((__v16qu)__a, __v16hu) + + __builtin_convertvector((__v16qu)__b, __v16hu)) + 1) + >> 1, __v16qu); } /// \brief Computes the rounded avarages of corresponding elements of two @@ -2278,7 +2282,11 @@ _mm_avg_epu8(__m128i __a, __m128i __b) static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu16(__m128i __a, __m128i __b) { - return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b); + typedef unsigned int __v8su __attribute__ ((__vector_size__ (32))); + return (__m128i)__builtin_convertvector( + ((__builtin_convertvector((__v8hu)__a, __v8su) + + __builtin_convertvector((__v8hu)__b, __v8su)) + 1) + >> 1, __v8hu); } /// \brief Multiplies the corresponding elements of two 128-bit signed [8 x i16] diff --git a/test/CodeGen/avx2-builtins.c b/test/CodeGen/avx2-builtins.c index 10f3e715de..38aa79631d 100644 --- a/test/CodeGen/avx2-builtins.c +++ b/test/CodeGen/avx2-builtins.c @@ -99,13 +99,25 @@ __m256i test_mm256_andnot_si256(__m256i a, __m256i b) { __m256i test_mm256_avg_epu8(__m256i a, __m256i b) { // CHECK-LABEL: test_mm256_avg_epu8 - // CHECK: call <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8> %{{.*}}, <32 x i8> %{{.*}}) + // CHECK-NOT: call <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8> %{{.*}}, <32 x i8> %{{.*}}) + // CHECK: zext <32 x i8> %{{.*}} to <32 x i16> + // CHECK: zext <32 x i8> %{{.*}} to <32 x i16> + // CHECK: add <32 x i16> %{{.*}}, %{{.*}} + // CHECK: add <32 x i16> %{{.*}}, + // CHECK: lshr <32 x i16> %{{.*}}, + // CHECK: trunc <32 x i16> %{{.*}} to <32 x i8> return _mm256_avg_epu8(a, b); } __m256i test_mm256_avg_epu16(__m256i a, __m256i b) { // CHECK-LABEL: test_mm256_avg_epu16 - // CHECK: call <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}) + // CHECK-NOT: call <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}) + // CHECK: zext <16 x i16> %{{.*}} to <16 x i32> + // CHECK: zext <16 x i16> %{{.*}} to <16 x i32> + // CHECK: add <16 x i32> %{{.*}}, %{{.*}} + // CHECK: add <16 x i32> %{{.*}}, + // CHECK: lshr <16 x i32> %{{.*}}, + // CHECK: trunc <16 x i32> %{{.*}} to <16 x i16> return _mm256_avg_epu16(a, b); } diff --git a/test/CodeGen/avx512bw-builtins.c b/test/CodeGen/avx512bw-builtins.c index 60d2423e9c..0ea0880e4a 100644 --- a/test/CodeGen/avx512bw-builtins.c +++ b/test/CodeGen/avx512bw-builtins.c @@ -638,32 +638,74 @@ __m512i test_mm512_maskz_adds_epu16(__mmask32 __U, __m512i __A, __m512i __B) { } __m512i test_mm512_avg_epu8(__m512i __A, __m512i __B) { // CHECK-LABEL: @test_mm512_avg_epu8 - // CHECK: @llvm.x86.avx512.mask.pavg.b.512 + // CHECK-NOT: @llvm.x86.avx512.mask.pavg.b.512 + // CHECK: zext <64 x i8> %{{.*}} to <64 x i16> + // CHECK: zext <64 x i8> %{{.*}} to <64 x i16> + // CHECK: add <64 x i16> %{{.*}}, %{{.*}} + // CHECK: add <64 x i16> %{{.*}}, + // CHECK: lshr <64 x i16> %{{.*}}, + // CHECK: trunc <64 x i16> %{{.*}} to <64 x i8> return _mm512_avg_epu8(__A,__B); } __m512i test_mm512_mask_avg_epu8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) { // CHECK-LABEL: @test_mm512_mask_avg_epu8 - // CHECK: @llvm.x86.avx512.mask.pavg.b.512 + // CHECK-NOT: @llvm.x86.avx512.mask.pavg.b.512 + // CHECK: zext <64 x i8> %{{.*}} to <64 x i16> + // CHECK: zext <64 x i8> %{{.*}} to <64 x i16> + // CHECK: add <64 x i16> %{{.*}}, %{{.*}} + // CHECK: add <64 x i16> %{{.*}}, + // CHECK: lshr <64 x i16> %{{.*}}, + // CHECK: trunc <64 x i16> %{{.*}} to <64 x i8> + // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}} return _mm512_mask_avg_epu8(__W,__U,__A,__B); } __m512i test_mm512_maskz_avg_epu8(__mmask64 __U, __m512i __A, __m512i __B) { // CHECK-LABEL: @test_mm512_maskz_avg_epu8 - // CHECK: @llvm.x86.avx512.mask.pavg.b.512 + // CHECK-NOT: @llvm.x86.avx512.mask.pavg.b.512 + // CHECK: zext <64 x i8> %{{.*}} to <64 x i16> + // CHECK: zext <64 x i8> %{{.*}} to <64 x i16> + // CHECK: add <64 x i16> %{{.*}}, %{{.*}} + // CHECK: add <64 x i16> %{{.*}}, + // CHECK: lshr <64 x i16> %{{.*}}, + // CHECK: trunc <64 x i16> %{{.*}} to <64 x i8> + // CHECK: store <64 x i8> zeroinitializer + // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}} return _mm512_maskz_avg_epu8(__U,__A,__B); } __m512i test_mm512_avg_epu16(__m512i __A, __m512i __B) { // CHECK-LABEL: @test_mm512_avg_epu16 - // CHECK: @llvm.x86.avx512.mask.pavg.w.512 + // CHECK-NOT: @llvm.x86.avx512.mask.pavg.w.512 + // CHECK: zext <32 x i16> %{{.*}} to <32 x i32> + // CHECK: zext <32 x i16> %{{.*}} to <32 x i32> + // CHECK: add <32 x i32> %{{.*}}, %{{.*}} + // CHECK: add <32 x i32> %{{.*}}, + // CHECK: lshr <32 x i32> %{{.*}}, + // CHECK: trunc <32 x i32> %{{.*}} to <32 x i16> return _mm512_avg_epu16(__A,__B); } __m512i test_mm512_mask_avg_epu16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { // CHECK-LABEL: @test_mm512_mask_avg_epu16 - // CHECK: @llvm.x86.avx512.mask.pavg.w.512 + // CHECK-NOT: @llvm.x86.avx512.mask.pavg.w.512 + // CHECK: zext <32 x i16> %{{.*}} to <32 x i32> + // CHECK: zext <32 x i16> %{{.*}} to <32 x i32> + // CHECK: add <32 x i32> %{{.*}}, %{{.*}} + // CHECK: add <32 x i32> %{{.*}}, + // CHECK: lshr <32 x i32> %{{.*}}, + // CHECK: trunc <32 x i32> %{{.*}} to <32 x i16> + // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}} return _mm512_mask_avg_epu16(__W,__U,__A,__B); } __m512i test_mm512_maskz_avg_epu16(__mmask32 __U, __m512i __A, __m512i __B) { // CHECK-LABEL: @test_mm512_maskz_avg_epu16 - // CHECK: @llvm.x86.avx512.mask.pavg.w.512 + // CHECK-NOT: @llvm.x86.avx512.mask.pavg.w.512 + // CHECK: zext <32 x i16> %{{.*}} to <32 x i32> + // CHECK: zext <32 x i16> %{{.*}} to <32 x i32> + // CHECK: add <32 x i32> %{{.*}}, %{{.*}} + // CHECK: add <32 x i32> %{{.*}}, + // CHECK: lshr <32 x i32> %{{.*}}, + // CHECK: trunc <32 x i32> %{{.*}} to <32 x i16> + // CHECK: store <32 x i16> zeroinitializer + // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}} return _mm512_maskz_avg_epu16(__U,__A,__B); } __m512i test_mm512_max_epi8(__m512i __A, __m512i __B) { diff --git a/test/CodeGen/avx512vlbw-builtins.c b/test/CodeGen/avx512vlbw-builtins.c index 5a7283608b..3aeaf5d43f 100644 --- a/test/CodeGen/avx512vlbw-builtins.c +++ b/test/CodeGen/avx512vlbw-builtins.c @@ -1155,49 +1155,101 @@ __m256i test_mm256_maskz_adds_epu16(__mmask16 __U, __m256i __A, __m256i __B) { } __m128i test_mm_mask_avg_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { // CHECK-LABEL: @test_mm_mask_avg_epu8 - // CHECK: @llvm.x86.sse2.pavg.b + // CHECK-NOT: @llvm.x86.sse2.pavg.b + // CHECK: zext <16 x i8> %{{.*}} to <16 x i16> + // CHECK: zext <16 x i8> %{{.*}} to <16 x i16> + // CHECK: add <16 x i16> %{{.*}}, %{{.*}} + // CHECK: add <16 x i16> %{{.*}}, + // CHECK: lshr <16 x i16> %{{.*}}, + // CHECK: trunc <16 x i16> %{{.*}} to <16 x i8> // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} return _mm_mask_avg_epu8(__W,__U,__A,__B); } __m128i test_mm_maskz_avg_epu8(__mmask16 __U, __m128i __A, __m128i __B) { // CHECK-LABEL: @test_mm_maskz_avg_epu8 - // CHECK: @llvm.x86.sse2.pavg.b + // CHECK-NOT: @llvm.x86.sse2.pavg.b + // CHECK: zext <16 x i8> %{{.*}} to <16 x i16> + // CHECK: zext <16 x i8> %{{.*}} to <16 x i16> + // CHECK: add <16 x i16> %{{.*}}, %{{.*}} + // CHECK: add <16 x i16> %{{.*}}, + // CHECK: lshr <16 x i16> %{{.*}}, + // CHECK: trunc <16 x i16> %{{.*}} to <16 x i8> + // CHECK: store <2 x i64> zeroinitializer // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} return _mm_maskz_avg_epu8(__U,__A,__B); } __m256i test_mm256_mask_avg_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) { // CHECK-LABEL: @test_mm256_mask_avg_epu8 - // CHECK: @llvm.x86.avx2.pavg.b + // CHECK-NOT: @llvm.x86.avx2.pavg.b + // CHECK: zext <32 x i8> %{{.*}} to <32 x i16> + // CHECK: zext <32 x i8> %{{.*}} to <32 x i16> + // CHECK: add <32 x i16> %{{.*}}, %{{.*}} + // CHECK: add <32 x i16> %{{.*}}, + // CHECK: lshr <32 x i16> %{{.*}}, + // CHECK: trunc <32 x i16> %{{.*}} to <32 x i8> // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} return _mm256_mask_avg_epu8(__W,__U,__A,__B); } __m256i test_mm256_maskz_avg_epu8(__mmask32 __U, __m256i __A, __m256i __B) { // CHECK-LABEL: @test_mm256_maskz_avg_epu8 - // CHECK: @llvm.x86.avx2.pavg.b + // CHECK-NOT: @llvm.x86.avx2.pavg.b + // CHECK: zext <32 x i8> %{{.*}} to <32 x i16> + // CHECK: zext <32 x i8> %{{.*}} to <32 x i16> + // CHECK: add <32 x i16> %{{.*}}, %{{.*}} + // CHECK: add <32 x i16> %{{.*}}, + // CHECK: lshr <32 x i16> %{{.*}}, + // CHECK: trunc <32 x i16> %{{.*}} to <32 x i8> + // CHECK: store <4 x i64> zeroinitializer // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} return _mm256_maskz_avg_epu8(__U,__A,__B); } __m128i test_mm_mask_avg_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { // CHECK-LABEL: @test_mm_mask_avg_epu16 - // CHECK: @llvm.x86.sse2.pavg.w + // CHECK-NOT: @llvm.x86.sse2.pavg.w + // CHECK: zext <8 x i16> %{{.*}} to <8 x i32> + // CHECK: zext <8 x i16> %{{.*}} to <8 x i32> + // CHECK: add <8 x i32> %{{.*}}, %{{.*}} + // CHECK: add <8 x i32> %{{.*}}, + // CHECK: lshr <8 x i32> %{{.*}}, + // CHECK: trunc <8 x i32> %{{.*}} to <8 x i16> // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_mask_avg_epu16(__W,__U,__A,__B); } __m128i test_mm_maskz_avg_epu16(__mmask8 __U, __m128i __A, __m128i __B) { // CHECK-LABEL: @test_mm_maskz_avg_epu16 - // CHECK: @llvm.x86.sse2.pavg.w + // CHECK-NOT: @llvm.x86.sse2.pavg.w + // CHECK: zext <8 x i16> %{{.*}} to <8 x i32> + // CHECK: zext <8 x i16> %{{.*}} to <8 x i32> + // CHECK: add <8 x i32> %{{.*}}, %{{.*}} + // CHECK: add <8 x i32> %{{.*}}, + // CHECK: lshr <8 x i32> %{{.*}}, + // CHECK: trunc <8 x i32> %{{.*}} to <8 x i16> + // CHECK: store <2 x i64> zeroinitializer // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_maskz_avg_epu16(__U,__A,__B); } __m256i test_mm256_mask_avg_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { // CHECK-LABEL: @test_mm256_mask_avg_epu16 - // CHECK: @llvm.x86.avx2.pavg.w + // CHECK-NOT: @llvm.x86.avx2.pavg.w + // CHECK: zext <16 x i16> %{{.*}} to <16 x i32> + // CHECK: zext <16 x i16> %{{.*}} to <16 x i32> + // CHECK: add <16 x i32> %{{.*}}, %{{.*}} + // CHECK: add <16 x i32> %{{.*}}, + // CHECK: lshr <16 x i32> %{{.*}}, + // CHECK: trunc <16 x i32> %{{.*}} to <16 x i16> // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_mask_avg_epu16(__W,__U,__A,__B); } __m256i test_mm256_maskz_avg_epu16(__mmask16 __U, __m256i __A, __m256i __B) { // CHECK-LABEL: @test_mm256_maskz_avg_epu16 - // CHECK: @llvm.x86.avx2.pavg.w + // CHECK-NOT: @llvm.x86.avx2.pavg.w + // CHECK: zext <16 x i16> %{{.*}} to <16 x i32> + // CHECK: zext <16 x i16> %{{.*}} to <16 x i32> + // CHECK: add <16 x i32> %{{.*}}, %{{.*}} + // CHECK: add <16 x i32> %{{.*}}, + // CHECK: lshr <16 x i32> %{{.*}}, + // CHECK: trunc <16 x i32> %{{.*}} to <16 x i16> + // CHECK: store <4 x i64> zeroinitializer // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_maskz_avg_epu16(__U,__A,__B); } diff --git a/test/CodeGen/builtins-x86.c b/test/CodeGen/builtins-x86.c index 0086f7079d..5df0e01c25 100644 --- a/test/CodeGen/builtins-x86.c +++ b/test/CodeGen/builtins-x86.c @@ -160,8 +160,6 @@ void f0() { tmp_V4s = __builtin_ia32_psubusw(tmp_V4s, tmp_V4s); tmp_V4s = __builtin_ia32_pmulhw(tmp_V4s, tmp_V4s); tmp_V4s = __builtin_ia32_pmulhuw(tmp_V4s, tmp_V4s); - tmp_V8c = __builtin_ia32_pavgb(tmp_V8c, tmp_V8c); - tmp_V4s = __builtin_ia32_pavgw(tmp_V4s, tmp_V4s); tmp_V8c = __builtin_ia32_pcmpeqb(tmp_V8c, tmp_V8c); tmp_V4s = __builtin_ia32_pcmpeqw(tmp_V4s, tmp_V4s); tmp_V2i = __builtin_ia32_pcmpeqd(tmp_V2i, tmp_V2i); @@ -201,8 +199,6 @@ void f0() { tmp_V16c = __builtin_ia32_psubusb128(tmp_V16c, tmp_V16c); tmp_V8s = __builtin_ia32_psubusw128(tmp_V8s, tmp_V8s); tmp_V8s = __builtin_ia32_pmulhw128(tmp_V8s, tmp_V8s); - tmp_V16c = __builtin_ia32_pavgb128(tmp_V16c, tmp_V16c); - tmp_V8s = __builtin_ia32_pavgw128(tmp_V8s, tmp_V8s); tmp_V16c = __builtin_ia32_pmaxub128(tmp_V16c, tmp_V16c); tmp_V8s = __builtin_ia32_pmaxsw128(tmp_V8s, tmp_V8s); tmp_V16c = __builtin_ia32_pminub128(tmp_V16c, tmp_V16c); diff --git a/test/CodeGen/sse2-builtins.c b/test/CodeGen/sse2-builtins.c index ca51314d80..c2279cb881 100644 --- a/test/CodeGen/sse2-builtins.c +++ b/test/CodeGen/sse2-builtins.c @@ -97,13 +97,25 @@ __m128i test_mm_andnot_si128(__m128i A, __m128i B) { __m128i test_mm_avg_epu8(__m128i A, __m128i B) { // CHECK-LABEL: test_mm_avg_epu8 - // CHECK: call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) + // CHECK-NOT: call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) + // CHECK: zext <16 x i8> %{{.*}} to <16 x i16> + // CHECK: zext <16 x i8> %{{.*}} to <16 x i16> + // CHECK: add <16 x i16> %{{.*}}, %{{.*}} + // CHECK: add <16 x i16> %{{.*}}, + // CHECK: lshr <16 x i16> %{{.*}}, + // CHECK:trunc <16 x i16> %{{.*}} to <16 x i8> return _mm_avg_epu8(A, B); } __m128i test_mm_avg_epu16(__m128i A, __m128i B) { // CHECK-LABEL: test_mm_avg_epu16 - // CHECK: call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %{{.*}}, <8 x i16> %{{.*}}) + // CHECK-NOT: call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %{{.*}}, <8 x i16> %{{.*}}) + // CHECK: zext <8 x i16> %{{.*}} to <8 x i32> + // CHECK: zext <8 x i16> %{{.*}} to <8 x i32> + // CHECK: add <8 x i32> %{{.*}}, %{{.*}} + // CHECK: add <8 x i32> %{{.*}}, + // CHECK: lshr <8 x i32> %{{.*}}, + // CHECK: trunc <8 x i32> %{{.*}} to <8 x i16> return _mm_avg_epu16(A, B); } -- 2.49.0