From 12611ec99bb52f4f2c1b114138d867b3a2aa182b Mon Sep 17 00:00:00 2001
From: Henrik Gramner
Date: Sun, 8 Oct 2017 21:41:16 +0200
Subject: [PATCH] x86: AVX-512 load_deinterleave_chroma_fdec

---
 common/x86/mc-a2.asm | 24 ++++++++++++++++++++++++
 common/x86/mc-c.c    |  3 +++
 2 files changed, 27 insertions(+)

diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm
index 69ed4cd4..90f5419e 100644
--- a/common/x86/mc-a2.asm
+++ b/common/x86/mc-a2.asm
@@ -1259,6 +1259,29 @@ cglobal load_deinterleave_chroma_fdec, 4,4
     RET
 %endmacro ; LOAD_DEINTERLEAVE_CHROMA
 
+%macro LOAD_DEINTERLEAVE_CHROMA_FDEC_AVX512 0
+cglobal load_deinterleave_chroma_fdec, 4,5
+    vbroadcasti32x8 m0, [deinterleave_shuf32a]
+    mov r4d, 0x3333ff00
+    kmovd k1, r4d
+    lea r4, [r2*3]
+    kshiftrd k2, k1, 16
+.loop:
+    vbroadcasti128 ym1, [r1]
+    vbroadcasti32x4 m1 {k1}, [r1+r2]
+    vbroadcasti128 ym2, [r1+r2*2]
+    vbroadcasti32x4 m2 {k1}, [r1+r4]
+    lea r1, [r1+r2*4]
+    pshufb m1, m0
+    pshufb m2, m0
+    vmovdqa32 [r0] {k2}, m1
+    vmovdqa32 [r0+mmsize] {k2}, m2
+    add r0, 2*mmsize
+    sub r3d, 4
+    jg .loop
+    RET
+%endmacro
+
 %macro LOAD_DEINTERLEAVE_CHROMA_FENC_AVX2 0
 cglobal load_deinterleave_chroma_fenc, 4,5
     vbroadcasti128 m0, [deinterleave_shuf]
@@ -1510,6 +1533,7 @@ INIT_YMM avx2
 LOAD_DEINTERLEAVE_CHROMA_FENC_AVX2
 PLANE_DEINTERLEAVE_RGB
 INIT_ZMM avx512
+LOAD_DEINTERLEAVE_CHROMA_FDEC_AVX512
 LOAD_DEINTERLEAVE_CHROMA_FENC_AVX2
 %endif
 
diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c
index 0deb1387..bf697cfc 100644
--- a/common/x86/mc-c.c
+++ b/common/x86/mc-c.c
@@ -255,6 +255,8 @@ void x264_load_deinterleave_chroma_fdec_ssse3( uint8_t *dst, uint8_t *src, intpt
 void x264_load_deinterleave_chroma_fdec_avx( uint16_t *dst, uint16_t *src, intptr_t i_src, int height );
 #define x264_load_deinterleave_chroma_fdec_avx2 x264_template(load_deinterleave_chroma_fdec_avx2)
 void x264_load_deinterleave_chroma_fdec_avx2( uint16_t *dst, uint16_t *src, intptr_t i_src, int height );
+#define x264_load_deinterleave_chroma_fdec_avx512 x264_template(load_deinterleave_chroma_fdec_avx512)
+void x264_load_deinterleave_chroma_fdec_avx512( uint8_t *dst, uint8_t *src, intptr_t i_src, int height );
 #define x264_memcpy_aligned_sse x264_template(memcpy_aligned_sse)
 void *x264_memcpy_aligned_sse ( void *dst, const void *src, size_t n );
 #define x264_memcpy_aligned_avx x264_template(memcpy_aligned_avx)
@@ -1081,6 +1083,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
         pf->avg[PIXEL_8x16] = x264_pixel_avg_8x16_avx512;
         pf->avg[PIXEL_8x8] = x264_pixel_avg_8x8_avx512;
         pf->avg[PIXEL_8x4] = x264_pixel_avg_8x4_avx512;
+        pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_avx512;
         pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_avx512;
     }
 #endif // HIGH_BIT_DEPTH
-- 
2.40.0