From: Henrik Gramner Date: Sun, 8 Oct 2017 19:23:12 +0000 (+0200) Subject: x86: AVX-512 load_deinterleave_chroma_fenc X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=d93851ec282eb069f91a6eddab3284f7766cd5bd;p=libx264 x86: AVX-512 load_deinterleave_chroma_fenc --- diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm index a4e11616..69ed4cd4 100644 --- a/common/x86/mc-a2.asm +++ b/common/x86/mc-a2.asm @@ -1264,17 +1264,27 @@ cglobal load_deinterleave_chroma_fenc, 4,5 vbroadcasti128 m0, [deinterleave_shuf] lea r4, [r2*3] .loop: - mova xm1, [r1] - vinserti128 m1, m1, [r1+r2], 1 - mova xm2, [r1+r2*2] - vinserti128 m2, m2, [r1+r4], 1 + mova xm1, [r1] ; 0 + vinserti128 ym1, [r1+r2], 1 ; 1 +%if mmsize == 64 + mova xm2, [r1+r2*4] ; 4 + vinserti32x4 m1, [r1+r2*2], 2 ; 2 + vinserti32x4 m2, [r1+r4*2], 2 ; 6 + vinserti32x4 m1, [r1+r4], 3 ; 3 + lea r1, [r1+r2*4] + vinserti32x4 m2, [r1+r2], 1 ; 5 + vinserti32x4 m2, [r1+r4], 3 ; 7 +%else + mova xm2, [r1+r2*2] ; 2 + vinserti128 m2, [r1+r4], 1 ; 3 +%endif + lea r1, [r1+r2*4] pshufb m1, m0 pshufb m2, m0 - mova [r0+0*FENC_STRIDE], m1 - mova [r0+2*FENC_STRIDE], m2 - lea r1, [r1+r2*4] - add r0, 4*FENC_STRIDE - sub r3d, 4 + mova [r0], m1 + mova [r0+mmsize], m2 + add r0, 2*mmsize + sub r3d, mmsize/8 jg .loop RET %endmacro ; LOAD_DEINTERLEAVE_CHROMA_FENC_AVX2 @@ -1499,6 +1509,8 @@ PLANE_DEINTERLEAVE_RGB INIT_YMM avx2 LOAD_DEINTERLEAVE_CHROMA_FENC_AVX2 PLANE_DEINTERLEAVE_RGB +INIT_ZMM avx512 +LOAD_DEINTERLEAVE_CHROMA_FENC_AVX2 %endif ; These functions are not general-use; not only do they require aligned input, but memcpy diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c index 51764811..0deb1387 100644 --- a/common/x86/mc-c.c +++ b/common/x86/mc-c.c @@ -245,6 +245,8 @@ void x264_load_deinterleave_chroma_fenc_ssse3( uint8_t *dst, uint8_t *src, intpt void x264_load_deinterleave_chroma_fenc_avx( uint16_t *dst, uint16_t *src, intptr_t i_src, int height ); #define x264_load_deinterleave_chroma_fenc_avx2 x264_template(load_deinterleave_chroma_fenc_avx2) void x264_load_deinterleave_chroma_fenc_avx2( pixel *dst, pixel *src, intptr_t i_src, int height ); +#define x264_load_deinterleave_chroma_fenc_avx512 x264_template(load_deinterleave_chroma_fenc_avx512) +void x264_load_deinterleave_chroma_fenc_avx512( uint8_t *dst, uint8_t *src, intptr_t i_src, int height ); #define x264_load_deinterleave_chroma_fdec_sse2 x264_template(load_deinterleave_chroma_fdec_sse2) void x264_load_deinterleave_chroma_fdec_sse2( pixel *dst, pixel *src, intptr_t i_src, int height ); #define x264_load_deinterleave_chroma_fdec_ssse3 x264_template(load_deinterleave_chroma_fdec_ssse3) @@ -909,6 +911,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf ) { pf->mc_luma = mc_luma_avx2; pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_avx2; + pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_avx2; pf->plane_copy_deinterleave_v210 = x264_plane_copy_deinterleave_v210_avx2; } @@ -1068,6 +1071,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf ) pf->integral_init4h = x264_integral_init4h_avx2; pf->frame_init_lowres_core = x264_frame_init_lowres_core_avx2; pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_avx2; + pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_avx2; } if( cpu&X264_CPU_AVX512 ) @@ -1077,6 +1081,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf ) pf->avg[PIXEL_8x16] = x264_pixel_avg_8x16_avx512; pf->avg[PIXEL_8x8] = x264_pixel_avg_8x8_avx512; pf->avg[PIXEL_8x4] = x264_pixel_avg_8x4_avx512; + pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_avx512; } #endif // HIGH_BIT_DEPTH @@ -1096,7 +1101,6 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf ) pf->plane_copy_swap = plane_copy_swap_avx2; pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_avx2; pf->plane_copy_deinterleave_yuyv = plane_copy_deinterleave_yuyv_avx2; - pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_avx2; pf->get_ref = get_ref_avx2; pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx2; pf->mbtree_propagate_list = mbtree_propagate_list_avx2;