From: Henrik Gramner Date: Wed, 18 Jan 2017 20:46:55 +0000 (+0100) Subject: x86: AVX2 load_deinterleave_chroma_fenc X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=cce50082129d3c92bd41bc0afc5a8c8d93084c9c;p=libx264 x86: AVX2 load_deinterleave_chroma_fenc 20% faster than SSSE3. --- diff --git a/common/common.h b/common/common.h index f26868eb..bce186d5 100644 --- a/common/common.h +++ b/common/common.h @@ -781,7 +781,7 @@ struct x264_t /* space for p_fenc and p_fdec */ #define FENC_STRIDE 16 #define FDEC_STRIDE 32 - ALIGNED_16( pixel fenc_buf[48*FENC_STRIDE] ); + ALIGNED_N( pixel fenc_buf[48*FENC_STRIDE] ); ALIGNED_N( pixel fdec_buf[52*FDEC_STRIDE] ); /* i4x4 and i8x8 backup data, for skipping the encode stage when possible */ diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm index f39645a1..c4aff28c 100644 --- a/common/x86/mc-a2.asm +++ b/common/x86/mc-a2.asm @@ -1250,6 +1250,26 @@ cglobal load_deinterleave_chroma_fdec, 4,4 RET %endmacro ; LOAD_DEINTERLEAVE_CHROMA +%macro LOAD_DEINTERLEAVE_CHROMA_FENC_AVX2 0 +cglobal load_deinterleave_chroma_fenc, 4,5 + vbroadcasti128 m0, [deinterleave_shuf] + lea r4, [r2*3] +.loop: + mova xm1, [r1] + vinserti128 m1, m1, [r1+r2], 1 + mova xm2, [r1+r2*2] + vinserti128 m2, m2, [r1+r4], 1 + pshufb m1, m0 + pshufb m2, m0 + mova [r0+0*FENC_STRIDE], m1 + mova [r0+2*FENC_STRIDE], m2 + lea r1, [r1+r2*4] + add r0, 4*FENC_STRIDE + sub r3d, 4 + jg .loop + RET +%endmacro ; LOAD_DEINTERLEAVE_CHROMA_FENC_AVX2 + %macro PLANE_DEINTERLEAVE_RGB_CORE 9 ; pw, i_dsta, i_dstb, i_dstc, i_src, w, h, tmp1, tmp2 %if mmsize == 32 vbroadcasti128 m3, [deinterleave_rgb_shuf+(%1-3)*16] @@ -1444,6 +1464,7 @@ PLANE_DEINTERLEAVE LOAD_DEINTERLEAVE_CHROMA PLANE_DEINTERLEAVE_RGB INIT_YMM avx2 +LOAD_DEINTERLEAVE_CHROMA_FENC_AVX2 PLANE_DEINTERLEAVE_RGB %endif diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c index d8fbafcd..8f1bca56 100644 --- a/common/x86/mc-c.c +++ b/common/x86/mc-c.c @@ -138,6 +138,7 @@ void x264_store_interleave_chroma_avx ( pixel *dst, intptr_t i_dst, pixel *srcu, void x264_load_deinterleave_chroma_fenc_sse2( pixel *dst, pixel *src, intptr_t i_src, int height ); void x264_load_deinterleave_chroma_fenc_ssse3( uint8_t *dst, uint8_t *src, intptr_t i_src, int height ); void x264_load_deinterleave_chroma_fenc_avx( uint16_t *dst, uint16_t *src, intptr_t i_src, int height ); +void x264_load_deinterleave_chroma_fenc_avx2( uint8_t *dst, uint8_t *src, intptr_t i_src, int height ); void x264_load_deinterleave_chroma_fdec_sse2( pixel *dst, pixel *src, intptr_t i_src, int height ); void x264_load_deinterleave_chroma_fdec_ssse3( uint8_t *dst, uint8_t *src, intptr_t i_src, int height ); void x264_load_deinterleave_chroma_fdec_avx( uint16_t *dst, uint16_t *src, intptr_t i_src, int height ); @@ -823,6 +824,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf ) pf->integral_init8h = x264_integral_init8h_avx2; pf->integral_init4h = x264_integral_init4h_avx2; pf->frame_init_lowres_core = x264_frame_init_lowres_core_avx2; + pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_avx2; pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_avx2; } #endif // HIGH_BIT_DEPTH diff --git a/encoder/analyse.c b/encoder/analyse.c index ff655521..1941bf28 100644 --- a/encoder/analyse.c +++ b/encoder/analyse.c @@ -2147,7 +2147,7 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a ) } else { - ALIGNED_ARRAY_16( pixel, pixuv, [2],[16*FENC_STRIDE] ); + ALIGNED_ARRAY_N( pixel, pixuv, [2],[16*FENC_STRIDE] ); int chromapix = h->luma2chroma_pixel[PIXEL_16x16]; int v_shift = CHROMA_V_SHIFT; diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c index 3c5357a5..dbccb277 100644 --- a/encoder/ratecontrol.c +++ b/encoder/ratecontrol.c @@ -243,7 +243,7 @@ static ALWAYS_INLINE uint32_t ac_energy_plane( x264_t *h, int mb_x, int mb_y, x2 stride <<= b_field; if( b_chroma ) { - ALIGNED_ARRAY_16( pixel, pix,[FENC_STRIDE*16] ); + ALIGNED_ARRAY_N( pixel, pix,[FENC_STRIDE*16] ); int chromapix = h->luma2chroma_pixel[PIXEL_16x16]; int shift = 7 - CHROMA_V_SHIFT;