From: Henrik Gramner
Date: Wed, 7 Sep 2016 17:26:42 +0000 (+0200)
Subject: x86: Move predict_16x16_dc_left calculations to asm
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=0c36239a4826f6e5a3cb873aca1814e389a46e29;p=libx264

x86: Move predict_16x16_dc_left calculations to asm

1-2 cycles faster and avoids some code duplication to decrease code size.

Also drop the MMX2 implementation in favor of SSE2 to simplify things.
---

diff --git a/common/pixel.c b/common/pixel.c
index bb59152e..3963af74 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -556,6 +556,7 @@ INTRA_MBCMP(satd, 16x16, v, h, dc, ,, _c )
 #if HIGH_BIT_DEPTH
 #define x264_predict_8x8c_v_mmx2 x264_predict_8x8c_v_mmx
 #define x264_predict_8x16c_v_mmx2 x264_predict_8x16c_v_c
+#define x264_predict_16x16_dc_mmx2 x264_predict_16x16_dc_c
 #define x264_predict_8x8c_v_sse2 x264_predict_8x8c_v_sse
 #define x264_predict_8x16c_v_sse2 x264_predict_8x16c_v_sse
 #define x264_predict_16x16_v_sse2 x264_predict_16x16_v_sse
diff --git a/common/x86/predict-a.asm b/common/x86/predict-a.asm
index 16c29eee..e8954e3e 100644
--- a/common/x86/predict-a.asm
+++ b/common/x86/predict-a.asm
@@ -2092,63 +2092,28 @@ PREDICT_16x16_H
 %endif
 
 ;-----------------------------------------------------------------------------
-; void predict_16x16_dc_core( pixel *src, int i_dc_left )
+; void predict_16x16_dc( pixel *src )
 ;-----------------------------------------------------------------------------
-%macro PRED16x16_DC_MMX 2
-%if HIGH_BIT_DEPTH
-    mova m0, [r0 - FDEC_STRIDEB+ 0]
-    paddw m0, [r0 - FDEC_STRIDEB+ 8]
-    paddw m0, [r0 - FDEC_STRIDEB+16]
-    paddw m0, [r0 - FDEC_STRIDEB+24]
-    HADDW m0, m1
-    paddw m0, %1
-    psrlw m0, %2
-    SPLATW m0, m0
-    STORE16 m0, m0, m0, m0
-%else ; !HIGH_BIT_DEPTH
-    pxor m0, m0
-    pxor m1, m1
-    psadbw m0, [r0 - FDEC_STRIDE]
-    psadbw m1, [r0 - FDEC_STRIDE + 8]
-    paddusw m0, m1
-    paddusw m0, %1
-    psrlw m0, %2 ; dc
-    pshufw m0, m0, 0
-    packuswb m0, m0 ; dc in bytes
-    STORE16 m0, m0
-%endif
-%endmacro
-
-INIT_MMX mmx2
-cglobal predict_16x16_dc_core, 1,2
-%if ARCH_X86_64
-    movd m6, r1d
-    PRED16x16_DC_MMX m6, 5
+%if WIN64
+DECLARE_REG_TMP 6 ; Reduces code size due to fewer REX prefixes
 %else
-    PRED16x16_DC_MMX r1m, 5
+DECLARE_REG_TMP 3
 %endif
-    RET
-
-INIT_MMX mmx2
-cglobal predict_16x16_dc_top, 1,2
-    PRED16x16_DC_MMX [pw_8], 4
-    RET
 
-INIT_MMX mmx2
-%if HIGH_BIT_DEPTH
-cglobal predict_16x16_dc_left_core, 1,2
-    movd m0, r1m
-    SPLATW m0, m0
-    STORE16 m0, m0, m0, m0
-    RET
-%else ; !HIGH_BIT_DEPTH
-cglobal predict_16x16_dc_left_core, 1,1
-    movd m0, r1m
-    pshufw m0, m0, 0
-    packuswb m0, m0
-    STORE16 m0, m0
+INIT_XMM
+; Returns the sum of the left pixels in r1d+r2d
+cglobal predict_16x16_dc_left_internal, 0,4
+    movzx r1d, pixel [r0-SIZEOF_PIXEL]
+    movzx r2d, pixel [r0+FDEC_STRIDEB-SIZEOF_PIXEL]
+%assign i 2*FDEC_STRIDEB
+%rep 7
+    movzx t0d, pixel [r0+i-SIZEOF_PIXEL]
+    add r1d, t0d
+    movzx t0d, pixel [r0+i+FDEC_STRIDEB-SIZEOF_PIXEL]
+    add r2d, t0d
+%assign i i+2*FDEC_STRIDEB
+%endrep
     RET
-%endif
 
 %macro PRED16x16_DC 2
 %if HIGH_BIT_DEPTH
@@ -2176,9 +2141,11 @@ cglobal predict_16x16_dc_left_core, 1,1
 %endif
 %endmacro
 
-%macro PREDICT_16x16_DC_CORE 0
-cglobal predict_16x16_dc_core, 2,2,4
-    movd xm3, r1m
+%macro PREDICT_16x16_DC 0
+cglobal predict_16x16_dc, 1,3
+    call predict_16x16_dc_left_internal
+    lea r1d, [r1+r2+16]
+    movd xm3, r1d
     PRED16x16_DC xm3, 5
     RET
 
@@ -2186,8 +2153,11 @@ cglobal predict_16x16_dc_top, 1,2
     PRED16x16_DC [pw_8], 4
     RET
 
-cglobal predict_16x16_dc_left_core, 1,2
-    movd xm0, r1m
+cglobal predict_16x16_dc_left, 1,3
+    call predict_16x16_dc_left_internal
+    lea r1d, [r1+r2+8]
+    shr r1d, 4
+    movd xm0, r1d
     SPLATW m0, xm0
 %if HIGH_BIT_DEPTH && mmsize == 16
     STORE16 m0, m0
@@ -2201,11 +2171,11 @@ cglobal predict_16x16_dc_left_core, 1,2
 %endmacro
 
 INIT_XMM sse2
-PREDICT_16x16_DC_CORE
+PREDICT_16x16_DC
 %if HIGH_BIT_DEPTH
 INIT_YMM avx2
-PREDICT_16x16_DC_CORE
+PREDICT_16x16_DC
 %else
 INIT_XMM avx2
-PREDICT_16x16_DC_CORE
+PREDICT_16x16_DC
 %endif
diff --git a/common/x86/predict-c.c b/common/x86/predict-c.c
index b5a8b45c..38ff39e5 100644
--- a/common/x86/predict-c.c
+++ b/common/x86/predict-c.c
@@ -29,38 +29,6 @@
 #include "predict.h"
 #include "pixel.h"
 
-#define PREDICT_16x16_DC(name)\
-void x264_predict_16x16_dc_##name( pixel *src )\
-{\
-    uint32_t dc = 16;\
-    for( int i = 0; i < 16; i += 2 )\
-    {\
-        dc += src[-1 + i * FDEC_STRIDE];\
-        dc += src[-1 + (i+1) * FDEC_STRIDE];\
-    }\
-    x264_predict_16x16_dc_core_##name( src, dc );\
-}
-
-PREDICT_16x16_DC( mmx2 )
-PREDICT_16x16_DC( sse2 )
-PREDICT_16x16_DC( avx2 )
-
-#define PREDICT_16x16_DC_LEFT(name)\
-static void x264_predict_16x16_dc_left_##name( pixel *src )\
-{\
-    uint32_t dc = 8;\
-    for( int i = 0; i < 16; i += 2 )\
-    {\
-        dc += src[-1 + i * FDEC_STRIDE];\
-        dc += src[-1 + (i+1) * FDEC_STRIDE];\
-    }\
-    x264_predict_16x16_dc_left_core_##name( src, dc>>4 );\
-}
-
-PREDICT_16x16_DC_LEFT( mmx2 )
-PREDICT_16x16_DC_LEFT( sse2 )
-PREDICT_16x16_DC_LEFT( avx2 )
-
 #define PREDICT_P_SUM(j,i)\
     H += i * ( src[j+i - FDEC_STRIDE ] - src[j-i - FDEC_STRIDE ] );\
     V += i * ( src[(j+i)*FDEC_STRIDE -1] - src[(j-i)*FDEC_STRIDE -1] );
@@ -347,9 +315,6 @@ void x264_predict_16x16_init_mmx( int cpu, x264_predict_t pf[7] )
 {
     if( !(cpu&X264_CPU_MMX2) )
         return;
-    pf[I_PRED_16x16_DC] = x264_predict_16x16_dc_mmx2;
-    pf[I_PRED_16x16_DC_TOP] = x264_predict_16x16_dc_top_mmx2;
-    pf[I_PRED_16x16_DC_LEFT] = x264_predict_16x16_dc_left_mmx2;
     pf[I_PRED_16x16_V] = x264_predict_16x16_v_mmx2;
     pf[I_PRED_16x16_H] = x264_predict_16x16_h_mmx2;
 #if HIGH_BIT_DEPTH
diff --git a/common/x86/predict.h b/common/x86/predict.h
index 662cc64c..ba1dd6ba 100644
--- a/common/x86/predict.h
+++ b/common/x86/predict.h
@@ -40,15 +40,10 @@ void x264_predict_16x16_h_mmx2( pixel *src );
 void x264_predict_16x16_h_sse2( uint16_t *src );
 void x264_predict_16x16_h_ssse3( uint8_t *src );
 void x264_predict_16x16_h_avx2( uint16_t *src );
-void x264_predict_16x16_dc_mmx2( pixel *src );
 void x264_predict_16x16_dc_sse2( pixel *src );
-void x264_predict_16x16_dc_core_mmx2( pixel *src, int i_dc_left );
-void x264_predict_16x16_dc_core_sse2( pixel *src, int i_dc_left );
-void x264_predict_16x16_dc_core_avx2( pixel *src, int i_dc_left );
-void x264_predict_16x16_dc_left_core_mmx2( pixel *src, int i_dc_left );
-void x264_predict_16x16_dc_left_core_sse2( pixel *src, int i_dc_left );
-void x264_predict_16x16_dc_left_core_avx2( pixel *src, int i_dc_left );
-void x264_predict_16x16_dc_top_mmx2( pixel *src );
+void x264_predict_16x16_dc_avx2( pixel *src );
+void x264_predict_16x16_dc_left_sse2( pixel *src );
+void x264_predict_16x16_dc_left_avx2( pixel *src );
 void x264_predict_16x16_dc_top_sse2( pixel *src );
 void x264_predict_16x16_dc_top_avx2( pixel *src );
 void x264_predict_16x16_p_core_mmx2( uint8_t *src, int i00, int b, int c );