From bc29c635327d79f6a5372df30477db28635e3846 Mon Sep 17 00:00:00 2001
From: Loren Merritt <pengvado@akuvian.org>
Date: Thu, 11 Dec 2008 19:47:17 +0000
Subject: [PATCH] faster ESA init

reduce memory if using ESA and not p4x4
---
 common/common.h      |  1 +
 common/frame.c       |  2 +-
 common/mc.c          | 71 +++++++++++++++++++++++++-------
 common/mc.h          |  6 +++
 common/x86/mc-a2.asm | 98 ++++++++++++++++++++++++++++++++++++++++++++
 common/x86/mc-c.c    | 16 ++++++++
 encoder/encoder.c    |  3 ++
 tools/checkasm.c     | 66 +++++++++++++++++++----------
 8 files changed, 227 insertions(+), 36 deletions(-)

diff --git a/common/common.h b/common/common.h
index f2a0c54a..1668a630 100644
--- a/common/common.h
+++ b/common/common.h
@@ -338,6 +338,7 @@ struct x264_t
         int i_max_ref1;
         int i_delay;        /* Number of frames buffered for B reordering */
         int b_have_lowres;  /* Whether 1/2 resolution luma planes are being used */
+        int b_have_sub8x8_esa;
     } frames;
 
     /* current frame being encoded */
diff --git a/common/frame.c b/common/frame.c
index 482992d2..021242f1 100644
--- a/common/frame.c
+++ b/common/frame.c
@@ -99,7 +99,7 @@ x264_frame_t *x264_frame_new( x264_t *h )
     if( h->param.analyse.i_me_method >= X264_ME_ESA )
     {
         CHECKED_MALLOC( frame->buffer[3],
-                        2 * frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv) * sizeof(uint16_t) );
+                        frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv) * sizeof(uint16_t) << h->frames.b_have_sub8x8_esa );
         frame->integral = (uint16_t*)frame->buffer[3] + frame->i_stride[0] * i_padv + PADH;
     }
 
diff --git a/common/mc.c b/common/mc.c
index fe37c470..7422ba4e 100644
--- a/common/mc.c
+++ b/common/mc.c
@@ -269,6 +269,42 @@ static void memzero_aligned( void * dst, int n )
     memset( dst, 0, n );
 }
 
+static void integral_init4h( uint16_t *sum, uint8_t *pix, int stride )
+{
+    int x, v = pix[0]+pix[1]+pix[2]+pix[3];
+    for( x=0; x<stride-4; x++ )
+    {
+        sum[x] = v + sum[x-stride];
+        v += pix[x+4] - pix[x];
+    }
+}
+
+static void integral_init8h( uint16_t *sum, uint8_t *pix, int stride )
+{
+    int x, v = pix[0]+pix[1]+pix[2]+pix[3]+pix[4]+pix[5]+pix[6]+pix[7];
+    for( x=0; x<stride-8; x++ )
+    {
+        sum[x] = v + sum[x-stride];
+        v += pix[x+8] - pix[x];
+    }
+}
+
+static void integral_init4v( uint16_t *sum8, uint16_t *sum4, int stride )
+{
+    int x;
+    for( x=0; x<stride-8; x++ )
+        sum4[x] = sum8[x+4*stride] - sum8[x];
+    for( x=0; x<stride-8; x++ )
+        sum8[x] = sum8[x+8*stride] + sum8[x+8*stride+4] - sum8[x] - sum8[x+4];
+}
+
+static void integral_init8v( uint16_t *sum8, int stride )
+{
+    int x;
+    for( x=0; x<stride-8; x++ )
+        sum8[x] = sum8[x+8*stride] - sum8[x];
+}
+
 void x264_frame_init_lowres( x264_t *h, x264_frame_t *frame )
 {
     uint8_t *src = frame->plane[0];
@@ -353,6 +389,11 @@ void x264_mc_init( int cpu, x264_mc_functions_t *pf )
     pf->memzero_aligned = memzero_aligned;
     pf->frame_init_lowres_core = frame_init_lowres_core;
 
+    pf->integral_init4h = integral_init4h;
+    pf->integral_init8h = integral_init8h;
+    pf->integral_init4v = integral_init4v;
+    pf->integral_init8v = integral_init8v;
+
 #ifdef HAVE_MMX
     x264_mc_init_mmx( cpu, pf );
 #endif
@@ -370,7 +411,7 @@ void x264_frame_filter( x264_t *h, x264_frame_t *frame, int mb_y, int b_end )
     int start = (mb_y*16 >> b_interlaced) - 8; // buffer = 4 for deblock + 3 for 6tap, rounded to 8
     int height = ((b_end ? frame->i_lines[0] : mb_y*16) >> b_interlaced) + 8;
     int offs = start*stride - 8; // buffer = 3 for 6tap, aligned to 8 for simd
-    int x, y;
+    int y;
     if( mb_y & b_interlaced )
         return;
 
@@ -401,20 +442,22 @@ void x264_frame_filter( x264_t *h, x264_frame_t *frame, int mb_y, int b_end )
         height += PADV-8;
         for( y = start; y < height; y++ )
         {
-            uint8_t *ref = frame->plane[0] + y * stride - PADH;
-            uint16_t *line = frame->integral + (y+1) * stride - PADH + 1;
-            uint16_t v = line[0] = 0;
-            for( x = 1; x < stride-1; x++ )
-                line[x] = v += ref[x] + line[x-stride] - line[x-stride-1];
-            line -= 8*stride;
-            if( y >= 9-PADV )
+            uint8_t *pix = frame->plane[0] + y * stride - PADH;
+            uint16_t *sum8 = frame->integral + (y+1) * stride - PADH;
+            uint16_t *sum4;
+            if( h->frames.b_have_sub8x8_esa )
+            {
+                h->mc.integral_init4h( sum8, pix, stride );
+                sum8 -= 8*stride;
+                sum4 = sum8 + stride * (frame->i_lines[0] + PADV*2);
+                if( y >= 8-PADV )
+                    h->mc.integral_init4v( sum8, sum4, stride );
+            }
+            else
             {
-                uint16_t *sum4 = line + stride * (frame->i_lines[0] + PADV*2);
-                for( x = 1; x < stride-8; x++, line++, sum4++ )
-                {
-                    sum4[0] = line[4+4*stride] - line[4] - line[4*stride] + line[0];
-                    line[0] += line[8+8*stride] - line[8] - line[8*stride];
-                }
+                h->mc.integral_init8h( sum8, pix, stride );
+                if( y >= 8-PADV )
+                    h->mc.integral_init8v( sum8-8*stride, stride );
             }
         }
     }
diff --git a/common/mc.h b/common/mc.h
index 57c596cf..884d0165 100644
--- a/common/mc.h
+++ b/common/mc.h
@@ -66,6 +66,12 @@ typedef struct
     void *(*memcpy_aligned)( void *dst, const void *src, size_t n );
     void (*memzero_aligned)( void *dst, int n );
 
+    /* successive elimination prefilter */
+    void (*integral_init4h)( uint16_t *sum, uint8_t *pix, int stride );
+    void (*integral_init8h)( uint16_t *sum, uint8_t *pix, int stride );
+    void (*integral_init4v)( uint16_t *sum8, uint16_t *sum4, int stride );
+    void (*integral_init8v)( uint16_t *sum8, int stride );
+
     void (*frame_init_lowres_core)( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
                                     int src_stride, int dst_stride, int width, int height );
 } x264_mc_functions_t;
diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm
index fa3e3bd9..82daf2ce 100644
--- a/common/x86/mc-a2.asm
+++ b/common/x86/mc-a2.asm
@@ -694,6 +694,104 @@
 MEMZERO sse2
 
 
+;-----------------------------------------------------------------------------
+; void x264_integral_init4h_sse4( uint16_t *sum, uint8_t *pix, int stride )
+;-----------------------------------------------------------------------------
+cglobal x264_integral_init4h_sse4, 3,4
+    lea     r3, [r0+r2*2]
+    add     r1, r2
+    neg     r2
+    pxor    m4, m4
+.loop:
+    movdqa  m0, [r1+r2]
+    movdqu  m1, [r1+r2+8]
+    mpsadbw m0, m4, 0
+    mpsadbw m1, m4, 0
+    paddw   m0, [r0+r2*2]
+    paddw   m1, [r0+r2*2+16]
+    movdqa  [r3+r2*2   ], m0
+    movdqa  [r3+r2*2+16], m1
+    add     r2, 16
+    jl .loop
+    REP_RET
+
+cglobal x264_integral_init8h_sse4, 3,4
+    lea     r3, [r0+r2*2]
+    add     r1, r2
+    neg     r2
+    pxor    m4, m4
+.loop:
+    movdqa  m0, [r1+r2]
+    movdqu  m1, [r1+r2+8]
+    movdqa  m2, m0
+    movdqa  m3, m1
+    mpsadbw m0, m4, 0
+    mpsadbw m1, m4, 0
+    mpsadbw m2, m4, 4
+    mpsadbw m3, m4, 4
+    paddw   m0, [r0+r2*2]
+    paddw   m1, [r0+r2*2+16]
+    paddw   m0, m2
+    paddw   m1, m3
+    movdqa  [r3+r2*2   ], m0
+    movdqa  [r3+r2*2+16], m1
+    add     r2, 16
+    jl .loop
+    REP_RET
+
+%macro INTEGRAL_INIT 1
+;-----------------------------------------------------------------------------
+; void x264_integral_init4v_mmx( uint16_t *sum8, uint16_t *sum4, int stride )
+;-----------------------------------------------------------------------------
+cglobal x264_integral_init4v_%1, 3,5
+    shl     r2, 1
+    add     r0, r2
+    add     r1, r2
+    lea     r3, [r0+r2*4]
+    lea     r4, [r0+r2*8]
+    neg     r2
+.loop:
+    movu    m0, [r0+r2+8]
+    mova    m2, [r0+r2]
+    movu    m1, [r4+r2+8]
+    paddw   m0, m2
+    paddw   m1, [r4+r2]
+    mova    m3, [r3+r2]
+    psubw   m1, m0
+    psubw   m3, m2
+    mova    [r0+r2], m1
+    mova    [r1+r2], m3
+    add     r2, mmsize
+    jl .loop
+    REP_RET
+
+;-----------------------------------------------------------------------------
+; void x264_integral_init8v_mmx( uint16_t *sum8, int stride )
+;-----------------------------------------------------------------------------
+cglobal x264_integral_init8v_%1, 3,3
+    shl     r1, 1
+    add     r0, r1
+    lea     r2, [r0+r1*8]
+    neg     r1
+.loop:
+    mova    m0, [r2+r1]
+    mova    m1, [r2+r1+mmsize]
+    psubw   m0, [r0+r1]
+    psubw   m1, [r0+r1+mmsize]
+    mova    [r0+r1], m0
+    mova    [r0+r1+mmsize], m1
+    add     r1, 2*mmsize
+    jl .loop
+    REP_RET
+%endmacro
+
+INIT_MMX
+INTEGRAL_INIT mmx
+INIT_XMM
+INTEGRAL_INIT sse2
+
+
+
 %macro FILT8x4 7
     mova      %3, [r0+%7]
     mova      %4, [r0+r5+%7]
diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c
index 0ec7adef..56ca4c4e 100644
--- a/common/x86/mc-c.c
+++ b/common/x86/mc-c.c
@@ -64,6 +64,12 @@ extern void *x264_memcpy_aligned_mmx( void * dst, const void * src, size_t n );
 extern void *x264_memcpy_aligned_sse2( void * dst, const void * src, size_t n );
 extern void x264_memzero_aligned_mmx( void * dst, int n );
 extern void x264_memzero_aligned_sse2( void * dst, int n );
+extern void x264_integral_init4h_sse4( uint16_t *sum, uint8_t *pix, int stride );
+extern void x264_integral_init8h_sse4( uint16_t *sum, uint8_t *pix, int stride );
+extern void x264_integral_init4v_mmx( uint16_t *sum8, uint16_t *sum4, int stride );
+extern void x264_integral_init4v_sse2( uint16_t *sum8, uint16_t *sum4, int stride );
+extern void x264_integral_init8v_mmx( uint16_t *sum8, int stride );
+extern void x264_integral_init8v_sse2( uint16_t *sum8, int stride );
 #define LOWRES(cpu) \
 extern void x264_frame_init_lowres_core_##cpu( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,\
                                                int src_stride, int dst_stride, int width, int height );
@@ -242,6 +248,8 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
     pf->copy[PIXEL_4x4] = x264_mc_copy_w4_mmx;
     pf->memcpy_aligned = x264_memcpy_aligned_mmx;
     pf->memzero_aligned = x264_memzero_aligned_mmx;
+    pf->integral_init4v = x264_integral_init4v_mmx;
+    pf->integral_init8v = x264_integral_init8v_mmx;
 
     if( !(cpu&X264_CPU_MMXEXT) )
         return;
@@ -286,6 +294,8 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
 
     pf->memcpy_aligned = x264_memcpy_aligned_sse2;
     pf->memzero_aligned = x264_memzero_aligned_sse2;
+    pf->integral_init4v = x264_integral_init4v_sse2;
+    pf->integral_init8v = x264_integral_init8v_sse2;
     pf->hpel_filter = x264_hpel_filter_sse2_amd;
 
     if( cpu&X264_CPU_SSE2_IS_SLOW )
@@ -331,4 +341,10 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
     pf->hpel_filter = x264_hpel_filter_ssse3;
     pf->frame_init_lowres_core = x264_frame_init_lowres_core_ssse3;
     pf->mc_chroma = x264_mc_chroma_ssse3;
+
+    if( !(cpu&X264_CPU_SSE4) )
+        return;
+
+    pf->integral_init4h = x264_integral_init4h_sse4;
+    pf->integral_init8h = x264_integral_init8h_sse4;
 }
diff --git a/encoder/encoder.c b/encoder/encoder.c
index 73d33f4a..0a91134e 100644
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -713,6 +713,7 @@ x264_t *x264_encoder_open ( x264_param_t *param )
           || h->param.i_bframe_adaptive
           || h->param.b_pre_scenecut );
     h->frames.b_have_lowres |= (h->param.rc.b_stat_read && h->param.rc.i_vbv_buffer_size > 0);
+    h->frames.b_have_sub8x8_esa = !!(h->param.analyse.inter & X264_ANALYSE_PSUB8x8);
 
     h->frames.i_last_idr = - h->param.i_keyint_max;
     h->frames.i_input = 0;
@@ -839,6 +840,8 @@ int x264_encoder_reconfig( x264_t *h, x264_param_t *param )
     // can only twiddle these if they were enabled to begin with:
     if( h->param.analyse.i_me_method >= X264_ME_ESA || param->analyse.i_me_method < X264_ME_ESA )
         COPY( analyse.i_me_method );
+    if( h->param.analyse.i_me_method >= X264_ME_ESA && !h->frames.b_have_sub8x8_esa )
+        h->param.analyse.inter &= ~X264_ANALYSE_PSUB8x8;
     if( h->pps->b_transform_8x8_mode )
         COPY( analyse.b_transform_8x8 );
     if( h->frames.i_max_ref1 > 1 )
diff --git a/tools/checkasm.c b/tools/checkasm.c
index e810cdcc..1c173199 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -822,33 +822,57 @@ static int check_mc( int cpu_ref, int cpu_new )
         uint8_t *dstc[4] = { buf3, buf3+1024, buf3+2048, buf3+3072 };
         uint8_t *dsta[4] = { buf4, buf4+1024, buf4+2048, buf3+3072 };
         set_func_name( "lowres_init" );
+        ok = 1; used_asm = 1;
         for( w=40; w<=48; w+=8 )
-            if( mc_a.frame_init_lowres_core != mc_ref.frame_init_lowres_core )
+        {
+            int stride = (w+8)&~15;
+            call_c( mc_c.frame_init_lowres_core, buf1, dstc[0], dstc[1], dstc[2], dstc[3], w*2, stride, w, 16 );
+            call_a( mc_a.frame_init_lowres_core, buf1, dsta[0], dsta[1], dsta[2], dsta[3], w*2, stride, w, 16 );
+            for( i=0; i<16; i++)
             {
-                int stride = (w+8)&~15;
-                used_asm = 1;
-                call_c( mc_c.frame_init_lowres_core, buf1, dstc[0], dstc[1], dstc[2], dstc[3], w*2, stride, w, 16 );
-                call_a( mc_a.frame_init_lowres_core, buf1, dsta[0], dsta[1], dsta[2], dsta[3], w*2, stride, w, 16 );
-                for( i=0; i<16; i++)
-                {
-                    for( j=0; j<4; j++)
-                        if( memcmp( dstc[j]+i*stride, dsta[j]+i*stride, w ) )
-                        {
-                            ok = 0;
-                            fprintf( stderr, "frame_init_lowres differs at plane %d line %d\n", j, i );
-                            for( k=0; k<w; k++ )
-                                printf( "%d ", dstc[j][k+i*stride] );
-                            printf("\n");
-                            for( k=0; k<w; k++ )
-                                printf( "%d ", dsta[j][k+i*stride] );
-                            printf("\n");
-                            break;
-                        }
-                }
+                for( j=0; j<4; j++)
+                    if( memcmp( dstc[j]+i*stride, dsta[j]+i*stride, w ) )
+                    {
+                        ok = 0;
+                        fprintf( stderr, "frame_init_lowres differs at plane %d line %d\n", j, i );
+                        for( k=0; k<w; k++ )
+                            printf( "%d ", dstc[j][k+i*stride] );
+                        printf("\n");
+                        for( k=0; k<w; k++ )
+                            printf( "%d ", dsta[j][k+i*stride] );
+                        printf("\n");
+                        break;
+                    }
             }
+        }
         report( "lowres init :" );
     }
 
+#define INTEGRAL_INIT( name, size, ... )\
+    if( mc_a.name != mc_ref.name )\
+    {\
+        int stride = 80;\
+        set_func_name( #name );\
+        used_asm = 1;\
+        memcpy( buf3, buf1, size*2*stride );\
+        memcpy( buf4, buf1, size*2*stride );\
+        uint16_t *sum = (uint16_t*)buf3;\
+        call_c1( mc_c.name, __VA_ARGS__ );\
+        sum = (uint16_t*)buf4;\
+        call_a1( mc_a.name, __VA_ARGS__ );\
+        if( memcmp( buf3, buf4, (stride-8)*2 )\
+            || (size>9 && memcmp( buf3+18*stride, buf4+18*stride, (stride-8)*2 )))\
+            ok = 0;\
+        call_c2( mc_c.name, __VA_ARGS__ );\
+        call_a2( mc_a.name, __VA_ARGS__ );\
+    }
+    ok = 1; used_asm = 0;
+    INTEGRAL_INIT( integral_init4h, 2, sum+stride, buf2, stride );
+    INTEGRAL_INIT( integral_init8h, 2, sum+stride, buf2, stride );
+    INTEGRAL_INIT( integral_init4v, 14, sum, sum+9*stride, stride );
+    INTEGRAL_INIT( integral_init8v, 9, sum, stride );
+    report( "integral init :" );
+
     return ret;
 }
 
-- 
2.40.0
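
The integral_init*h/*v pairs added above split the old single running-sum loop in
x264_frame_filter into two passes: the horizontal pass keeps a sliding 4- or 8-pixel row sum
and adds it to the row above, and the vertical pass subtracts rows 4 or 8 apart, leaving
plain 4x4 or 8x8 block sums for the ESA/successive-elimination prefilter, which uses them as
a cheap lower bound on SAD.  The 4x4 plane (and the doubled buffer in frame.c) is only needed
when PSUB8x8 partitions are analysed, which is what b_have_sub8x8_esa gates.  Below is a
minimal standalone sketch of the 8x8 path that reuses the scalar functions from this patch on
an unpadded toy plane and cross-checks one block by brute force; the plane size, stride and
main() harness are illustrative assumptions, not x264's PADH/PADV buffer layout.

/* toy_integral8.c: build 8x8 block sums with the patch's scalar passes */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define W 32   /* toy plane width == stride (no padding) */
#define H 24   /* toy plane height */

static void integral_init8h( uint16_t *sum, uint8_t *pix, int stride )
{
    /* v = sum of the 8 pixels starting at x; each output also adds the
     * value one row above, so columns accumulate downwards. */
    int x, v = pix[0]+pix[1]+pix[2]+pix[3]+pix[4]+pix[5]+pix[6]+pix[7];
    for( x=0; x<stride-8; x++ )
    {
        sum[x] = v + sum[x-stride];
        v += pix[x+8] - pix[x];
    }
}

static void integral_init8v( uint16_t *sum8, int stride )
{
    /* differencing rows 8 apart turns the accumulated values into plain 8x8 sums */
    int x;
    for( x=0; x<stride-8; x++ )
        sum8[x] = sum8[x+8*stride] - sum8[x];
}

int main( void )
{
    static uint8_t  pix[H*W];
    static uint16_t sum[(H+1)*W]; /* row 0 stays zero: it is "the row above" row 1 */
    int x, y, bx = 5, by = 3, ref = 0;

    for( y = 0; y < H; y++ )
        for( x = 0; x < W; x++ )
            pix[y*W+x] = (x*7 + y*13) & 63;

    memset( sum, 0, sizeof(sum) );
    /* horizontal pass: sum row y+1 holds 8-wide row sums of pixel rows 0..y, accumulated */
    for( y = 0; y < H; y++ )
        integral_init8h( sum + (y+1)*W, pix + y*W, W );
    /* vertical pass: sum row y becomes the 8x8 block sum whose top-left pixel row is y */
    for( y = 0; y+8 <= H; y++ )
        integral_init8v( sum + y*W, W );

    /* brute-force check of one block against the prefilter value */
    for( y = 0; y < 8; y++ )
        for( x = 0; x < 8; x++ )
            ref += pix[(by+y)*W + bx+x];
    printf( "8x8 sum at (%d,%d): integral=%d  brute force=%d\n", bx, by, sum[by*W+bx], ref );
    return 0;
}

Built with any C99 compiler, the two printed values should match for every block whose
top-left (bx,by) satisfies bx < W-8 and by <= H-8.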
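
The SSE4 kernels rely on MPSADBW with an all-zero second operand: each 4-byte sum of absolute
differences then degenerates into a plain sum of 4 consecutive pixels, and bit 2 of the
immediate (the 0 vs 4 in the asm above) shifts the source window by 4 bytes, so two MPSADBW
results added together give the 8-pixel sums of integral_init8h.  The following intrinsics
sketch shows just that trick on one 16-byte load producing 8 outputs; the test values and the
standalone harness are illustrative, and unlike the asm above it neither processes 16 outputs
per iteration nor adds in the row above.  Compile with -msse4.1 or equivalent.

#include <smmintrin.h> /* SSE4.1: _mm_mpsadbw_epu8 */
#include <stdint.h>
#include <stdio.h>

int main( void )
{
    uint8_t  pix[16];
    uint16_t out4[8], out8[8];
    int j;

    for( j = 0; j < 16; j++ )
        pix[j] = j + 1;

    __m128i src  = _mm_loadu_si128( (const __m128i*)pix );
    __m128i zero = _mm_setzero_si128();

    /* |pix - 0| summed over 4 bytes == pix[j]+pix[j+1]+pix[j+2]+pix[j+3] */
    __m128i sum4 = _mm_mpsadbw_epu8( src, zero, 0 );
    /* immediate 4 offsets the first operand by 4 bytes: pix[j+4]..pix[j+7] */
    __m128i sum8 = _mm_add_epi16( sum4, _mm_mpsadbw_epu8( src, zero, 4 ) );

    _mm_storeu_si128( (__m128i*)out4, sum4 );
    _mm_storeu_si128( (__m128i*)out8, sum8 );
    for( j = 0; j < 8; j++ )
        printf( "x=%d: sum4=%2d sum8=%2d\n", j, out4[j], out8[j] );
    return 0;
}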