From: Loren Merritt
Date: Mon, 9 Jan 2006 06:18:39 +0000 (+0000)
Subject: early termination within large SADs. ~1% faster UMH, ~4% faster ESA.
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=6bf39eaa780ef0877b7d6fe8497df9a38d4baa3d;p=libx264

early termination within large SADs. ~1% faster UMH, ~4% faster ESA.


git-svn-id: svn://svn.videolan.org/x264/trunk@397 df754926-b1dd-0310-bc7b-ec298dee348c
---

Review notes. By patch-mail convention, text between the "---" line and the
first "diff --git" is commentary and not part of the change.

NOTE(review): this copy of the patch had its line breaks and indentation
collapsed. The layout below was rebuilt so that every hunk matches its
"@@ -a,b +c,d @@" line counts; leading whitespace and the position of blank
context lines are therefore approximate. Compare against the upstream commit
before applying.

What the change does, as far as the hunks show:

 * common/amd64/pixel-a.asm and common/i386/pixel-a.asm each gain three
   routines, x264_pixel_sad_pde_16x16/16x8/8x16_mmxext, declared with a fifth
   int argument (called "prev_score" in the PDE_CHECK comment). Each routine
   is the SAD_START / SAD_INC_* / SAD_END sequence with one PDE_CHECK placed
   after the first half of the SAD_INC_* invocations.

 * PDE_CHECK copies the running sum from mm0 into a general register,
   compares it with the fifth argument and returns immediately unless the sum
   is less than that argument.
     - amd64: returns with the partial sum still in eax.
     - i386:  uses ebx as scratch; on early exit it pops ebx and returns
              0xffff, otherwise it reloads ebx from [esp+12].
   NOTE(review): SAD_START is not part of this diff. The [esp+24] and
   [esp+12] offsets presumably rely on SAD_START having pushed ebx and loaded
   ebx from [esp+12] -- confirm.
   NOTE(review): the callers added in encoder/me.c only test
   "cost < threshold". Returning 0xffff instead of the partial sum is
   therefore safe only if an early exit implies threshold <= 0xffff, i.e. if
   the partial sum at the check point can never exceed 0xffff -- confirm
   against the SAD_INC_* macros.

 * common/pixel.h adds x264_pixel_cmp_pde_t (the x264_pixel_cmp_t parameter
   list plus one int) and a sad_pde[7] table in x264_pixel_function_t. Its
   comment states that entries may be NULL, in which case sad is used.

 * common/pixel.c assigns sad_pde for PIXEL_16x16, PIXEL_16x8 and PIXEL_8x16
   in x264_pixel_init, next to the existing mmxext sad/ssd assignments.
   NOTE(review): this diff does not show the remaining sad_pde entries being
   set to NULL; encoder/me.c tests the pointer, so confirm the table is
   zero-initialised elsewhere.

 * encoder/me.c adds COST_MV_PDE. It passes
   bcost - p_cost_mvx[mx<<2] - p_cost_mvy[my<<2] as the fifth argument and
   updates bcost/bmx/bmy only when the returned cost is below that same
   value. The expression is evaluated twice per expansion.
   The "hexagon grid" loop and the integral-based loop that precedes #endif
   in the last hunk (presumably the ESA path named in the subject -- confirm)
   are each split into a COST_MV_PDE version and a COST_MV fallback, chosen
   by testing h->pixf.sad_pde[i_pixel] outside the inner loop. In the hexagon
   loop the hex4 table and bounds_check move out of the j loop.

diff --git a/common/amd64/pixel-a.asm b/common/amd64/pixel-a.asm
index a2fa66a4..da5b074a 100644
--- a/common/amd64/pixel-a.asm
+++ b/common/amd64/pixel-a.asm
@@ -262,6 +262,10 @@ cglobal x264_pixel_sad_8x4_mmxext
 cglobal x264_pixel_sad_4x8_mmxext
 cglobal x264_pixel_sad_4x4_mmxext
 
+cglobal x264_pixel_sad_pde_16x16_mmxext
+cglobal x264_pixel_sad_pde_16x8_mmxext
+cglobal x264_pixel_sad_pde_8x16_mmxext
+
 cglobal x264_pixel_ssd_16x16_mmxext
 cglobal x264_pixel_ssd_16x8_mmxext
 cglobal x264_pixel_ssd_8x16_mmxext
@@ -377,6 +381,64 @@ x264_pixel_sad_4x4_mmxext:
 
 
 
+%macro PDE_CHECK 0
+    movd eax, mm0
+    cmp eax, parm5d ; prev_score
+    jl .continue
+    ret
+ALIGN 4
+.continue:
+%endmacro
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+; int __cdecl x264_pixel_sad_pde_16x16_mmxext (uint8_t *, int, uint8_t *, int, int )
+;-----------------------------------------------------------------------------
+x264_pixel_sad_pde_16x16_mmxext:
+    SAD_START
+    SAD_INC_2x16P
+    SAD_INC_2x16P
+    SAD_INC_2x16P
+    SAD_INC_2x16P
+    PDE_CHECK
+    SAD_INC_2x16P
+    SAD_INC_2x16P
+    SAD_INC_2x16P
+    SAD_INC_2x16P
+    SAD_END
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+; int __cdecl x264_pixel_sad_pde_16x8_mmxext (uint8_t *, int, uint8_t *, int, int )
+;-----------------------------------------------------------------------------
+x264_pixel_sad_pde_16x8_mmxext:
+    SAD_START
+    SAD_INC_2x16P
+    SAD_INC_2x16P
+    PDE_CHECK
+    SAD_INC_2x16P
+    SAD_INC_2x16P
+    SAD_END
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+; int __cdecl x264_pixel_sad_pde_8x16_mmxext (uint8_t *, int, uint8_t *, int, int )
+;-----------------------------------------------------------------------------
+x264_pixel_sad_pde_8x16_mmxext:
+    SAD_START
+    SAD_INC_2x8P
+    SAD_INC_2x8P
+    SAD_INC_2x8P
+    SAD_INC_2x8P
+    PDE_CHECK
+    SAD_INC_2x8P
+    SAD_INC_2x8P
+    SAD_INC_2x8P
+    SAD_INC_2x8P
+    SAD_END
+
+
+
 %macro SSD_START 0
     firstpush rbx
     pushreg rbx
diff --git a/common/i386/pixel-a.asm b/common/i386/pixel-a.asm
index 1fabddca..42e3b63c 100644
--- a/common/i386/pixel-a.asm
+++ b/common/i386/pixel-a.asm
@@ -269,6 +269,10 @@ cglobal x264_pixel_sad_8x4_mmxext
 cglobal x264_pixel_sad_4x8_mmxext
 cglobal x264_pixel_sad_4x4_mmxext
 
+cglobal x264_pixel_sad_pde_16x16_mmxext
+cglobal x264_pixel_sad_pde_16x8_mmxext
+cglobal x264_pixel_sad_pde_8x16_mmxext
+
 cglobal x264_pixel_ssd_16x16_mmxext
 cglobal x264_pixel_ssd_16x8_mmxext
 cglobal x264_pixel_ssd_8x16_mmxext
@@ -391,6 +395,66 @@ x264_pixel_sad_4x4_mmxext:
     SAD_END
 
 
+%macro PDE_CHECK 0
+    movd ebx, mm0
+    cmp ebx, [esp+24] ; prev_score
+    jl .continue
+    pop ebx
+    mov eax, 0xffff
+    ret
+ALIGN 4
+.continue:
+    mov ebx, [esp+12]
+%endmacro
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+; int __cdecl x264_pixel_sad_pde_16x16_mmxext (uint8_t *, int, uint8_t *, int, int )
+;-----------------------------------------------------------------------------
+x264_pixel_sad_pde_16x16_mmxext:
+    SAD_START
+    SAD_INC_2x16P
+    SAD_INC_2x16P
+    SAD_INC_2x16P
+    SAD_INC_2x16P
+    PDE_CHECK
+    SAD_INC_2x16P
+    SAD_INC_2x16P
+    SAD_INC_2x16P
+    SAD_INC_2x16P
+    SAD_END
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+; int __cdecl x264_pixel_sad_pde_16x8_mmxext (uint8_t *, int, uint8_t *, int, int )
+;-----------------------------------------------------------------------------
+x264_pixel_sad_pde_16x8_mmxext:
+    SAD_START
+    SAD_INC_2x16P
+    SAD_INC_2x16P
+    PDE_CHECK
+    SAD_INC_2x16P
+    SAD_INC_2x16P
+    SAD_END
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+; int __cdecl x264_pixel_sad_pde_8x16_mmxext (uint8_t *, int, uint8_t *, int, int )
+;-----------------------------------------------------------------------------
+x264_pixel_sad_pde_8x16_mmxext:
+    SAD_START
+    SAD_INC_2x8P
+    SAD_INC_2x8P
+    SAD_INC_2x8P
+    SAD_INC_2x8P
+    PDE_CHECK
+    SAD_INC_2x8P
+    SAD_INC_2x8P
+    SAD_INC_2x8P
+    SAD_INC_2x8P
+    SAD_END
+
+
 
 %macro SSD_START 0
     push ebx
diff --git a/common/i386/pixel.h b/common/i386/pixel.h
index 92f7be49..94778a6c 100644
--- a/common/i386/pixel.h
+++ b/common/i386/pixel.h
@@ -32,6 +32,10 @@ int x264_pixel_sad_8x4_mmxext( uint8_t *, int, uint8_t *, int );
 int x264_pixel_sad_4x8_mmxext( uint8_t *, int, uint8_t *, int );
 int x264_pixel_sad_4x4_mmxext( uint8_t *, int, uint8_t *, int );
 
+int x264_pixel_sad_pde_16x16_mmxext( uint8_t *, int, uint8_t *, int, int );
+int x264_pixel_sad_pde_16x8_mmxext( uint8_t *, int, uint8_t *, int, int );
+int x264_pixel_sad_pde_8x16_mmxext( uint8_t *, int, uint8_t *, int, int );
+
 int x264_pixel_ssd_16x16_mmxext( uint8_t *, int, uint8_t *, int );
 int x264_pixel_ssd_16x8_mmxext( uint8_t *, int, uint8_t *, int );
 int x264_pixel_ssd_8x16_mmxext( uint8_t *, int, uint8_t *, int );
diff --git a/common/pixel.c b/common/pixel.c
index 1e3a015f..58de6486 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -346,6 +346,10 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
         pixf->sad[PIXEL_4x8 ] = x264_pixel_sad_4x8_mmxext;
         pixf->sad[PIXEL_4x4] = x264_pixel_sad_4x4_mmxext;
 
+        pixf->sad_pde[PIXEL_16x16] = x264_pixel_sad_pde_16x16_mmxext;
+        pixf->sad_pde[PIXEL_16x8 ] = x264_pixel_sad_pde_16x8_mmxext;
+        pixf->sad_pde[PIXEL_8x16 ] = x264_pixel_sad_pde_8x16_mmxext;
+
         pixf->ssd[PIXEL_16x16] = x264_pixel_ssd_16x16_mmxext;
         pixf->ssd[PIXEL_16x8] = x264_pixel_ssd_16x8_mmxext;
         pixf->ssd[PIXEL_8x16] = x264_pixel_ssd_8x16_mmxext;
diff --git a/common/pixel.h b/common/pixel.h
index f8012bab..2300bc0c 100644
--- a/common/pixel.h
+++ b/common/pixel.h
@@ -25,6 +25,7 @@
 #define _PIXEL_H 1
 
 typedef int (*x264_pixel_cmp_t) ( uint8_t *, int, uint8_t *, int );
+typedef int (*x264_pixel_cmp_pde_t) ( uint8_t *, int, uint8_t *, int, int );
 
 enum
 {
@@ -66,6 +67,11 @@ typedef struct
     x264_pixel_cmp_t satd[7];
     x264_pixel_cmp_t sa8d[4];
     x264_pixel_cmp_t mbcmp[7]; /* either satd or sad for subpel refine and mode decision */
+
+    /* partial distortion elimination:
+     * terminate early if partial score is worse than a threshold.
+     * may be NULL, in which case just use sad instead. */
+    x264_pixel_cmp_pde_t sad_pde[7];
 } x264_pixel_function_t;
 
 void x264_pixel_init( int cpu, x264_pixel_function_t *pixf );
diff --git a/encoder/me.c b/encoder/me.c
index 66207613..e90da164 100644
--- a/encoder/me.c
+++ b/encoder/me.c
@@ -62,6 +62,19 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite
 #define COST_MV( mx, my ) COST_MV_INT( mx, my, 0, 0 )
 #define COST_MV_DIR( mx, my, d ) COST_MV_INT( mx, my, 1, d )
 
+#define COST_MV_PDE( mx, my ) \
+{ \
+    int cost = h->pixf.sad_pde[i_pixel]( m->p_fenc[0], m->i_stride[0], \
+                   &p_fref[(my)*m->i_stride[0]+(mx)], m->i_stride[0], \
+                   bcost - p_cost_mvx[ (mx)<<2 ] - p_cost_mvy[ (my)<<2 ] ); \
+    if( cost < bcost - p_cost_mvx[ (mx)<<2 ] - p_cost_mvy[ (my)<<2 ] ) \
+    { \
+        bcost = cost + p_cost_mvx[ (mx)<<2 ] + p_cost_mvy[ (my)<<2 ]; \
+        bmx = mx; \
+        bmy = my; \
+    } \
+}
+
 #define DIA1_ITER( mx, my )\
 {\
     omx = mx; omy = my;\
@@ -234,24 +247,40 @@ me_hex2:
                 };
                 COST_MV( omx + square2[i][0], omy + square2[i][1] );
             }
+
             /* hexagon grid */
             omx = bmx; omy = bmy;
             for( i = 1; i <= i_me_range/4; i++ )
             {
-                int bounds_check = 4*i > X264_MIN4( mv_x_max-omx, mv_y_max-omy, omx-mv_x_min, omy-mv_y_min );
-                for( j = 0; j < 16; j++ )
+                static const int hex4[16][2] = {
+                    {-4, 2}, {-4, 1}, {-4, 0}, {-4,-1}, {-4,-2},
+                    { 4,-2}, { 4,-1}, { 4, 0}, { 4, 1}, { 4, 2},
+                    { 2, 3}, { 0, 4}, {-2, 3},
+                    {-2,-3}, { 0,-4}, { 2,-3},
+                };
+                const int bounds_check = 4*i > X264_MIN4( mv_x_max-omx, mv_y_max-omy, omx-mv_x_min, omy-mv_y_min );
+
+                if( h->pixf.sad_pde[i_pixel] )
                 {
-                    static const int hex4[16][2] = {
-                        {-4, 2}, {-4, 1}, {-4, 0}, {-4,-1}, {-4,-2},
-                        { 4,-2}, { 4,-1}, { 4, 0}, { 4, 1}, { 4, 2},
-                        { 2, 3}, { 0, 4}, {-2, 3},
-                        {-2,-3}, { 0,-4}, { 2,-3},
-                    };
-                    int mx = omx + hex4[j][0]*i;
-                    int my = omy + hex4[j][1]*i;
-                    if( !bounds_check || ( mx >= mv_x_min && mx <= mv_x_max
-                                        && my >= mv_y_min && my <= mv_y_max ) )
-                        COST_MV( mx, my );
+                    for( j = 0; j < 16; j++ )
+                    {
+                        int mx = omx + hex4[j][0]*i;
+                        int my = omy + hex4[j][1]*i;
+                        if( !bounds_check || ( mx >= mv_x_min && mx <= mv_x_max
+                                            && my >= mv_y_min && my <= mv_y_max ) )
+                            COST_MV_PDE( mx, my );
+                    }
+                }
+                else
+                {
+                    for( j = 0; j < 16; j++ )
+                    {
+                        int mx = omx + hex4[j][0]*i;
+                        int my = omy + hex4[j][1]*i;
+                        if( !bounds_check || ( mx >= mv_x_min && mx <= mv_x_max
+                                            && my >= mv_y_min && my <= mv_y_max ) )
+                            COST_MV( mx, my );
+                    }
                 }
             }
             goto me_hex2;
@@ -279,15 +308,32 @@ me_hex2:
             const int enc_dc = h->pixf.sad[i_pixel]( m->p_fenc[0], stride, zero, 16 );
             const uint16_t *integral_base = &m->integral[ -1 - 1*stride ];
 
-            for( my = min_y; my <= max_y; my++ )
-                for( mx = min_x; mx <= max_x; mx++ )
-                {
-                    const uint16_t *integral = &integral_base[ mx + my * stride ];
-                    const uint16_t ref_dc = integral[ 0 ] + integral[ dh + dw ]
-                                          - integral[ dw ] - integral[ dh ];
-                    if( abs( ref_dc - enc_dc ) < bcost - p_cost_mvx[ (mx)<<2 ] - p_cost_mvy[ (my)<<2 ] )
-                        COST_MV( mx, my );
-                }
+            if( h->pixf.sad_pde[i_pixel] )
+            {
+                for( my = min_y; my <= max_y; my++ )
+                    for( mx = min_x; mx <= max_x; mx++ )
+                    {
+                        const uint16_t *integral = &integral_base[ mx + my * stride ];
+                        const uint16_t ref_dc = integral[ 0 ] + integral[ dh + dw ]
+                                              - integral[ dw ] - integral[ dh ];
+                        const int bsad = bcost - p_cost_mvx[ (mx)<<2 ] - p_cost_mvy[ (my)<<2 ];
+                        if( abs( ref_dc - enc_dc ) < bsad )
+                            COST_MV_PDE( mx, my );
+                    }
+            }
+            else
+            {
+                for( my = min_y; my <= max_y; my++ )
+                    for( mx = min_x; mx <= max_x; mx++ )
+                    {
+                        const uint16_t *integral = &integral_base[ mx + my * stride ];
+                        const uint16_t ref_dc = integral[ 0 ] + integral[ dh + dw ]
+                                              - integral[ dw ] - integral[ dh ];
+                        const int bsad = bcost - p_cost_mvx[ (mx)<<2 ] - p_cost_mvy[ (my)<<2 ];
+                        if( abs( ref_dc - enc_dc ) < bsad )
+                            COST_MV( mx, my );
+                    }
+            }
 #endif
         }
         break;