From 8d09ebe2e862688ce213d3f098ce7eca719fea23 Mon Sep 17 00:00:00 2001 From: Loren Merritt Date: Sun, 27 Jan 2008 11:36:11 +0000 Subject: [PATCH] satd exhaustive motion search (--me tesa) git-svn-id: svn://svn.videolan.org/x264/trunk@728 df754926-b1dd-0310-bc7b-ec298dee348c --- common/frame.c | 2 +- common/pixel.c | 47 ++++++++++++++++++ common/pixel.h | 8 +++- encoder/analyse.c | 2 +- encoder/encoder.c | 16 +++++-- encoder/me.c | 118 +++++++++++++++++++++++++++++++++++++++------- x264.c | 3 +- x264.h | 3 +- 8 files changed, 172 insertions(+), 27 deletions(-) diff --git a/common/frame.c b/common/frame.c index 570441bf..ce8af34d 100644 --- a/common/frame.c +++ b/common/frame.c @@ -94,7 +94,7 @@ x264_frame_t *x264_frame_new( x264_t *h ) } } - if( h->param.analyse.i_me_method == X264_ME_ESA ) + if( h->param.analyse.i_me_method >= X264_ME_ESA ) { CHECKED_MALLOC( frame->buffer[7], 2 * frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv) * sizeof(uint16_t) ); diff --git a/common/pixel.c b/common/pixel.c index 6bfc3a1c..046e01f3 100644 --- a/common/pixel.c +++ b/common/pixel.c @@ -323,6 +323,45 @@ SAD_X( 8x16_vis ) SAD_X( 8x8_vis ) #endif +/**************************************************************************** + * pixel_satd_x4 + * no faster than single satd, but needed for satd to be a drop-in replacement for sad + ****************************************************************************/ + +#define SATD_X( size, cpu ) \ +static void x264_pixel_satd_x3_##size##cpu( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, int i_stride, int scores[3] )\ +{\ + scores[0] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix0, i_stride );\ + scores[1] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix1, i_stride );\ + scores[2] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix2, i_stride );\ +}\ +static void x264_pixel_satd_x4_##size##cpu( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, uint8_t *pix3, int i_stride, int scores[4] )\ +{\ + scores[0] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix0, i_stride );\ + scores[1] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix1, i_stride );\ + scores[2] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix2, i_stride );\ + scores[3] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix3, i_stride );\ +} +#define SATD_X_DECL5( cpu )\ +SATD_X( 16x16, cpu )\ +SATD_X( 16x8, cpu )\ +SATD_X( 8x16, cpu )\ +SATD_X( 8x8, cpu )\ +SATD_X( 8x4, cpu ) +#define SATD_X_DECL7( cpu )\ +SATD_X_DECL5( cpu )\ +SATD_X( 4x8, cpu )\ +SATD_X( 4x4, cpu ) + +SATD_X_DECL7() +#ifdef HAVE_MMX +SATD_X_DECL7( _mmxext ) +SATD_X_DECL5( _sse2 ) +#ifdef HAVE_SSE3 +SATD_X_DECL5( _ssse3 ) +#endif +#endif + /**************************************************************************** * structural similarity metric ****************************************************************************/ @@ -487,6 +526,8 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) INIT7( sad_x4, ); INIT7( ssd, ); INIT7( satd, ); + INIT7( satd_x3, ); + INIT7( satd_x4, ); INIT4( sa8d, ); INIT_ADS( ); @@ -505,6 +546,8 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) INIT7( sad_x3, _mmxext ); INIT7( sad_x4, _mmxext ); INIT7( satd, _mmxext ); + INIT7( satd_x3, _mmxext ); + INIT7( satd_x4, _mmxext ); INIT_ADS( _mmxext ); #ifdef ARCH_X86 @@ -552,6 +595,8 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) INIT2( sad_x3, _sse2 ); INIT2( sad_x4, _sse2 ); INIT5( satd, _sse2 ); + INIT5( satd_x3, _sse2 ); + INIT5( satd_x4, _sse2 ); INIT_ADS( _sse2 ); #ifdef ARCH_X86 @@ -588,6 +633,8 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) if( cpu&X264_CPU_SSSE3 ) { INIT5( satd, _ssse3 ); + INIT5( satd_x3, _ssse3 ); + INIT5( satd_x4, _ssse3 ); INIT_ADS( _ssse3 ); #ifdef ARCH_X86_64 pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3; diff --git a/common/pixel.h b/common/pixel.h index 1a8d0ac2..fb5f99ec 100644 --- a/common/pixel.h +++ b/common/pixel.h @@ -69,15 +69,19 @@ typedef struct x264_pixel_cmp_t ssim[7]; x264_pixel_cmp_t sa8d[4]; x264_pixel_cmp_t mbcmp[7]; /* either satd or sad for subpel refine and mode decision */ - x264_pixel_cmp_t rdcmp[7]; /* either ssd or ssim for rate-distortion */ + x264_pixel_cmp_t fpelcmp[7]; /* either satd or sad for fullpel motion search */ + x264_pixel_cmp_x3_t fpelcmp_x3[7]; + x264_pixel_cmp_x4_t fpelcmp_x4[7]; void (*ssim_4x4x2_core)( const uint8_t *pix1, int stride1, const uint8_t *pix2, int stride2, int sums[2][4] ); float (*ssim_end4)( int sum0[5][4], int sum1[5][4], int width ); - /* multiple parallel calls to sad. */ + /* multiple parallel calls to cmp. */ x264_pixel_cmp_x3_t sad_x3[7]; x264_pixel_cmp_x4_t sad_x4[7]; + x264_pixel_cmp_x3_t satd_x3[7]; + x264_pixel_cmp_x4_t satd_x4[7]; /* abs-diff-sum for successive elimination. * may round width up to a multiple of 16. */ diff --git a/encoder/analyse.c b/encoder/analyse.c index 62971585..fbf2b92b 100644 --- a/encoder/analyse.c +++ b/encoder/analyse.c @@ -190,7 +190,7 @@ static void x264_mb_analyse_load_costs( x264_t *h, x264_mb_analysis_t *a ) a->p_cost_mv = p_cost_mv[a->i_qp]; /* FIXME is this useful for all me methods? */ - if( h->param.analyse.i_me_method == X264_ME_ESA && !x264_cost_mv_fpel[a->i_qp][0] ) + if( h->param.analyse.i_me_method >= X264_ME_ESA && !x264_cost_mv_fpel[a->i_qp][0] ) { for( j=0; j<4; j++ ) { diff --git a/encoder/encoder.c b/encoder/encoder.c index 1715c68f..b7baf938 100644 --- a/encoder/encoder.c +++ b/encoder/encoder.c @@ -363,7 +363,7 @@ static int x264_validate_parameters( x264_t *h ) if( h->param.b_interlaced ) { - if( h->param.analyse.i_me_method == X264_ME_ESA ) + if( h->param.analyse.i_me_method >= X264_ME_ESA ) { x264_log( h, X264_LOG_WARNING, "interlace + me=esa is not implemented\n" ); h->param.analyse.i_me_method = X264_ME_UMH; @@ -449,12 +449,15 @@ static int x264_validate_parameters( x264_t *h ) h->param.i_cqm_preset = X264_CQM_FLAT; if( h->param.analyse.i_me_method < X264_ME_DIA || - h->param.analyse.i_me_method > X264_ME_ESA ) + h->param.analyse.i_me_method > X264_ME_TESA ) h->param.analyse.i_me_method = X264_ME_HEX; if( h->param.analyse.i_me_range < 4 ) h->param.analyse.i_me_range = 4; if( h->param.analyse.i_me_range > 16 && h->param.analyse.i_me_method <= X264_ME_HEX ) h->param.analyse.i_me_range = 16; + if( h->param.analyse.i_me_method == X264_ME_TESA && + (h->mb.b_lossless || h->param.analyse.i_subpel_refine <= 1) ) + h->param.analyse.i_me_method = X264_ME_ESA; h->param.analyse.i_subpel_refine = x264_clip3( h->param.analyse.i_subpel_refine, 1, 7 ); h->param.analyse.b_bframe_rdo = h->param.analyse.b_bframe_rdo && h->param.analyse.i_subpel_refine >= 6; h->param.analyse.b_mixed_references = h->param.analyse.b_mixed_references && h->param.i_frame_reference > 1; @@ -546,9 +549,12 @@ static int x264_validate_parameters( x264_t *h ) static void mbcmp_init( x264_t *h ) { - memcpy( h->pixf.mbcmp, - ( h->mb.b_lossless || h->param.analyse.i_subpel_refine <= 1 ) ? h->pixf.sad : h->pixf.satd, - sizeof(h->pixf.mbcmp) ); + int satd = !h->mb.b_lossless && h->param.analyse.i_subpel_refine > 1; + memcpy( h->pixf.mbcmp, satd ? h->pixf.satd : h->pixf.sad, sizeof(h->pixf.mbcmp) ); + satd &= h->param.analyse.i_me_method == X264_ME_TESA; + memcpy( h->pixf.fpelcmp, satd ? h->pixf.satd : h->pixf.sad, sizeof(h->pixf.fpelcmp) ); + memcpy( h->pixf.fpelcmp_x3, satd ? h->pixf.satd_x3 : h->pixf.sad_x3, sizeof(h->pixf.fpelcmp_x3) ); + memcpy( h->pixf.fpelcmp_x4, satd ? h->pixf.satd_x4 : h->pixf.sad_x4, sizeof(h->pixf.fpelcmp_x4) ); } /**************************************************************************** diff --git a/encoder/me.c b/encoder/me.c index 9706cd45..61c6db2a 100644 --- a/encoder/me.c +++ b/encoder/me.c @@ -54,7 +54,7 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite #define COST_MV( mx, my )\ {\ - int cost = h->pixf.sad[i_pixel]( m->p_fenc[0], FENC_STRIDE,\ + int cost = h->pixf.fpelcmp[i_pixel]( m->p_fenc[0], FENC_STRIDE,\ &p_fref[(my)*m->i_stride[0]+(mx)], m->i_stride[0] )\ + BITS_MVD(mx,my);\ COPY3_IF_LT( bcost, cost, bmx, mx, bmy, my );\ @@ -64,7 +64,7 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite { \ int stride = 16; \ uint8_t *src = h->mc.get_ref( pix, &stride, m->p_fref, m->i_stride[0], mx, my, bw, bh ); \ - int cost = h->pixf.sad[i_pixel]( m->p_fenc[0], FENC_STRIDE, src, stride ) \ + int cost = h->pixf.fpelcmp[i_pixel]( m->p_fenc[0], FENC_STRIDE, src, stride ) \ + p_cost_mvx[ mx ] + p_cost_mvy[ my ]; \ COPY3_IF_LT( bpred_cost, cost, bpred_mx, mx, bpred_my, my ); \ } @@ -72,7 +72,7 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite #define COST_MV_X3_DIR( m0x, m0y, m1x, m1y, m2x, m2y, costs )\ {\ uint8_t *pix_base = p_fref + bmx + bmy*m->i_stride[0];\ - h->pixf.sad_x3[i_pixel]( m->p_fenc[0],\ + h->pixf.fpelcmp_x3[i_pixel]( m->p_fenc[0],\ pix_base + (m0x) + (m0y)*m->i_stride[0],\ pix_base + (m1x) + (m1y)*m->i_stride[0],\ pix_base + (m2x) + (m2y)*m->i_stride[0],\ @@ -85,7 +85,7 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite #define COST_MV_X4( m0x, m0y, m1x, m1y, m2x, m2y, m3x, m3y )\ {\ uint8_t *pix_base = p_fref + omx + omy*m->i_stride[0];\ - h->pixf.sad_x4[i_pixel]( m->p_fenc[0],\ + h->pixf.fpelcmp_x4[i_pixel]( m->p_fenc[0],\ pix_base + (m0x) + (m0y)*m->i_stride[0],\ pix_base + (m1x) + (m1y)*m->i_stride[0],\ pix_base + (m2x) + (m2y)*m->i_stride[0],\ @@ -103,7 +103,7 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite #define COST_MV_X3_ABS( m0x, m0y, m1x, m1y, m2x, m2y )\ {\ - h->pixf.sad_x3[i_pixel]( m->p_fenc[0],\ + h->pixf.fpelcmp_x3[i_pixel]( m->p_fenc[0],\ p_fref + (m0x) + (m0y)*m->i_stride[0],\ p_fref + (m1x) + (m1y)*m->i_stride[0],\ p_fref + (m2x) + (m2y)*m->i_stride[0],\ @@ -450,6 +450,7 @@ me_hex2: } case X264_ME_ESA: + case X264_ME_TESA: { const int min_x = X264_MAX( bmx - i_me_range, mv_x_min ); const int min_y = X264_MAX( bmy - i_me_range, mv_y_min ); @@ -488,16 +489,101 @@ me_hex2: if( i_pixel == PIXEL_8x16 || i_pixel == PIXEL_4x8 ) enc_dc[1] = enc_dc[2]; - for( my = min_y; my <= max_y; my++ ) + if( h->mb.i_me_method == X264_ME_TESA ) + { + // ADS threshold, then SAD threshold, then keep the best few SADs, then SATD + typedef struct { + int sad; + int16_t mx, my; + } mvsad_t; + mvsad_t *mvsads = x264_malloc( width*(max_y-min_y+1)*sizeof(mvsad_t) ); + int nmvsad = 0, limit; + int sad_thresh = i_me_range <= 16 ? 10 : i_me_range <= 24 ? 11 : 12; + int bsad = h->pixf.sad[i_pixel]( m->p_fenc[0], FENC_STRIDE, p_fref+bmy*stride+bmx, stride ) + + BITS_MVD( bmx, bmy ); + for( my = min_y; my <= max_y; my++ ) + { + int ycost = p_cost_mvy[my<<2]; + bsad -= ycost; + xn = h->pixf.ads[i_pixel]( enc_dc, sums_base + min_x + my * stride, delta, + cost_fpel_mvx+min_x, xs, width, bsad*17/16 ); + for( i=0; ipixf.sad_x3[i_pixel]( m->p_fenc[0], ref+xs[i], ref+xs[i+1], ref+xs[i+2], stride, sads ); + for( j=0; j<3; j++ ) + { + int sad = sads[j] + cost_fpel_mvx[xs[i+j]]; + if( sad < bsad*sad_thresh>>3 ) + { + COPY1_IF_LT( bsad, sad ); + mvsads[nmvsad].sad = sad + ycost; + mvsads[nmvsad].mx = min_x+xs[i+j]; + mvsads[nmvsad].my = my; + nmvsad++; + } + } + } + for( ; ipixf.sad[i_pixel]( m->p_fenc[0], FENC_STRIDE, p_fref+mx+my*stride, stride ) + + cost_fpel_mvx[xs[i]]; + if( sad < bsad*sad_thresh>>3 ) + { + COPY1_IF_LT( bsad, sad ); + mvsads[nmvsad].sad = sad + ycost; + mvsads[nmvsad].mx = mx; + mvsads[nmvsad].my = my; + nmvsad++; + } + } + bsad += ycost; + } + + limit = i_me_range / 2; + if( nmvsad > limit*2 ) + { + // halve the range if the domain is too large... eh, close enough + bsad = bsad*(sad_thresh+8)>>4; + for( i=0; i limit ) + { + for( i=0; i i ) + XCHG( mvsad_t, mvsads[i], mvsads[bj] ); + } + nmvsad = limit; + } + for( i=0; ipixf.ads[i_pixel]( enc_dc, sums_base + min_x + my * stride, delta, - cost_fpel_mvx+min_x, xs, width, bcost ); - for( i=0; ipixf.ads[i_pixel]( enc_dc, sums_base + min_x + my * stride, delta, + cost_fpel_mvx+min_x, xs, width, bcost ); + for( i=0; imc.get_ref( pix[0], &stride, m->p_fref, m->i_stride[0], mx, my, bw, bh ); \ - int cost = h->pixf.sad[i_pixel]( m->p_fenc[0], FENC_STRIDE, src, stride ) \ + int cost = h->pixf.fpelcmp[i_pixel]( m->p_fenc[0], FENC_STRIDE, src, stride ) \ + p_cost_mvx[ mx ] + p_cost_mvy[ my ]; \ COPY3_IF_LT( bcost, cost, bmx, mx, bmy, my ); \ } @@ -623,7 +709,7 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite src2 = h->mc.get_ref( pix[1], &stride, m->p_fref, m->i_stride[0], omx-2, omy, bw+4, bh ); src1 = src0 + stride; src3 = src2 + 1; - h->pixf.sad_x4[i_pixel]( m->p_fenc[0], src0, src1, src2, src3, stride, costs ); + h->pixf.fpelcmp_x4[i_pixel]( m->p_fenc[0], src0, src1, src2, src3, stride, costs ); COPY2_IF_LT( bcost, costs[0] + p_cost_mvx[omx ] + p_cost_mvy[omy-2], bmy, omy-2 ); COPY2_IF_LT( bcost, costs[1] + p_cost_mvx[omx ] + p_cost_mvy[omy+2], bmy, omy+2 ); COPY3_IF_LT( bcost, costs[2] + p_cost_mvx[omx-2] + p_cost_mvy[omy ], bmx, omx-2, bmy, omy ); diff --git a/x264.c b/x264.c index 73877c9a..f68755d9 100644 --- a/x264.c +++ b/x264.c @@ -225,7 +225,8 @@ static void Help( x264_param_t *defaults, int b_longhelp ) H1( " - dia: diamond search, radius 1 (fast)\n" " - hex: hexagonal search, radius 2\n" " - umh: uneven multi-hexagon search\n" - " - esa: exhaustive search (slow)\n" ); + " - esa: exhaustive search\n" + " - tesa: hadamard exhaustive search (slow)\n" ); else H0( " - dia, hex, umh\n" ); H0( " --merange Maximum motion vector search range [%d]\n", defaults->analyse.i_me_range ); H1( " --mvrange Maximum motion vector length [-1 (auto)]\n" ); diff --git a/x264.h b/x264.h index 6d2ee753..f4fd1d13 100644 --- a/x264.h +++ b/x264.h @@ -74,6 +74,7 @@ typedef struct x264_t x264_t; #define X264_ME_HEX 1 #define X264_ME_UMH 2 #define X264_ME_ESA 3 +#define X264_ME_TESA 4 #define X264_CQM_FLAT 0 #define X264_CQM_JVT 1 #define X264_CQM_CUSTOM 2 @@ -83,7 +84,7 @@ typedef struct x264_t x264_t; #define X264_RC_ABR 2 static const char * const x264_direct_pred_names[] = { "none", "spatial", "temporal", "auto", 0 }; -static const char * const x264_motion_est_names[] = { "dia", "hex", "umh", "esa", 0 }; +static const char * const x264_motion_est_names[] = { "dia", "hex", "umh", "esa", "tesa", 0 }; static const char * const x264_overscan_names[] = { "undef", "show", "crop", 0 }; static const char * const x264_vidformat_names[] = { "component", "pal", "ntsc", "secam", "mac", "undef", 0 }; static const char * const x264_fullrange_names[] = { "off", "on", 0 }; -- 2.40.0