From 48c2e935e3638a38c988b11204ff52a85bf48fc9 Mon Sep 17 00:00:00 2001 From: Loren Merritt Date: Sat, 15 Oct 2005 00:27:17 +0000 Subject: [PATCH] Use SAD instead of SATD for halfpel motion search. Move multiref termination after halfpel search. Total: 3-7% speedup and +/-.02 dB. patch by Alex Wright. git-svn-id: svn://svn.videolan.org/x264/trunk@329 df754926-b1dd-0310-bc7b-ec298dee348c --- encoder/analyse.c | 32 ++++++------ encoder/me.c | 130 +++++++++++++++++++++++++++------------------- 2 files changed, 93 insertions(+), 69 deletions(-) diff --git a/encoder/analyse.c b/encoder/analyse.c index 6e4c3ab0..a2348ab6 100644 --- a/encoder/analyse.c +++ b/encoder/analyse.c @@ -674,8 +674,8 @@ static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a ) x264_me_t m; int i_ref; int mvc[7][2], i_mvc; - int i_fullpel_thresh = INT_MAX; - int *p_fullpel_thresh = h->i_ref0>1 ? &i_fullpel_thresh : NULL; + int i_halfpel_thresh = INT_MAX; + int *p_halfpel_thresh = h->i_ref0>1 ? &i_halfpel_thresh : NULL; /* 16x16 Search on all ref frame */ m.i_pixel = PIXEL_16x16; @@ -686,7 +686,7 @@ static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a ) for( i_ref = 0; i_ref < h->i_ref0; i_ref++ ) { const int i_ref_cost = REF_COST( 0, i_ref ); - i_fullpel_thresh -= i_ref_cost; + i_halfpel_thresh -= i_ref_cost; m.i_ref_cost = i_ref_cost; m.i_ref = i_ref; @@ -694,10 +694,10 @@ static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a ) LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, 0 ); x264_mb_predict_mv_16x16( h, 0, i_ref, m.mvp ); x264_mb_predict_mv_ref16x16( h, 0, i_ref, mvc, &i_mvc ); - x264_me_search_ref( h, &m, mvc, i_mvc, p_fullpel_thresh ); + x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh ); m.cost += i_ref_cost; - i_fullpel_thresh += i_ref_cost; + i_halfpel_thresh += i_ref_cost; if( m.cost < a->l0.me16x16.cost ) a->l0.me16x16 = m; @@ -726,8 +726,8 @@ static void x264_mb_analyse_inter_p8x8_mixed_ref( x264_t *h, x264_mb_analysis_t x264_me_t m; int i_ref; uint8_t **p_fenc = h->mb.pic.p_fenc; - int i_fullpel_thresh = INT_MAX; - int *p_fullpel_thresh = /*h->i_ref0>1 ? &i_fullpel_thresh : */NULL; + int i_halfpel_thresh = INT_MAX; + int *p_halfpel_thresh = /*h->i_ref0>1 ? &i_halfpel_thresh : */NULL; int i; int i_maxref = h->i_ref0-1; @@ -767,17 +767,17 @@ static void x264_mb_analyse_inter_p8x8_mixed_ref( x264_t *h, x264_mb_analysis_t for( i_ref = 0; i_ref <= i_maxref; i_ref++ ) { const int i_ref_cost = REF_COST( 0, i_ref ); - i_fullpel_thresh -= i_ref_cost; + i_halfpel_thresh -= i_ref_cost; m.i_ref_cost = i_ref_cost; m.i_ref = i_ref; LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 8*x8, 8*y8 ); x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, i_ref ); x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp ); - x264_me_search_ref( h, &m, a->l0.mvc[i_ref], i+1, p_fullpel_thresh ); + x264_me_search_ref( h, &m, a->l0.mvc[i_ref], i+1, p_halfpel_thresh ); m.cost += i_ref_cost; - i_fullpel_thresh += i_ref_cost; + i_halfpel_thresh += i_ref_cost; *(uint64_t*)a->l0.mvc[i_ref][i+1] = *(uint64_t*)m.mv; if( m.cost < l0m->cost ) @@ -1166,8 +1166,8 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a ) x264_me_t m; int i_ref; int mvc[8][2], i_mvc; - int i_fullpel_thresh = INT_MAX; - int *p_fullpel_thresh = h->i_ref0>1 ? &i_fullpel_thresh : NULL; + int i_halfpel_thresh = INT_MAX; + int *p_halfpel_thresh = h->i_ref0>1 ? &i_halfpel_thresh : NULL; /* 16x16 Search on all ref frame */ m.i_pixel = PIXEL_16x16; @@ -1182,7 +1182,7 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a ) LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, 0 ); x264_mb_predict_mv_16x16( h, 0, i_ref, m.mvp ); x264_mb_predict_mv_ref16x16( h, 0, i_ref, mvc, &i_mvc ); - x264_me_search_ref( h, &m, mvc, i_mvc, p_fullpel_thresh ); + x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh ); /* add ref cost */ m.cost += REF_COST( 0, i_ref ); @@ -1201,8 +1201,8 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a ) a->l0.me16x16.cost -= REF_COST( 0, a->l0.i_ref ); /* ME for list 1 */ - i_fullpel_thresh = INT_MAX; - p_fullpel_thresh = h->i_ref1>1 ? &i_fullpel_thresh : NULL; + i_halfpel_thresh = INT_MAX; + p_halfpel_thresh = h->i_ref1>1 ? &i_halfpel_thresh : NULL; a->l1.me16x16.cost = INT_MAX; for( i_ref = 0; i_ref < h->i_ref1; i_ref++ ) { @@ -1210,7 +1210,7 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a ) LOAD_HPELS( &m, h->mb.pic.p_fref[1][i_ref], 0, 0 ); x264_mb_predict_mv_16x16( h, 1, i_ref, m.mvp ); x264_mb_predict_mv_ref16x16( h, 1, i_ref, mvc, &i_mvc ); - x264_me_search_ref( h, &m, mvc, i_mvc, p_fullpel_thresh ); + x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh ); /* add ref cost */ m.cost += REF_COST( 1, i_ref ); diff --git a/encoder/me.c b/encoder/me.c index b2717547..326ee721 100644 --- a/encoder/me.c +++ b/encoder/me.c @@ -36,13 +36,13 @@ static const int subpel_iterations[][4] = {{1,0,0,0}, {1,1,0,0}, - {1,2,0,0}, + {0,1,1,0}, {0,2,1,0}, {0,2,1,1}, {0,2,1,2}, {0,0,2,3}}; -static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_iters ); +static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_iters, int *p_halfpel_thresh, int b_refine_qpel ); #define COST_MV( mx, my ) \ { \ @@ -58,11 +58,10 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite } \ } -void x264_me_search_ref( x264_t *h, x264_me_t *m, int (*mvc)[2], int i_mvc, int *p_fullpel_thresh ) +void x264_me_search_ref( x264_t *h, x264_me_t *m, int (*mvc)[2], int i_mvc, int *p_halfpel_thresh ) { const int i_pixel = m->i_pixel; const int i_me_range = h->param.analyse.i_me_range; - const int b_chroma_me = h->mb.b_chroma_me && i_pixel <= PIXEL_8x8; int bmx, bmy, bcost; int omx, omy, pmx, pmy; uint8_t *p_fref = m->p_fref[0]; @@ -86,7 +85,7 @@ void x264_me_search_ref( x264_t *h, x264_me_t *m, int (*mvc)[2], int i_mvc, int bmx = pmx = x264_clip3( ( m->mvp[0] + 2 ) >> 2, mv_x_min, mv_x_max ); bmy = pmy = x264_clip3( ( m->mvp[1] + 2 ) >> 2, mv_y_min, mv_y_max ); bcost = COST_MAX; - COST_MV( bmx, bmy ); + COST_MV( pmx, pmy ); /* I don't know why this helps */ bcost -= p_cost_mvx[ bmx<<2 ] + p_cost_mvy[ bmy<<2 ]; @@ -246,39 +245,16 @@ umh_small_hex: /* compute the real cost */ m->cost_mv = p_cost_mvx[ m->mv[0] ] + p_cost_mvy[ m->mv[1] ]; - m->cost = h->pixf.mbcmp[i_pixel]( m->p_fenc[0], m->i_stride[0], - &p_fref[bmy * m->i_stride[0] + bmx], m->i_stride[0] ) - + m->cost_mv; - if( b_chroma_me ) - { - const int bw = x264_pixel_size[m->i_pixel].w; - const int bh = x264_pixel_size[m->i_pixel].h; - DECLARE_ALIGNED( uint8_t, pix[8*8*2], 16 ); - h->mc.mc_chroma( m->p_fref[4], m->i_stride[1], pix, 8, m->mv[0], m->mv[1], bw/2, bh/2 ); - h->mc.mc_chroma( m->p_fref[5], m->i_stride[1], pix+8*8, 8, m->mv[0], m->mv[1], bw/2, bh/2 ); - m->cost += h->pixf.mbcmp[i_pixel+3]( m->p_fenc[1], m->i_stride[1], pix, 8 ) - + h->pixf.mbcmp[i_pixel+3]( m->p_fenc[2], m->i_stride[1], pix+8*8, 8 ); - } - + m->cost = bcost; + if( bmx == pmx && bmy == pmy ) + m->cost += m->cost_mv; + /* subpel refine */ - if( h->mb.i_subpel_refine >= 3 ) + if( h->mb.i_subpel_refine >= 2 ) { - int hpel, qpel; - - /* early termination (when examining multiple reference frames) - * FIXME: this can update fullpel_thresh even if the match - * ref is rejected after subpel refinement */ - if( p_fullpel_thresh ) - { - if( (m->cost*7)>>3 > *p_fullpel_thresh ) - return; - else if( m->cost < *p_fullpel_thresh ) - *p_fullpel_thresh = m->cost; - } - - hpel = subpel_iterations[h->mb.i_subpel_refine][2]; - qpel = subpel_iterations[h->mb.i_subpel_refine][3]; - refine_subpel( h, m, hpel, qpel ); + int hpel = subpel_iterations[h->mb.i_subpel_refine][2]; + int qpel = subpel_iterations[h->mb.i_subpel_refine][3]; + refine_subpel( h, m, hpel, qpel, p_halfpel_thresh, 0 ); } } #undef COST_MV @@ -291,10 +267,24 @@ void x264_me_refine_qpel( x264_t *h, x264_me_t *m ) if( m->i_pixel <= PIXEL_8x8 && h->sh.i_type == SLICE_TYPE_P ) m->cost -= m->i_ref_cost; - refine_subpel( h, m, hpel, qpel ); + refine_subpel( h, m, hpel, qpel, NULL, 1 ); } -#define COST_MV( mx, my ) \ +#define COST_MV_SAD( mx, my ) \ +{ \ + int stride = 16; \ + uint8_t *src = h->mc.get_ref( m->p_fref, m->i_stride[0], pix, &stride, mx, my, bw, bh ); \ + int cost = h->pixf.sad[i_pixel]( m->p_fenc[0], m->i_stride[0], src, stride ) \ + + p_cost_mvx[ mx ] + p_cost_mvy[ my ]; \ + if( cost < bcost ) \ + { \ + bcost = cost; \ + bmx = mx; \ + bmy = my; \ + } \ +} + +#define COST_MV_SATD( mx, my ) \ { \ int stride = 16; \ uint8_t *src = h->mc.get_ref( m->p_fref, m->i_stride[0], pix, &stride, mx, my, bw, bh ); \ @@ -318,7 +308,7 @@ void x264_me_refine_qpel( x264_t *h, x264_me_t *m ) } \ } -static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_iters ) +static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_iters, int *p_halfpel_thresh, int b_refine_qpel ) { const int bw = x264_pixel_size[m->i_pixel].w; const int bh = x264_pixel_size[m->i_pixel].h; @@ -328,12 +318,14 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite const int b_chroma_me = h->mb.b_chroma_me && i_pixel <= PIXEL_8x8; DECLARE_ALIGNED( uint8_t, pix[16*16], 16 ); - int step, i; + int omx, omy; + int i; int bmx = m->mv[0]; int bmy = m->mv[1]; int bcost = m->cost; + /* try the subpel component of the predicted mv if it's close to * the result of the fullpel search */ if( hpel_iters ) @@ -341,22 +333,54 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite int mx = X264_ABS(bmx - m->mvp[0]) < 4 ? m->mvp[0] : bmx; int my = X264_ABS(bmy - m->mvp[1]) < 4 ? m->mvp[1] : bmy; if( mx != bmx || my != bmy ) - COST_MV( mx, my ); + COST_MV_SAD( mx, my ); } - - for( step = 2; step >= 1; step-- ) + + /* hpel search */ + for( i = hpel_iters; i > 0; i-- ) + { + omx = bmx; + omy = bmy; + COST_MV_SAD( omx, omy - 2 ); + COST_MV_SAD( omx, omy + 2 ); + COST_MV_SAD( omx - 2, omy ); + COST_MV_SAD( omx + 2, omy ); + if( bmx == omx && bmy == omy ) + break; + } + + if( !b_refine_qpel ) { - for( i = step>1 ? hpel_iters : qpel_iters; i > 0; i-- ) + bcost = COST_MAX; + COST_MV_SATD( bmx, bmy ); + } + + /* early termination when examining multiple reference frames */ + if( p_halfpel_thresh ) + { + if( (bcost*7)>>3 > *p_halfpel_thresh ) { - int omx = bmx; - int omy = bmy; - COST_MV( omx, omy - step ); - COST_MV( omx, omy + step ); - COST_MV( omx - step, omy ); - COST_MV( omx + step, omy ); - if( bmx == omx && bmy == omy ) - break; - } + m->cost = bcost; + m->mv[0] = bmx; + m->mv[1] = bmy; + // don't need cost_mv + return; + } + else if( bcost < *p_halfpel_thresh ) + *p_halfpel_thresh = bcost; + } + + /* qpel search */ + for( i = qpel_iters; i > 0; i-- ) + { + omx = bmx; + omy = bmy; + COST_MV_SATD( omx, omy - 1 ); + COST_MV_SATD( omx, omy + 1 ); + COST_MV_SATD( omx - 1, omy ); + COST_MV_SATD( omx + 1, omy ); + if( bmx == omx && bmy == omy ) + break; } m->cost = bcost; -- 2.40.0