From: Loren Merritt Date: Mon, 14 Feb 2005 23:32:38 +0000 (+0000) Subject: Precalculate lambda*bits for all allowed mvs. 1-2% speedup. X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=d688918e861714e23f8fa7bdaaa6bf47ffec0395;p=libx264 Precalculate lambda*bits for all allowed mvs. 1-2% speedup. git-svn-id: svn://svn.videolan.org/x264/trunk@127 df754926-b1dd-0310-bc7b-ec298dee348c --- diff --git a/common/common.c b/common/common.c index 4148b8af..997a7ce7 100644 --- a/common/common.c +++ b/common/common.c @@ -101,6 +101,7 @@ void x264_param_default( x264_param_t *param ) param->analyse.inter = X264_ANALYSE_I4x4 | X264_ANALYSE_PSUB16x16 | X264_ANALYSE_BSUB16x16; param->analyse.i_direct_mv_pred = X264_DIRECT_PRED_TEMPORAL; param->analyse.i_subpel_refine = 5; + param->analyse.i_mv_range = 512; param->analyse.b_psnr = 1; } diff --git a/encoder/analyse.c b/encoder/analyse.c index 98c97d45..1f339e39 100644 --- a/encoder/analyse.c +++ b/encoder/analyse.c @@ -70,6 +70,7 @@ typedef struct /* conduct the analysis using this lamda and QP */ int i_lambda; int i_qp; + int16_t *p_cost_mv; /* I: Intra part */ @@ -135,6 +136,27 @@ static const int i_sub_mb_p_cost_table[4] = { 5, 3, 3, 1 }; +/* initialize an array of lambda*nbits for all possible mvs */ +static void x264_mb_analyse_load_costs( x264_t *h, x264_mb_analysis_t *a ) +{ + static int16_t *p_cost_mv[52]; + + if( !p_cost_mv[a->i_qp] ) + { + /* could be faster, but isn't called many times */ + int i; + p_cost_mv[a->i_qp] = x264_malloc( (2*4*h->param.analyse.i_mv_range + 1) * sizeof(int16_t) ); + p_cost_mv[a->i_qp] += 4*h->param.analyse.i_mv_range; + for( i = 0; i <= 4*h->param.analyse.i_mv_range; i++ ) + { + p_cost_mv[a->i_qp][-i] = + p_cost_mv[a->i_qp][i] = a->i_lambda * bs_size_se( i ); + } + } + + a->p_cost_mv = p_cost_mv[a->i_qp]; +} + static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp ) { memset( a, 0, sizeof( x264_mb_analysis_t ) ); @@ -152,19 +174,22 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp ) if( h->sh.i_type != SLICE_TYPE_I ) { int i; + int i_fmv_range = h->param.analyse.i_mv_range - 16; /* Calculate max allowed MV range */ - h->mb.mv_min_fpel[0] = -16*h->mb.i_mb_x - 8; - h->mb.mv_max_fpel[0] = 16*( h->sps->i_mb_width - h->mb.i_mb_x ) - 8; +#define CLIP_FMV(mv) x264_clip3( mv, -i_fmv_range, i_fmv_range ) + h->mb.mv_min_fpel[0] = CLIP_FMV( -16*h->mb.i_mb_x - 8 ); + h->mb.mv_max_fpel[0] = CLIP_FMV( 16*( h->sps->i_mb_width - h->mb.i_mb_x ) - 8 ); h->mb.mv_min[0] = 4*( h->mb.mv_min_fpel[0] - 16 ); h->mb.mv_max[0] = 4*( h->mb.mv_max_fpel[0] + 16 ); if( h->mb.i_mb_x == 0) { - h->mb.mv_min_fpel[1] = -16*h->mb.i_mb_y - 8; - h->mb.mv_max_fpel[1] = 16*( h->sps->i_mb_height - h->mb.i_mb_y ) - 8; + h->mb.mv_min_fpel[1] = CLIP_FMV( -16*h->mb.i_mb_y - 8 ); + h->mb.mv_max_fpel[1] = CLIP_FMV( 16*( h->sps->i_mb_height - h->mb.i_mb_y ) - 8 ); h->mb.mv_min[1] = 4*( h->mb.mv_min_fpel[1] - 16 ); h->mb.mv_max[1] = 4*( h->mb.mv_max_fpel[1] + 16 ); } +#undef CLIP_FMV a->l0.me16x16.cost = -1; a->l0.i_cost8x8 = -1; @@ -502,14 +527,14 @@ static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a ) /* 16x16 Search on all ref frame */ m.i_pixel = PIXEL_16x16; - m.lm = a->i_lambda; + m.p_cost_mv = a->p_cost_mv; m.p_fenc = h->mb.pic.p_fenc[0]; m.i_stride= h->mb.pic.i_stride[0]; a->l0.me16x16.cost = INT_MAX; for( i_ref = 0; i_ref < h->i_ref0; i_ref++ ) { - const int i_ref_cost = m.lm * bs_size_te( h->sh.i_num_ref_idx_l0_active - 1, i_ref ); + const int i_ref_cost = a->i_lambda * bs_size_te( h->sh.i_num_ref_idx_l0_active - 1, i_ref ); i_fullpel_thresh -= i_ref_cost; /* search with ref */ @@ -533,7 +558,7 @@ static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a ) } /* subtract ref cost, so we don't have to add it for the other P types */ - a->l0.me16x16.cost -= m.lm * bs_size_te( h->sh.i_num_ref_idx_l0_active - 1, a->l0.i_ref ); + a->l0.me16x16.cost -= a->i_lambda * bs_size_te( h->sh.i_num_ref_idx_l0_active - 1, a->l0.i_ref ); /* Set global ref, needed for all others modes */ x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref ); @@ -560,7 +585,7 @@ static void x264_mb_analyse_inter_p8x8( x264_t *h, x264_mb_analysis_t *a ) const int y8 = i/2; m->i_pixel = PIXEL_8x8; - m->lm = a->i_lambda; + m->p_cost_mv = a->p_cost_mv; m->p_fenc = &p_fenc[8*(y8*h->mb.pic.i_stride[0]+x8)]; m->i_stride= h->mb.pic.i_stride[0]; @@ -598,7 +623,7 @@ static void x264_mb_analyse_inter_p16x8( x264_t *h, x264_mb_analysis_t *a ) x264_me_t *m = &a->l0.me16x8[i]; m->i_pixel = PIXEL_16x8; - m->lm = a->i_lambda; + m->p_cost_mv = a->p_cost_mv; m->p_fenc = &p_fenc[8*i*h->mb.pic.i_stride[0]]; m->i_stride= h->mb.pic.i_stride[0]; @@ -633,7 +658,7 @@ static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a ) x264_me_t *m = &a->l0.me8x16[i]; m->i_pixel = PIXEL_8x16; - m->lm = a->i_lambda; + m->p_cost_mv = a->p_cost_mv; m->p_fenc = &p_fenc[8*i]; m->i_stride= h->mb.pic.i_stride[0]; @@ -673,7 +698,7 @@ static void x264_mb_analyse_inter_p4x4( x264_t *h, x264_mb_analysis_t *a, int i8 x264_me_t *m = &a->l0.me4x4[i8x8][i4x4]; m->i_pixel = PIXEL_4x4; - m->lm = a->i_lambda; + m->p_cost_mv = a->p_cost_mv; m->p_fenc = &p_fenc[4*(y4*h->mb.pic.i_stride[0]+x4)]; m->i_stride= h->mb.pic.i_stride[0]; @@ -712,7 +737,7 @@ static void x264_mb_analyse_inter_p8x4( x264_t *h, x264_mb_analysis_t *a, int i8 x264_me_t *m = &a->l0.me8x4[i8x8][i8x4]; m->i_pixel = PIXEL_8x4; - m->lm = a->i_lambda; + m->p_cost_mv = a->p_cost_mv; m->p_fenc = &p_fenc[4*(y4*h->mb.pic.i_stride[0]+x4)]; m->i_stride= h->mb.pic.i_stride[0]; @@ -748,7 +773,7 @@ static void x264_mb_analyse_inter_p4x8( x264_t *h, x264_mb_analysis_t *a, int i8 x264_me_t *m = &a->l0.me4x8[i8x8][i4x8]; m->i_pixel = PIXEL_4x8; - m->lm = a->i_lambda; + m->p_cost_mv = a->p_cost_mv; m->p_fenc = &p_fenc[4*(y4*h->mb.pic.i_stride[0]+x4)]; m->i_stride= h->mb.pic.i_stride[0]; @@ -805,7 +830,7 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a ) /* 16x16 Search on all ref frame */ m.i_pixel = PIXEL_16x16; - m.lm = a->i_lambda; + m.p_cost_mv = a->p_cost_mv; m.p_fenc = h->mb.pic.p_fenc[0]; m.i_stride= h->mb.pic.i_stride[0]; @@ -820,7 +845,7 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a ) x264_me_search_ref( h, &m, mvc, i_mvc, p_fullpel_thresh ); /* add ref cost */ - m.cost += m.lm * bs_size_te( h->sh.i_num_ref_idx_l0_active - 1, i_ref ); + m.cost += a->i_lambda * bs_size_te( h->sh.i_num_ref_idx_l0_active - 1, i_ref ); if( m.cost < a->l0.me16x16.cost ) { @@ -833,7 +858,7 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a ) h->mb.mvr[0][i_ref][h->mb.i_mb_xy][1] = m.mv[1]; } /* subtract ref cost, so we don't have to add it for the other MB types */ - a->l0.me16x16.cost -= m.lm * bs_size_te( h->sh.i_num_ref_idx_l0_active - 1, a->l0.i_ref ); + a->l0.me16x16.cost -= a->i_lambda * bs_size_te( h->sh.i_num_ref_idx_l0_active - 1, a->l0.i_ref ); /* ME for list 1 */ /* not using fullpel_thresh since we don't yet do more than 1 list 1 ref */ @@ -847,7 +872,7 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a ) x264_me_search( h, &m, mvc, i_mvc ); /* add ref cost */ - m.cost += m.lm * bs_size_te( h->sh.i_num_ref_idx_l1_active - 1, i_ref ); + m.cost += a->i_lambda * bs_size_te( h->sh.i_num_ref_idx_l1_active - 1, i_ref ); if( m.cost < a->l1.me16x16.cost ) { @@ -860,7 +885,7 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a ) h->mb.mvr[1][i_ref][h->mb.i_mb_xy][1] = m.mv[1]; } /* subtract ref cost, so we don't have to add it for the other MB types */ - a->l1.me16x16.cost -= m.lm * bs_size_te( h->sh.i_num_ref_idx_l1_active - 1, a->l1.i_ref ); + a->l1.me16x16.cost -= a->i_lambda * bs_size_te( h->sh.i_num_ref_idx_l1_active - 1, a->l1.i_ref ); /* Set global ref, needed for other modes? */ x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref ); @@ -894,13 +919,11 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a ) h->pixf.avg[PIXEL_16x16]( pix1, 16, src2, stride2 ); - a->i_cost16x16bi = h->pixf.satd[PIXEL_16x16]( h->mb.pic.p_fenc[0], h->mb.pic.i_stride[0], pix1, 16 ) + - a->i_lambda * ( bs_size_te( h->sh.i_num_ref_idx_l0_active - 1, a->l0.i_ref ) + - bs_size_te( h->sh.i_num_ref_idx_l1_active - 1, a->l1.i_ref ) + - bs_size_se( a->l0.me16x16.mv[0] - a->l0.me16x16.mvp[0] ) + - bs_size_se( a->l0.me16x16.mv[1] - a->l0.me16x16.mvp[1] ) + - bs_size_se( a->l1.me16x16.mv[0] - a->l1.me16x16.mvp[0] ) + - bs_size_se( a->l1.me16x16.mv[1] - a->l1.me16x16.mvp[1] ) ); + a->i_cost16x16bi = h->pixf.satd[PIXEL_16x16]( h->mb.pic.p_fenc[0], h->mb.pic.i_stride[0], pix1, 16 ) + + a->i_lambda * ( bs_size_te( h->sh.i_num_ref_idx_l0_active - 1, a->l0.i_ref ) + + bs_size_te( h->sh.i_num_ref_idx_l1_active - 1, a->l1.i_ref ) ) + + a->l0.me16x16.cost_mv + + a->l1.me16x16.cost_mv; /* mb type cost */ a->i_cost16x16bi += a->i_lambda * i_mb_b_cost_table[B_BI_BI]; @@ -1000,7 +1023,7 @@ static void x264_mb_analyse_inter_b8x8( x264_t *h, x264_mb_analysis_t *a ) x264_me_t *m = &lX->me8x8[i]; m->i_pixel = PIXEL_8x8; - m->lm = a->i_lambda; + m->p_cost_mv = a->p_cost_mv; m->p_fenc = p_fenc_i; m->i_stride = h->mb.pic.i_stride[0]; @@ -1014,8 +1037,7 @@ static void x264_mb_analyse_inter_b8x8( x264_t *h, x264_mb_analysis_t *a ) /* BI mode */ h->mc.mc_luma( m->p_fref, m->i_stride, pix[l], 8, m->mv[0], m->mv[1], 8, 8 ); - i_part_cost_bi += a->i_lambda * ( bs_size_se( m->mv[0] - m->mvp[0] ) + - bs_size_se( m->mv[1] - m->mvp[1] ) ); + i_part_cost_bi += m->cost_mv; /* FIXME: ref cost */ } @@ -1088,7 +1110,7 @@ static void x264_mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a ) x264_me_t *m = &lX->me16x8[i]; m->i_pixel = PIXEL_16x8; - m->lm = a->i_lambda; + m->p_cost_mv = a->p_cost_mv; m->p_fenc = p_fenc_i; m->i_stride= i_ref_stride; @@ -1106,8 +1128,7 @@ static void x264_mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a ) h->mc.mc_luma( m->p_fref, m->i_stride, pix[l], 8, m->mv[0], m->mv[1], 8, 8 ); /* FIXME: ref cost */ - i_part_cost_bi += a->i_lambda * ( bs_size_se( m->mv[0] - m->mvp[0] ) + - bs_size_se( m->mv[1] - m->mvp[1] ) ); + i_part_cost_bi += m->cost_mv; } h->pixf.avg[PIXEL_16x8]( pix[0], 8, pix[1], 8 ); @@ -1172,7 +1193,7 @@ static void x264_mb_analyse_inter_b8x16( x264_t *h, x264_mb_analysis_t *a ) x264_me_t *m = &lX->me8x16[i]; m->i_pixel = PIXEL_8x16; - m->lm = a->i_lambda; + m->p_cost_mv = a->p_cost_mv; m->p_fenc = p_fenc_i; m->i_stride= i_ref_stride; @@ -1190,8 +1211,7 @@ static void x264_mb_analyse_inter_b8x16( x264_t *h, x264_mb_analysis_t *a ) h->mc.mc_luma( m->p_fref, m->i_stride, pix[l], 8, m->mv[0], m->mv[1], 8, 8 ); /* FIXME: ref cost */ - i_part_cost_bi += a->i_lambda * ( bs_size_se( m->mv[0] - m->mvp[0] ) + - bs_size_se( m->mv[1] - m->mvp[1] ) ); + i_part_cost_bi += m->cost_mv; } h->pixf.avg[PIXEL_8x16]( pix[0], 8, pix[1], 8 ); @@ -1277,6 +1297,8 @@ void x264_macroblock_analyse( x264_t *h ) int i_type; int i_partition; + x264_mb_analyse_load_costs( h, &analysis ); + x264_mb_analyse_inter_p16x16( h, &analysis ); if( flags & X264_ANALYSE_PSUB16x16 ) x264_mb_analyse_inter_p8x8( h, &analysis ); @@ -1454,6 +1476,8 @@ void x264_macroblock_analyse( x264_t *h ) int i_partition; int i_cost; + x264_mb_analyse_load_costs( h, &analysis ); + /* select best inter mode */ /* direct must be first */ if( analysis.b_direct_available ) diff --git a/encoder/me.c b/encoder/me.c index 79ab54ef..628ef9a7 100644 --- a/encoder/me.c +++ b/encoder/me.c @@ -46,9 +46,9 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite #define COST_MV( mx, my ) \ { \ int cost = h->pixf.sad[i_pixel]( m->p_fenc, m->i_stride, \ - &p_fref[(my)*m->i_stride+(mx)], m->i_stride ) + \ - m->lm * ( bs_size_se(((mx)<<2) - m->mvp[0] ) + \ - bs_size_se(((my)<<2) - m->mvp[1] ) ); \ + &p_fref[(my)*m->i_stride+(mx)], m->i_stride ) \ + + p_cost_mvx[ (mx)<<2 ] \ + + p_cost_mvy[ (my)<<2 ]; \ if( cost < bcost ) \ { \ bcost = cost; \ @@ -70,6 +70,12 @@ void x264_me_search_ref( x264_t *h, x264_me_t *m, int (*mvc)[2], int i_mvc, int const int mv_x_max = h->mb.mv_max_fpel[0]; const int mv_y_max = h->mb.mv_max_fpel[1]; + /* FIXME could theoretically run off the end of the prepared array of costs, + * if some mv predictors are very far from mvp */ + const int16_t *p_cost_mvx = m->p_cost_mv - m->mvp[0]; + const int16_t *p_cost_mvy = m->p_cost_mv - m->mvp[1]; + + /* init with mvp */ /* XXX: We don't need to clamp because the way diamond work, we will * never go outside padded picture, and predict mv won't compute vector @@ -145,10 +151,10 @@ void x264_me_search_ref( x264_t *h, x264_me_t *m, int (*mvc)[2], int i_mvc, int m->mv[1] = bmy << 2; /* compute the real cost */ + m->cost_mv = p_cost_mvx[ m->mv[0] ] + p_cost_mvy[ m->mv[1] ]; m->cost = h->pixf.satd[i_pixel]( m->p_fenc, m->i_stride, - &p_fref[bmy * m->i_stride + bmx], m->i_stride ) + - m->lm * ( bs_size_se( m->mv[0] - m->mvp[0] ) + - bs_size_se( m->mv[1] - m->mvp[1] ) ); + &p_fref[bmy * m->i_stride + bmx], m->i_stride ) + + m->cost_mv; /* subpel refine */ if( h->param.analyse.i_subpel_refine >= 3 ) @@ -185,6 +191,8 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite { const int bw = x264_pixel_size[m->i_pixel].w; const int bh = x264_pixel_size[m->i_pixel].h; + const int16_t *p_cost_mvx = m->p_cost_mv - m->mvp[0]; + const int16_t *p_cost_mvy = m->p_cost_mv - m->mvp[1]; DECLARE_ALIGNED( uint8_t, pix[4][16*16], 16 ); uint8_t * src[4]; @@ -207,13 +215,13 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite src[3] = h->mc.get_ref( m->p_fref, m->i_stride, pix[3], &stride[3], bmx + step, bmy + 0, bw, bh ); cost[0] = h->pixf.satd[m->i_pixel]( m->p_fenc, m->i_stride, src[0], stride[0] ) + - m->lm * ( bs_size_se( bmx + 0 - m->mvp[0] ) + bs_size_se( bmy - step - m->mvp[1] ) ); + p_cost_mvx[ bmx + 0 ] + p_cost_mvy[ bmy - step ]; cost[1] = h->pixf.satd[m->i_pixel]( m->p_fenc, m->i_stride, src[1], stride[1] ) + - m->lm * ( bs_size_se( bmx + 0 - m->mvp[0] ) + bs_size_se( bmy + step - m->mvp[1] ) ); + p_cost_mvx[ bmx + 0 ] + p_cost_mvy[ bmy + step ]; cost[2] = h->pixf.satd[m->i_pixel]( m->p_fenc, m->i_stride, src[2], stride[2] ) + - m->lm * ( bs_size_se( bmx - step - m->mvp[0] ) + bs_size_se( bmy + 0 - m->mvp[1] ) ); + p_cost_mvx[ bmx - step ] + p_cost_mvy[ bmy + 0 ]; cost[3] = h->pixf.satd[m->i_pixel]( m->p_fenc, m->i_stride, src[3], stride[3] ) + - m->lm * ( bs_size_se( bmx + step - m->mvp[0] ) + bs_size_se( bmy + 0 - m->mvp[1] ) ); + p_cost_mvx[ bmx + step ] + p_cost_mvy[ bmy + 0 ]; best = 0; if( cost[1] < cost[0] ) best = 1; @@ -234,5 +242,6 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite m->mv[0] = bmx; m->mv[1] = bmy; + m->cost_mv = p_cost_mvx[ bmx ] + p_cost_mvy[ bmy ]; } diff --git a/encoder/me.h b/encoder/me.h index ef1adb06..d957b779 100644 --- a/encoder/me.h +++ b/encoder/me.h @@ -28,7 +28,7 @@ typedef struct { /* input */ int i_pixel; /* PIXEL_WxH */ - int lm; /* lambda motion */ + int16_t *p_cost_mv; /* lambda * nbits for each possible mv */ uint8_t *p_fref[4]; uint8_t *p_fenc; @@ -37,7 +37,8 @@ typedef struct int mvp[2]; /* output */ - int cost; /* satd + lm * nbits */ + int cost_mv; /* lambda * nbits for the chosen mv */ + int cost; /* satd + lambda * nbits */ int mv[2]; } x264_me_t; diff --git a/x264.h b/x264.h index 3cd571da..a62d2502 100644 --- a/x264.h +++ b/x264.h @@ -26,7 +26,7 @@ #include -#define X264_BUILD 0x000e +#define X264_BUILD 0x000f /* x264_t: * opaque handler for decoder and encoder */ @@ -136,6 +136,8 @@ typedef struct int i_subpel_refine; /* subpixel motion estimation quality */ + int i_mv_range; /* maximum length of a mv (in pixels) */ + int b_psnr; /* Do we compute PSNR stats (save a few % of cpu) */ } analyse;