/* conduct the analysis using this lambda and QP */
int i_lambda;
int i_qp;
+ int16_t *p_cost_mv;
/* I: Intra part */
5, 3, 3, 1
};
+/* initialize an array of lambda*nbits for all possible mvs */
+static void x264_mb_analyse_load_costs( x264_t *h, x264_mb_analysis_t *a )
+{
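+ /* one cached table per QP (0-51), allocated on first use */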
+ static int16_t *p_cost_mv[52];
+
+ if( !p_cost_mv[a->i_qp] )
+ {
+ /* could be faster, but isn't called many times */
+ int i;
+ p_cost_mv[a->i_qp] = x264_malloc( (2*4*h->param.analyse.i_mv_range + 1) * sizeof(int16_t) );
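+ /* recenter the pointer so it can be indexed by negative mv deltas */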
+ p_cost_mv[a->i_qp] += 4*h->param.analyse.i_mv_range;
+ for( i = 0; i <= 4*h->param.analyse.i_mv_range; i++ )
+ {
+ p_cost_mv[a->i_qp][-i] =
+ p_cost_mv[a->i_qp][i] = a->i_lambda * bs_size_se( i );
+ }
+ }
+
+ a->p_cost_mv = p_cost_mv[a->i_qp];
+}
+
static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
{
memset( a, 0, sizeof( x264_mb_analysis_t ) );
if( h->sh.i_type != SLICE_TYPE_I )
{
int i;
+ int i_fmv_range = h->param.analyse.i_mv_range - 16;
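+ /* 16 pels of headroom: mv_min/mv_max below extend 16 fpel past the clipped fpel limits, so qpel mvs stay within 4*i_mv_range */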
/* Calculate max allowed MV range */
- h->mb.mv_min_fpel[0] = -16*h->mb.i_mb_x - 8;
- h->mb.mv_max_fpel[0] = 16*( h->sps->i_mb_width - h->mb.i_mb_x ) - 8;
+#define CLIP_FMV(mv) x264_clip3( mv, -i_fmv_range, i_fmv_range )
+ h->mb.mv_min_fpel[0] = CLIP_FMV( -16*h->mb.i_mb_x - 8 );
+ h->mb.mv_max_fpel[0] = CLIP_FMV( 16*( h->sps->i_mb_width - h->mb.i_mb_x ) - 8 );
h->mb.mv_min[0] = 4*( h->mb.mv_min_fpel[0] - 16 );
h->mb.mv_max[0] = 4*( h->mb.mv_max_fpel[0] + 16 );
if( h->mb.i_mb_x == 0 )
{
- h->mb.mv_min_fpel[1] = -16*h->mb.i_mb_y - 8;
- h->mb.mv_max_fpel[1] = 16*( h->sps->i_mb_height - h->mb.i_mb_y ) - 8;
+ h->mb.mv_min_fpel[1] = CLIP_FMV( -16*h->mb.i_mb_y - 8 );
+ h->mb.mv_max_fpel[1] = CLIP_FMV( 16*( h->sps->i_mb_height - h->mb.i_mb_y ) - 8 );
h->mb.mv_min[1] = 4*( h->mb.mv_min_fpel[1] - 16 );
h->mb.mv_max[1] = 4*( h->mb.mv_max_fpel[1] + 16 );
}
+#undef CLIP_FMV
a->l0.me16x16.cost = -1;
a->l0.i_cost8x8 = -1;
/* 16x16 Search on all ref frames */
m.i_pixel = PIXEL_16x16;
- m.lm = a->i_lambda;
+ m.p_cost_mv = a->p_cost_mv;
m.p_fenc = h->mb.pic.p_fenc[0];
m.i_stride= h->mb.pic.i_stride[0];
a->l0.me16x16.cost = INT_MAX;
for( i_ref = 0; i_ref < h->i_ref0; i_ref++ )
{
- const int i_ref_cost = m.lm * bs_size_te( h->sh.i_num_ref_idx_l0_active - 1, i_ref );
+ const int i_ref_cost = a->i_lambda * bs_size_te( h->sh.i_num_ref_idx_l0_active - 1, i_ref );
i_fullpel_thresh -= i_ref_cost;
/* search with ref */
}
/* subtract ref cost, so we don't have to add it for the other P types */
- a->l0.me16x16.cost -= m.lm * bs_size_te( h->sh.i_num_ref_idx_l0_active - 1, a->l0.i_ref );
+ a->l0.me16x16.cost -= a->i_lambda * bs_size_te( h->sh.i_num_ref_idx_l0_active - 1, a->l0.i_ref );
/* Set global ref, needed for all other modes */
x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref );
const int y8 = i/2;
m->i_pixel = PIXEL_8x8;
- m->lm = a->i_lambda;
+ m->p_cost_mv = a->p_cost_mv;
m->p_fenc = &p_fenc[8*(y8*h->mb.pic.i_stride[0]+x8)];
m->i_stride= h->mb.pic.i_stride[0];
x264_me_t *m = &a->l0.me16x8[i];
m->i_pixel = PIXEL_16x8;
- m->lm = a->i_lambda;
+ m->p_cost_mv = a->p_cost_mv;
m->p_fenc = &p_fenc[8*i*h->mb.pic.i_stride[0]];
m->i_stride= h->mb.pic.i_stride[0];
x264_me_t *m = &a->l0.me8x16[i];
m->i_pixel = PIXEL_8x16;
- m->lm = a->i_lambda;
+ m->p_cost_mv = a->p_cost_mv;
m->p_fenc = &p_fenc[8*i];
m->i_stride= h->mb.pic.i_stride[0];
x264_me_t *m = &a->l0.me4x4[i8x8][i4x4];
m->i_pixel = PIXEL_4x4;
- m->lm = a->i_lambda;
+ m->p_cost_mv = a->p_cost_mv;
m->p_fenc = &p_fenc[4*(y4*h->mb.pic.i_stride[0]+x4)];
m->i_stride= h->mb.pic.i_stride[0];
x264_me_t *m = &a->l0.me8x4[i8x8][i8x4];
m->i_pixel = PIXEL_8x4;
- m->lm = a->i_lambda;
+ m->p_cost_mv = a->p_cost_mv;
m->p_fenc = &p_fenc[4*(y4*h->mb.pic.i_stride[0]+x4)];
m->i_stride= h->mb.pic.i_stride[0];
x264_me_t *m = &a->l0.me4x8[i8x8][i4x8];
m->i_pixel = PIXEL_4x8;
- m->lm = a->i_lambda;
+ m->p_cost_mv = a->p_cost_mv;
m->p_fenc = &p_fenc[4*(y4*h->mb.pic.i_stride[0]+x4)];
m->i_stride= h->mb.pic.i_stride[0];
/* 16x16 Search on all ref frames */
m.i_pixel = PIXEL_16x16;
- m.lm = a->i_lambda;
+ m.p_cost_mv = a->p_cost_mv;
m.p_fenc = h->mb.pic.p_fenc[0];
m.i_stride= h->mb.pic.i_stride[0];
x264_me_search_ref( h, &m, mvc, i_mvc, p_fullpel_thresh );
/* add ref cost */
- m.cost += m.lm * bs_size_te( h->sh.i_num_ref_idx_l0_active - 1, i_ref );
+ m.cost += a->i_lambda * bs_size_te( h->sh.i_num_ref_idx_l0_active - 1, i_ref );
if( m.cost < a->l0.me16x16.cost )
{
h->mb.mvr[0][i_ref][h->mb.i_mb_xy][1] = m.mv[1];
}
/* subtract ref cost, so we don't have to add it for the other MB types */
- a->l0.me16x16.cost -= m.lm * bs_size_te( h->sh.i_num_ref_idx_l0_active - 1, a->l0.i_ref );
+ a->l0.me16x16.cost -= a->i_lambda * bs_size_te( h->sh.i_num_ref_idx_l0_active - 1, a->l0.i_ref );
/* ME for list 1 */
/* not using fullpel_thresh since we don't yet do more than 1 list 1 ref */
x264_me_search( h, &m, mvc, i_mvc );
/* add ref cost */
- m.cost += m.lm * bs_size_te( h->sh.i_num_ref_idx_l1_active - 1, i_ref );
+ m.cost += a->i_lambda * bs_size_te( h->sh.i_num_ref_idx_l1_active - 1, i_ref );
if( m.cost < a->l1.me16x16.cost )
{
h->mb.mvr[1][i_ref][h->mb.i_mb_xy][1] = m.mv[1];
}
/* subtract ref cost, so we don't have to add it for the other MB types */
- a->l1.me16x16.cost -= m.lm * bs_size_te( h->sh.i_num_ref_idx_l1_active - 1, a->l1.i_ref );
+ a->l1.me16x16.cost -= a->i_lambda * bs_size_te( h->sh.i_num_ref_idx_l1_active - 1, a->l1.i_ref );
/* Set global ref, needed for other modes? */
x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref );
h->pixf.avg[PIXEL_16x16]( pix1, 16, src2, stride2 );
- a->i_cost16x16bi = h->pixf.satd[PIXEL_16x16]( h->mb.pic.p_fenc[0], h->mb.pic.i_stride[0], pix1, 16 ) +
- a->i_lambda * ( bs_size_te( h->sh.i_num_ref_idx_l0_active - 1, a->l0.i_ref ) +
- bs_size_te( h->sh.i_num_ref_idx_l1_active - 1, a->l1.i_ref ) +
- bs_size_se( a->l0.me16x16.mv[0] - a->l0.me16x16.mvp[0] ) +
- bs_size_se( a->l0.me16x16.mv[1] - a->l0.me16x16.mvp[1] ) +
- bs_size_se( a->l1.me16x16.mv[0] - a->l1.me16x16.mvp[0] ) +
- bs_size_se( a->l1.me16x16.mv[1] - a->l1.me16x16.mvp[1] ) );
+ a->i_cost16x16bi = h->pixf.satd[PIXEL_16x16]( h->mb.pic.p_fenc[0], h->mb.pic.i_stride[0], pix1, 16 )
+ + a->i_lambda * ( bs_size_te( h->sh.i_num_ref_idx_l0_active - 1, a->l0.i_ref )
+ + bs_size_te( h->sh.i_num_ref_idx_l1_active - 1, a->l1.i_ref ) )
+ + a->l0.me16x16.cost_mv
+ + a->l1.me16x16.cost_mv;
/* mb type cost */
a->i_cost16x16bi += a->i_lambda * i_mb_b_cost_table[B_BI_BI];
x264_me_t *m = &lX->me8x8[i];
m->i_pixel = PIXEL_8x8;
- m->lm = a->i_lambda;
+ m->p_cost_mv = a->p_cost_mv;
m->p_fenc = p_fenc_i;
m->i_stride = h->mb.pic.i_stride[0];
/* BI mode */
h->mc.mc_luma( m->p_fref, m->i_stride, pix[l], 8,
m->mv[0], m->mv[1], 8, 8 );
- i_part_cost_bi += a->i_lambda * ( bs_size_se( m->mv[0] - m->mvp[0] ) +
- bs_size_se( m->mv[1] - m->mvp[1] ) );
+ i_part_cost_bi += m->cost_mv;
/* FIXME: ref cost */
}
x264_me_t *m = &lX->me16x8[i];
m->i_pixel = PIXEL_16x8;
- m->lm = a->i_lambda;
+ m->p_cost_mv = a->p_cost_mv;
m->p_fenc = p_fenc_i;
m->i_stride= i_ref_stride;
h->mc.mc_luma( m->p_fref, m->i_stride, pix[l], 16,
m->mv[0], m->mv[1], 16, 8 );
/* FIXME: ref cost */
- i_part_cost_bi += a->i_lambda * ( bs_size_se( m->mv[0] - m->mvp[0] ) +
- bs_size_se( m->mv[1] - m->mvp[1] ) );
+ i_part_cost_bi += m->cost_mv;
}
h->pixf.avg[PIXEL_16x8]( pix[0], 16, pix[1], 16 );
x264_me_t *m = &lX->me8x16[i];
m->i_pixel = PIXEL_8x16;
- m->lm = a->i_lambda;
+ m->p_cost_mv = a->p_cost_mv;
m->p_fenc = p_fenc_i;
m->i_stride= i_ref_stride;
h->mc.mc_luma( m->p_fref, m->i_stride, pix[l], 8,
m->mv[0], m->mv[1], 8, 16 );
/* FIXME: ref cost */
- i_part_cost_bi += a->i_lambda * ( bs_size_se( m->mv[0] - m->mvp[0] ) +
- bs_size_se( m->mv[1] - m->mvp[1] ) );
+ i_part_cost_bi += m->cost_mv;
}
h->pixf.avg[PIXEL_8x16]( pix[0], 8, pix[1], 8 );
int i_type;
int i_partition;
+ x264_mb_analyse_load_costs( h, &analysis );
+
x264_mb_analyse_inter_p16x16( h, &analysis );
if( flags & X264_ANALYSE_PSUB16x16 )
x264_mb_analyse_inter_p8x8( h, &analysis );
int i_partition;
int i_cost;
+ x264_mb_analyse_load_costs( h, &analysis );
+
/* select best inter mode */
/* direct must be first */
if( analysis.b_direct_available )
#define COST_MV( mx, my ) \
{ \
int cost = h->pixf.sad[i_pixel]( m->p_fenc, m->i_stride, \
- &p_fref[(my)*m->i_stride+(mx)], m->i_stride ) + \
- m->lm * ( bs_size_se(((mx)<<2) - m->mvp[0] ) + \
- bs_size_se(((my)<<2) - m->mvp[1] ) ); \
+ &p_fref[(my)*m->i_stride+(mx)], m->i_stride ) \
+ + p_cost_mvx[ (mx)<<2 ] \
+ + p_cost_mvy[ (my)<<2 ]; \
if( cost < bcost ) \
{ \
bcost = cost; \
const int mv_x_max = h->mb.mv_max_fpel[0];
const int mv_y_max = h->mb.mv_max_fpel[1];
+ /* FIXME could theoretically run off the end of the prepared array of costs,
+ * if some mv predictors (the mvc candidates) are very far from mvp */
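+ /* offset by mvp so the tables are indexed with absolute qpel mvs: p_cost_mvx[mv] == p_cost_mv[mv - mvp[0]] */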
+ const int16_t *p_cost_mvx = m->p_cost_mv - m->mvp[0];
+ const int16_t *p_cost_mvy = m->p_cost_mv - m->mvp[1];
+
/* init with mvp */
/* XXX: We don't need to clamp because of the way the diamond search works: we will
 * never go outside the padded picture, and mv prediction won't compute a vector
m->mv[1] = bmy << 2;
/* compute the real cost */
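+ /* cache the lambda-weighted mv bits; the bi-prediction costs reuse them via m->cost_mv */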
+ m->cost_mv = p_cost_mvx[ m->mv[0] ] + p_cost_mvy[ m->mv[1] ];
m->cost = h->pixf.satd[i_pixel]( m->p_fenc, m->i_stride,
- &p_fref[bmy * m->i_stride + bmx], m->i_stride ) +
- m->lm * ( bs_size_se( m->mv[0] - m->mvp[0] ) +
- bs_size_se( m->mv[1] - m->mvp[1] ) );
+ &p_fref[bmy * m->i_stride + bmx], m->i_stride )
+ + m->cost_mv;
/* subpel refine */
if( h->param.analyse.i_subpel_refine >= 3 )
{
const int bw = x264_pixel_size[m->i_pixel].w;
const int bh = x264_pixel_size[m->i_pixel].h;
+ const int16_t *p_cost_mvx = m->p_cost_mv - m->mvp[0];
+ const int16_t *p_cost_mvy = m->p_cost_mv - m->mvp[1];
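+ /* same mvp offset as in the fullpel search: costs are looked up with absolute qpel coordinates */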
DECLARE_ALIGNED( uint8_t, pix[4][16*16], 16 );
uint8_t * src[4];
src[3] = h->mc.get_ref( m->p_fref, m->i_stride, pix[3], &stride[3], bmx + step, bmy + 0, bw, bh );
cost[0] = h->pixf.satd[m->i_pixel]( m->p_fenc, m->i_stride, src[0], stride[0] ) +
- m->lm * ( bs_size_se( bmx + 0 - m->mvp[0] ) + bs_size_se( bmy - step - m->mvp[1] ) );
+ p_cost_mvx[ bmx + 0 ] + p_cost_mvy[ bmy - step ];
cost[1] = h->pixf.satd[m->i_pixel]( m->p_fenc, m->i_stride, src[1], stride[1] ) +
- m->lm * ( bs_size_se( bmx + 0 - m->mvp[0] ) + bs_size_se( bmy + step - m->mvp[1] ) );
+ p_cost_mvx[ bmx + 0 ] + p_cost_mvy[ bmy + step ];
cost[2] = h->pixf.satd[m->i_pixel]( m->p_fenc, m->i_stride, src[2], stride[2] ) +
- m->lm * ( bs_size_se( bmx - step - m->mvp[0] ) + bs_size_se( bmy + 0 - m->mvp[1] ) );
+ p_cost_mvx[ bmx - step ] + p_cost_mvy[ bmy + 0 ];
cost[3] = h->pixf.satd[m->i_pixel]( m->p_fenc, m->i_stride, src[3], stride[3] ) +
- m->lm * ( bs_size_se( bmx + step - m->mvp[0] ) + bs_size_se( bmy + 0 - m->mvp[1] ) );
+ p_cost_mvx[ bmx + step ] + p_cost_mvy[ bmy + 0 ];
best = 0;
if( cost[1] < cost[0] ) best = 1;
m->mv[0] = bmx;
m->mv[1] = bmy;
+ m->cost_mv = p_cost_mvx[ bmx ] + p_cost_mvy[ bmy ];
}