From b1eac26510d0532ae9202249767e5f3ba22443ef Mon Sep 17 00:00:00 2001 From: Fiona Glaser Date: Sun, 13 Sep 2009 01:02:37 -0700 Subject: [PATCH] Make MV costs global instead of static Fixes some extremely rare threading race conditions and makes the code cleaner. Downside: slightly higher memory usage when calling multiple encoders from the same application. --- common/common.h | 8 ++++ common/osdep.h | 2 + encoder/analyse.c | 92 +++++++++++++++++++++++-------------------- encoder/analyse.h | 5 ++- encoder/encoder.c | 20 +++++----- encoder/me.c | 24 +++++------ encoder/me.h | 2 +- encoder/ratecontrol.c | 2 + encoder/slicetype.c | 14 ++----- x264.c | 5 ++- 10 files changed, 94 insertions(+), 80 deletions(-) diff --git a/common/common.h b/common/common.h index 81c7b003..6271bc59 100644 --- a/common/common.h +++ b/common/common.h @@ -52,6 +52,8 @@ do {\ #define X264_THREAD_MAX 128 #define X264_PCM_COST (386*8) #define X264_LOOKAHEAD_MAX 250 +// arbitrary, but low because SATD scores are 1/4 normal +#define X264_LOOKAHEAD_QP 12 // number of pixels (per thread) in progress at any given time. // 16 for the macroblock in progress + 3 for deblocking + 3 for motion compensation filter + 2 for extra safety @@ -347,6 +349,12 @@ struct x264_t uint16_t (*quant4_bias[4])[16]; /* [4][52][16] */ uint16_t (*quant8_bias[2])[64]; /* [2][52][64] */ + /* mv/ref cost arrays. Indexed by lambda instead of + * qp because, due to rounding, some quantizers share + * lambdas. This saves memory. */ + uint16_t *cost_mv[92]; + uint16_t *cost_mv_fpel[92][4]; + const uint8_t *chroma_qp_table; /* includes both the nonlinear luma->chroma mapping and chroma_qp_offset */ ALIGNED_16( uint32_t nr_residual_sum[2][64] ); diff --git a/common/osdep.h b/common/osdep.h index cb61b14b..71780a24 100644 --- a/common/osdep.h +++ b/common/osdep.h @@ -140,6 +140,7 @@ static inline int x264_pthread_create( x264_pthread_t *t, void *a, void *(*f)(vo #define x264_pthread_attr_t pthread_attr_t #define x264_pthread_attr_init pthread_attr_init #define x264_pthread_attr_destroy pthread_attr_destroy +#define X264_PTHREAD_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER #else #define x264_pthread_mutex_t int #define x264_pthread_mutex_init(m,f) 0 @@ -154,6 +155,7 @@ static inline int x264_pthread_create( x264_pthread_t *t, void *a, void *(*f)(vo #define x264_pthread_attr_t int #define x264_pthread_attr_init(a) 0 #define x264_pthread_attr_destroy(a) +#define X264_PTHREAD_MUTEX_INITIALIZER 0 #endif #define WORD_SIZE sizeof(void*) diff --git a/encoder/analyse.c b/encoder/analyse.c index 68d0e948..70f8d0f5 100644 --- a/encoder/analyse.c +++ b/encoder/analyse.c @@ -77,7 +77,7 @@ typedef struct int i_lambda; int i_lambda2; int i_qp; - int16_t *p_cost_mv; + uint16_t *p_cost_mv; uint16_t *p_cost_ref0; uint16_t *p_cost_ref1; int i_mbrd; @@ -237,46 +237,36 @@ static const int i_sub_mb_p_cost_table[4] = { static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a ); -/* Indexed by lambda instead of qp because, due to rounding, - * some quantizers share lambdas. This saves memory. */ -uint16_t *x264_cost_mv_fpel[92][4]; -uint16_t x264_cost_ref[92][3][33]; +static uint16_t x264_cost_ref[92][3][33]; +static x264_pthread_mutex_t cost_ref_mutex = X264_PTHREAD_MUTEX_INITIALIZER; -/* initialize an array of lambda*nbits for all possible mvs */ -static int x264_mb_analyse_load_costs( x264_t *h, x264_mb_analysis_t *a ) +int x264_analyse_init_costs( x264_t *h, int qp ) { - static int16_t *p_cost_mv[92]; int i, j; - - if( !p_cost_mv[a->i_lambda] ) - { - x264_emms(); - /* could be faster, but isn't called many times */ - /* factor of 4 from qpel, 2 from sign, and 2 because mv can be opposite from mvp */ - CHECKED_MALLOC( p_cost_mv[a->i_lambda], (4*4*2048 + 1) * sizeof(int16_t) ); - p_cost_mv[a->i_lambda] += 2*4*2048; - for( i = 0; i <= 2*4*2048; i++ ) - { - p_cost_mv[a->i_lambda][-i] = - p_cost_mv[a->i_lambda][i] = a->i_lambda * (log2f(i+1)*2 + 0.718f + !!i) + .5f; - } - for( i = 0; i < 3; i++ ) - for( j = 0; j < 33; j++ ) - x264_cost_ref[a->i_lambda][i][j] = i ? a->i_lambda * bs_size_te( i, j ) : 0; - } - a->p_cost_mv = p_cost_mv[a->i_lambda]; - a->p_cost_ref0 = x264_cost_ref[a->i_lambda][x264_clip3(h->sh.i_num_ref_idx_l0_active-1,0,2)]; - a->p_cost_ref1 = x264_cost_ref[a->i_lambda][x264_clip3(h->sh.i_num_ref_idx_l1_active-1,0,2)]; - - /* FIXME is this useful for all me methods? */ - if( h->param.analyse.i_me_method >= X264_ME_ESA && !x264_cost_mv_fpel[a->i_lambda][0] ) + int lambda = x264_lambda_tab[qp]; + if( h->cost_mv[lambda] ) + return 0; + /* factor of 4 from qpel, 2 from sign, and 2 because mv can be opposite from mvp */ + CHECKED_MALLOC( h->cost_mv[lambda], (4*4*2048 + 1) * sizeof(uint16_t) ); + h->cost_mv[lambda] += 2*4*2048; + for( i = 0; i <= 2*4*2048; i++ ) + { + h->cost_mv[lambda][-i] = + h->cost_mv[lambda][i] = lambda * (log2f(i+1)*2 + 0.718f + !!i) + .5f; + } + x264_pthread_mutex_lock( &cost_ref_mutex ); + for( i = 0; i < 3; i++ ) + for( j = 0; j < 33; j++ ) + x264_cost_ref[lambda][i][j] = i ? lambda * bs_size_te( i, j ) : 0; + x264_pthread_mutex_unlock( &cost_ref_mutex ); + if( h->param.analyse.i_me_method >= X264_ME_ESA && !h->cost_mv_fpel[lambda][0] ) { for( j=0; j<4; j++ ) { - CHECKED_MALLOC( x264_cost_mv_fpel[a->i_lambda][j], (4*2048 + 1) * sizeof(int16_t) ); - x264_cost_mv_fpel[a->i_lambda][j] += 2*2048; + CHECKED_MALLOC( h->cost_mv_fpel[lambda][j], (4*2048 + 1) * sizeof(uint16_t) ); + h->cost_mv_fpel[lambda][j] += 2*2048; for( i = -2*2048; i < 2*2048; i++ ) - x264_cost_mv_fpel[a->i_lambda][j][i] = p_cost_mv[a->i_lambda][i*4+j]; + h->cost_mv_fpel[lambda][j][i] = h->cost_mv[lambda][i*4+j]; } } return 0; @@ -284,6 +274,27 @@ fail: return -1; } +void x264_analyse_free_costs( x264_t *h ) +{ + int i, j; + for( i = 0; i < 92; i++ ) + { + if( h->cost_mv[i] ) + x264_free( h->cost_mv[i] - 2*4*2048 ); + if( h->cost_mv_fpel[i][0] ) + for( j = 0; j < 4; j++ ) + x264_free( h->cost_mv_fpel[i][j] - 2*2048 ); + } +} + +/* initialize an array of lambda*nbits for all possible mvs */ +static void x264_mb_analyse_load_costs( x264_t *h, x264_mb_analysis_t *a ) +{ + a->p_cost_mv = h->cost_mv[a->i_lambda]; + a->p_cost_ref0 = x264_cost_ref[a->i_lambda][x264_clip3(h->sh.i_num_ref_idx_l0_active-1,0,2)]; + a->p_cost_ref1 = x264_cost_ref[a->i_lambda][x264_clip3(h->sh.i_num_ref_idx_l1_active-1,0,2)]; +} + static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp ) { int i = h->param.analyse.i_subpel_refine - (h->sh.i_type == SLICE_TYPE_B); @@ -2317,7 +2328,7 @@ static inline void x264_mb_analyse_qp_rd( x264_t *h, x264_mb_analysis_t *a ) /***************************************************************************** * x264_macroblock_analyse: *****************************************************************************/ -int x264_macroblock_analyse( x264_t *h ) +void x264_macroblock_analyse( x264_t *h ) { x264_mb_analysis_t analysis; int i_cost = COST_MAX; @@ -2392,13 +2403,12 @@ int x264_macroblock_analyse( x264_t *h ) int i_thresh16x8; int i_satd_inter, i_satd_intra; - if( x264_mb_analyse_load_costs( h, &analysis ) ) - return -1; + x264_mb_analyse_load_costs( h, &analysis ); x264_mb_analyse_inter_p16x16( h, &analysis ); if( h->mb.i_type == P_SKIP ) - return 0; + return; if( flags & X264_ANALYSE_PSUB16x16 ) { @@ -2686,8 +2696,7 @@ int x264_macroblock_analyse( x264_t *h ) int i_satd_inter; h->mb.b_skip_mc = 0; - if( x264_mb_analyse_load_costs( h, &analysis ) ) - return -1; + x264_mb_analyse_load_costs( h, &analysis ); /* select best inter mode */ /* direct must be first */ @@ -2713,7 +2722,7 @@ int x264_macroblock_analyse( x264_t *h ) { h->mb.i_type = B_SKIP; x264_analyse_update_cache( h, &analysis ); - return 0; + return; } } @@ -2945,7 +2954,6 @@ int x264_macroblock_analyse( x264_t *h ) x264_psy_trellis_init( h, 0 ); if( h->mb.b_trellis == 1 || h->mb.b_noise_reduction ) h->mb.i_skip_intra = 0; - return 0; } /*-------------------- Update MB from the analysis ----------------------*/ diff --git a/encoder/analyse.h b/encoder/analyse.h index 05aae40d..5342d04b 100644 --- a/encoder/analyse.h +++ b/encoder/analyse.h @@ -24,9 +24,10 @@ #ifndef X264_ANALYSE_H #define X264_ANALYSE_H -int x264_macroblock_analyse( x264_t *h ); +int x264_analyse_init_costs( x264_t *h, int qp ); +void x264_analyse_free_costs( x264_t *h ); +void x264_macroblock_analyse( x264_t *h ); void x264_slicetype_decide( x264_t *h ); -int x264_lowres_context_alloc( x264_t *h ); void x264_slicetype_analyse( x264_t *h, int keyframe ); diff --git a/encoder/encoder.c b/encoder/encoder.c index c6b33980..8e614a2b 100644 --- a/encoder/encoder.c +++ b/encoder/encoder.c @@ -748,7 +748,7 @@ x264_t *x264_encoder_open( x264_param_t *param ) { x264_t *h; char buf[1000], *p; - int i, i_slicetype_length; + int i, qp, i_slicetype_length; CHECKED_MALLOCZERO( h, sizeof(x264_t) ); @@ -869,6 +869,12 @@ x264_t *x264_encoder_open( x264_param_t *param ) p += sprintf( p, " none!" ); x264_log( h, X264_LOG_INFO, "%s\n", buf ); + for( qp = h->param.rc.i_qp_min; qp <= h->param.rc.i_qp_max; qp++ ) + if( x264_analyse_init_costs( h, qp ) ) + goto fail; + if( x264_analyse_init_costs( h, X264_LOOKAHEAD_QP ) ) + goto fail; + h->out.i_nal = 0; h->out.i_bitstream = X264_MAX( 1000000, h->param.i_width * h->param.i_height * 4 * ( h->param.rc.i_rc_method == X264_RC_ABR ? pow( 0.95, h->param.rc.i_qp_min ) @@ -900,9 +906,6 @@ x264_t *x264_encoder_open( x264_param_t *param ) if( x264_ratecontrol_new( h ) < 0 ) goto fail; - if( x264_lowres_context_alloc( h ) ) - goto fail; - if( h->param.psz_dump_yuv ) { /* create or truncate the reconstructed video file */ @@ -1332,12 +1335,7 @@ static int x264_slice_write( x264_t *h ) /* load cache */ x264_macroblock_cache_load( h, i_mb_x, i_mb_y ); - /* analyse parameters - * Slice I: choose I_4x4 or I_16x16 mode - * Slice P: choose between using P mode or intra (4x4 or 16x16) - * */ - if( x264_macroblock_analyse( h ) ) - return -1; + x264_macroblock_analyse( h ); /* encode this macroblock -> be careful it can change the mb type to P_SKIP if needed */ x264_macroblock_encode( h ); @@ -2230,6 +2228,8 @@ void x264_encoder_close ( x264_t *h ) x264_cqm_delete( h ); + x264_analyse_free_costs( h ); + if( h->param.i_threads > 1) h = h->thread[ h->i_thread_phase % h->param.i_threads ]; diff --git a/encoder/me.c b/encoder/me.c index eb0fd5a0..d7c71653 100644 --- a/encoder/me.c +++ b/encoder/me.c @@ -195,8 +195,8 @@ void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc, #define CHECK_MVRANGE(mx,my) ( mx >= mv_x_min && mx <= mv_x_max && my >= mv_y_min && my <= mv_y_max ) - const int16_t *p_cost_mvx = m->p_cost_mv - m->mvp[0]; - const int16_t *p_cost_mvy = m->p_cost_mv - m->mvp[1]; + const uint16_t *p_cost_mvx = m->p_cost_mv - m->mvp[0]; + const uint16_t *p_cost_mvy = m->p_cost_mv - m->mvp[1]; bmx = x264_clip3( m->mvp[0], mv_x_min*4, mv_x_max*4 ); bmy = x264_clip3( m->mvp[1], mv_y_min*4, mv_y_max*4 ); @@ -452,8 +452,8 @@ me_hex2: /* hexagon grid */ omx = bmx; omy = bmy; - const int16_t *p_cost_omvx = p_cost_mvx + omx*4; - const int16_t *p_cost_omvy = p_cost_mvy + omy*4; + const uint16_t *p_cost_omvx = p_cost_mvx + omx*4; + const uint16_t *p_cost_omvy = p_cost_mvy + omy*4; i = 1; do { @@ -569,7 +569,7 @@ me_hex2: int delta = x264_pixel_size[sad_size].w; int16_t *xs = h->scratch_buffer; int xn; - uint16_t *cost_fpel_mvx = x264_cost_mv_fpel[x264_lambda_tab[h->mb.i_qp]][-m->mvp[0]&3] + (-m->mvp[0]>>2); + uint16_t *cost_fpel_mvx = h->cost_mv_fpel[x264_lambda_tab[h->mb.i_qp]][-m->mvp[0]&3] + (-m->mvp[0]>>2); h->pixf.sad_x4[sad_size]( zero, p_fenc, p_fenc+delta, p_fenc+delta*FENC_STRIDE, p_fenc+delta+delta*FENC_STRIDE, @@ -768,8 +768,8 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite { const int bw = x264_pixel_size[m->i_pixel].w; const int bh = x264_pixel_size[m->i_pixel].h; - const int16_t *p_cost_mvx = m->p_cost_mv - m->mvp[0]; - const int16_t *p_cost_mvy = m->p_cost_mv - m->mvp[1]; + const uint16_t *p_cost_mvx = m->p_cost_mv - m->mvp[0]; + const uint16_t *p_cost_mvy = m->p_cost_mv - m->mvp[1]; const int i_pixel = m->i_pixel; const int b_chroma_me = h->mb.b_chroma_me && i_pixel <= PIXEL_8x8; @@ -942,10 +942,10 @@ static void ALWAYS_INLINE x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_m const int i_pixel = m0->i_pixel; const int bw = x264_pixel_size[i_pixel].w; const int bh = x264_pixel_size[i_pixel].h; - const int16_t *p_cost_m0x = m0->p_cost_mv - m0->mvp[0]; - const int16_t *p_cost_m0y = m0->p_cost_mv - m0->mvp[1]; - const int16_t *p_cost_m1x = m1->p_cost_mv - m1->mvp[0]; - const int16_t *p_cost_m1y = m1->p_cost_mv - m1->mvp[1]; + const uint16_t *p_cost_m0x = m0->p_cost_mv - m0->mvp[0]; + const uint16_t *p_cost_m0y = m0->p_cost_mv - m0->mvp[1]; + const uint16_t *p_cost_m1x = m1->p_cost_mv - m1->mvp[0]; + const uint16_t *p_cost_m1y = m1->p_cost_mv - m1->mvp[1]; ALIGNED_ARRAY_16( uint8_t, pixy_buf,[2],[9][16*16] ); ALIGNED_8( uint8_t pixu_buf[2][9][8*8] ); ALIGNED_8( uint8_t pixv_buf[2][9][8*8] ); @@ -1073,7 +1073,7 @@ void x264_me_refine_qpel_rd( x264_t *h, x264_me_t *m, int i_lambda2, int i4, int static const int pixel_mv_offs[] = { 0, 4, 4*8, 0, 2, 2*8, 0 }; int16_t *cache_mv = h->mb.cache.mv[i_list][x264_scan8[i4]]; int16_t *cache_mv2 = cache_mv + pixel_mv_offs[m->i_pixel]; - const int16_t *p_cost_mvx, *p_cost_mvy; + const uint16_t *p_cost_mvx, *p_cost_mvy; const int bw = x264_pixel_size[m->i_pixel].w>>2; const int bh = x264_pixel_size[m->i_pixel].h>>2; const int i_pixel = m->i_pixel; diff --git a/encoder/me.h b/encoder/me.h index 8bdee2e5..0122b8b7 100644 --- a/encoder/me.h +++ b/encoder/me.h @@ -31,7 +31,7 @@ typedef struct { /* input */ int i_pixel; /* PIXEL_WxH */ - int16_t *p_cost_mv; /* lambda * nbits for each possible mv */ + uint16_t *p_cost_mv; /* lambda * nbits for each possible mv */ int i_ref_cost; int i_ref; diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c index cb7fd3b8..b11c7c43 100644 --- a/encoder/ratecontrol.c +++ b/encoder/ratecontrol.c @@ -953,6 +953,8 @@ void x264_ratecontrol_start( x264_t *h, int i_force_qp ) } } + q = x264_clip3f( q, h->param.rc.i_qp_min, h->param.rc.i_qp_max ); + rc->qpa_rc = rc->qpa_aq = 0; h->fdec->f_qp_avg_rc = diff --git a/encoder/slicetype.c b/encoder/slicetype.c index 56b56e1b..985dfd65 100644 --- a/encoder/slicetype.c +++ b/encoder/slicetype.c @@ -29,22 +29,14 @@ #include "me.h" -static int x264_lowres_context_init( x264_t *h, x264_mb_analysis_t *a ) +static void x264_lowres_context_init( x264_t *h, x264_mb_analysis_t *a ) { - a->i_qp = 12; // arbitrary, but low because SATD scores are 1/4 normal + a->i_qp = X264_LOOKAHEAD_QP; a->i_lambda = x264_lambda_tab[ a->i_qp ]; - if( x264_mb_analyse_load_costs( h, a ) ) - return -1; + x264_mb_analyse_load_costs( h, a ); h->mb.i_me_method = X264_MIN( X264_ME_HEX, h->param.analyse.i_me_method ); // maybe dia? h->mb.i_subpel_refine = 4; // 3 should be enough, but not tweaking for speed now h->mb.b_chroma_me = 0; - return 0; -} - -int x264_lowres_context_alloc( x264_t *h ) -{ - x264_mb_analysis_t a; - return x264_lowres_context_init( h, &a ); } static int x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a, diff --git a/x264.c b/x264.c index 0fdde7d8..3d43eb55 100644 --- a/x264.c +++ b/x264.c @@ -262,9 +262,10 @@ static void Help( x264_param_t *defaults, int longhelp ) " where