From d2a9d25429b6843874865a37a5b4f6b401d89abc Mon Sep 17 00:00:00 2001
From: Fiona Glaser
Date: Thu, 10 Jan 2013 13:15:52 -0800
Subject: [PATCH] Improve lookahead-threads auto selection

Smarter decision to improve fast-first-pass performance in 2-pass encodes.
Dramatically improves CPU utilization on multi-core systems.

Tested on a quad-core Ivy Bridge (12 threads, 1080p):

Fast first pass:
veryfast: ~7% faster
faster: ~11% faster
fast/medium: ~15% faster
slow/slower: ~42% faster
veryslow: ~55% faster

CRF/1-pass:
veryfast: ~9% faster
(all others remained the same)
---
 encoder/encoder.c | 32 +++++++++++++++++++++++++++++---
 1 file changed, 29 insertions(+), 3 deletions(-)

diff --git a/encoder/encoder.c b/encoder/encoder.c
index 49896115..4f3d555b 100644
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -503,8 +503,6 @@ static int x264_validate_parameters( x264_t *h, int b_open )
 
     if( h->param.i_threads == X264_THREADS_AUTO )
         h->param.i_threads = x264_cpu_num_processors() * (h->param.b_sliced_threads?2:3)/2;
-    if( h->param.i_lookahead_threads == X264_THREADS_AUTO )
-        h->param.i_lookahead_threads = h->param.i_threads / (h->param.b_sliced_threads?1:6);
     int max_sliced_threads = X264_MAX( 1, (h->param.i_height+15)/16 / 4 );
     if( h->param.i_threads > 1 )
     {
@@ -518,7 +516,6 @@ static int x264_validate_parameters( x264_t *h, int b_open )
             h->param.i_threads = X264_MIN( h->param.i_threads, max_sliced_threads );
     }
     h->param.i_threads = x264_clip3( h->param.i_threads, 1, X264_THREAD_MAX );
-    h->param.i_lookahead_threads = x264_clip3( h->param.i_lookahead_threads, 1, X264_MIN( max_sliced_threads, X264_LOOKAHEAD_THREAD_MAX ) );
     if( h->param.i_threads == 1 )
     {
         h->param.b_sliced_threads = 0;
@@ -895,6 +892,35 @@ static int x264_validate_parameters( x264_t *h, int b_open )
 
     h->param.analyse.i_weighted_pred = x264_clip3( h->param.analyse.i_weighted_pred, X264_WEIGHTP_NONE, X264_WEIGHTP_SMART );
 
+    if( h->param.i_lookahead_threads == X264_THREADS_AUTO )
+    {
+        if( h->param.b_sliced_threads )
+            h->param.i_lookahead_threads = h->param.i_threads;
+        else
+        {
+            /* If we're using much slower lookahead settings than encoding settings, it helps a lot to use
+             * more lookahead threads.  This typically happens in the first pass of a two-pass encode, so
+             * try to guess at this sort of case.
+             *
+             * Tuned by a little bit of real encoding with the various presets. */
+            int badapt = h->param.i_bframe_adaptive == X264_B_ADAPT_TRELLIS;
+            int subme = X264_MIN( h->param.analyse.i_subpel_refine / 3, 3 ) + (h->param.analyse.i_subpel_refine > 1);
+            int bframes = X264_MIN( (h->param.i_bframe - 1) / 3, 3 );
+
+            /* [b-adapt 0/1 vs 2][quantized subme][quantized bframes] */
+            static const uint8_t lookahead_thread_div[2][5][4] =
+            {{{6,6,6,6}, {3,3,3,3}, {4,4,4,4}, {6,6,6,6}, {12,12,12,12}},
+             {{3,2,1,1}, {2,1,1,1}, {4,3,2,1}, {6,4,3,2}, {12, 9, 6, 4}}};
+
+            h->param.i_lookahead_threads = h->param.i_threads / lookahead_thread_div[badapt][subme][bframes];
+            /* Since too many lookahead threads significantly degrades lookahead accuracy, limit auto
+             * lookahead threads to about 8 macroblock rows high each at worst.  This number is chosen
+             * pretty much arbitrarily. */
+            h->param.i_lookahead_threads = X264_MIN( h->param.i_lookahead_threads, h->param.i_height / 128 );
+        }
+    }
+    h->param.i_lookahead_threads = x264_clip3( h->param.i_lookahead_threads, 1, X264_MIN( max_sliced_threads, X264_LOOKAHEAD_THREAD_MAX ) );
+
     if( PARAM_INTERLACED )
     {
         if( h->param.analyse.i_me_method >= X264_ME_ESA )
-- 
2.40.0