From 6940dcaef140d8a0c43c9a62db158e9d71a8fdeb Mon Sep 17 00:00:00 2001 From: Steven Walters Date: Tue, 1 Sep 2009 18:46:51 -0700 Subject: [PATCH] Threaded lookahead Move lookahead into a separate thread, set to higher priority than the other threads, for optimal performance. Reduces the amount that lookahead bottlenecks encoding, greatly increasing performance with lookahead-intensive settings (e.g. b-adapt 2) on many-core CPUs. Buffer size can be controlled with --sync-lookahead, which defaults to auto (threads+bframes buffer size). Note that this buffer is separate from the rc-lookahead value. Note also that this does not split lookahead itself into multiple threads yet; this may be added in the future. Additionally, split frames into "fdec" and "fenc" frame types and keep the two separate. This split greatly reduces memory usage, which helps compensate for the larger lookahead size. Extremely special thanks to Michael Kazmier and Alex Giladi of Avail Media, the original authors of this patch. 
--- Makefile | 2 +- common/common.c | 8 ++ common/common.h | 26 ++-- common/cpu.h | 8 +- common/frame.c | 202 +++++++++++++++++++----------- common/frame.h | 25 +++- common/macroblock.c | 2 +- common/osdep.h | 13 ++ common/x86/cpu-a.asm | 4 +- encoder/analyse.h | 8 ++ encoder/encoder.c | 127 ++++++++----------- encoder/lookahead.c | 278 ++++++++++++++++++++++++++++++++++++++++++ encoder/ratecontrol.c | 8 +- encoder/slicetype.c | 107 ++++++++-------- x264.c | 4 +- x264.h | 4 +- 16 files changed, 598 insertions(+), 228 deletions(-) create mode 100644 encoder/lookahead.c diff --git a/Makefile b/Makefile index 0f34736e..04d639e8 100644 --- a/Makefile +++ b/Makefile @@ -10,7 +10,7 @@ SRCS = common/mc.c common/predict.c common/pixel.c common/macroblock.c \ common/quant.c common/vlc.c \ encoder/analyse.c encoder/me.c encoder/ratecontrol.c \ encoder/set.c encoder/macroblock.c encoder/cabac.c \ - encoder/cavlc.c encoder/encoder.c + encoder/cavlc.c encoder/encoder.c encoder/lookahead.c SRCCLI = x264.c matroska.c muxers.c diff --git a/common/common.c b/common/common.c index c0a56e3d..46173870 100644 --- a/common/common.c +++ b/common/common.c @@ -45,6 +45,7 @@ void x264_param_default( x264_param_t *param ) param->cpu = x264_cpu_detect(); param->i_threads = X264_THREADS_AUTO; param->b_deterministic = 1; + param->i_sync_lookahead = X264_SYNC_LOOKAHEAD_AUTO; /* Video properties */ param->i_csp = X264_CSP_I420; @@ -276,6 +277,13 @@ int x264_param_parse( x264_param_t *p, const char *name, const char *value ) else p->i_threads = atoi(value); } + OPT("sync-lookahead") + { + if( !strcmp(value, "auto") ) + p->i_sync_lookahead = X264_SYNC_LOOKAHEAD_AUTO; + else + p->i_sync_lookahead = atoi(value); + } OPT2("deterministic", "n-deterministic") p->b_deterministic = atobool(value); OPT2("level", "level-idc") diff --git a/common/common.h b/common/common.h index 574040d0..81c7b003 100644 --- a/common/common.h +++ b/common/common.h @@ -239,6 +239,19 @@ typedef struct } x264_slice_header_t; 
+typedef struct x264_lookahead_t +{ + uint8_t b_thread_active; + uint8_t b_exit_thread; + uint8_t b_analyse_keyframe; + int i_last_idr; + int i_slicetype_length; + x264_frame_t *last_nonb; + x264_synch_frame_list_t ifbuf; + x264_synch_frame_list_t next; + x264_synch_frame_list_t ofbuf; +} x264_lookahead_t; + /* From ffmpeg */ #define X264_SCAN8_SIZE (6*8) @@ -283,7 +296,7 @@ struct x264_t /* encoder parameters */ x264_param_t param; - x264_t *thread[X264_THREAD_MAX]; + x264_t *thread[X264_THREAD_MAX+1]; x264_pthread_t thread_handle; int b_thread_active; int i_thread_phase; /* which thread to use for the next frame */ @@ -349,13 +362,9 @@ struct x264_t struct { /* Frames to be encoded (whose types have been decided) */ - x264_frame_t *current[X264_LOOKAHEAD_MAX+3]; - /* Temporary buffer (frames types not yet decided) */ - x264_frame_t *next[X264_LOOKAHEAD_MAX+3]; - /* Unused frames */ - x264_frame_t *unused[X264_LOOKAHEAD_MAX + X264_THREAD_MAX*2 + 16+4]; - /* For adaptive B decision */ - x264_frame_t *last_nonb; + x264_frame_t **current; + /* Unused frames: 0 = fenc, 1 = fdec */ + x264_frame_t **unused[2]; /* frames used for reference + sentinels */ x264_frame_t *reference[16+2]; @@ -667,6 +676,7 @@ struct x264_t #if VISUALIZE struct visualize_t *visualize; #endif + x264_lookahead_t *lookahead; }; // included at the end because it needs x264_t diff --git a/common/cpu.h b/common/cpu.h index 4380a359..6901e1e1 100644 --- a/common/cpu.h +++ b/common/cpu.h @@ -33,12 +33,12 @@ void x264_cpu_mask_misalign_sse( void ); * gcc 4.2 introduced __attribute__((force_align_arg_pointer)) to fix this * problem, but I don't want to require such a new version. * This applies only to x86_32, since other architectures that need alignment - * also have ABIs that ensure aligned stack. */ + * either have ABIs that ensure aligned stack, or don't support it at all. 
*/ #if defined(ARCH_X86) && defined(HAVE_MMX) -int x264_stack_align( void (*func)(x264_t*), x264_t *arg ); -#define x264_stack_align(func,arg) x264_stack_align((void (*)(x264_t*))func,arg) +int x264_stack_align( void (*func)(), ... ); +#define x264_stack_align(func,...) x264_stack_align((void (*)())func, __VA_ARGS__) #else -#define x264_stack_align(func,arg) func(arg) +#define x264_stack_align(func,...) func(__VA_ARGS__) #endif typedef struct { diff --git a/common/frame.c b/common/frame.c index 99052602..001c4fd9 100644 --- a/common/frame.c +++ b/common/frame.c @@ -26,7 +26,7 @@ #define ALIGN(x,a) (((x)+((a)-1))&~((a)-1)) -x264_frame_t *x264_frame_new( x264_t *h ) +x264_frame_t *x264_frame_new( x264_t *h, int b_fdec ) { x264_frame_t *frame; int i, j; @@ -60,9 +60,23 @@ x264_frame_t *x264_frame_new( x264_t *h ) CHECKED_MALLOC( frame->buffer[i], chroma_plane_size ); frame->plane[i] = frame->buffer[i] + (frame->i_stride[i] * i_padv + PADH)/2; } + + for( i = 0; i < h->param.i_bframe + 2; i++ ) + for( j = 0; j < h->param.i_bframe + 2; j++ ) + CHECKED_MALLOC( frame->i_row_satds[i][j], i_lines/16 * sizeof(int) ); + + frame->i_poc = -1; + frame->i_type = X264_TYPE_AUTO; + frame->i_qpplus1 = 0; + frame->i_pts = -1; + frame->i_frame = -1; + frame->i_frame_num = -1; + frame->i_lines_completed = -1; + frame->b_fdec = b_fdec; + /* all 4 luma planes allocated together, since the cacheline split code * requires them to be in-phase wrt cacheline alignment. 
*/ - if( h->param.analyse.i_subpel_refine ) + if( h->param.analyse.i_subpel_refine && b_fdec ) { CHECKED_MALLOC( frame->buffer[0], 4*luma_plane_size); for( i = 0; i < 4; i++ ) @@ -75,77 +89,68 @@ x264_frame_t *x264_frame_new( x264_t *h ) frame->plane[0] = frame->buffer[0] + frame->i_stride[0] * i_padv + PADH; } - if( h->frames.b_have_lowres ) + if( b_fdec ) /* fdec frame */ { - frame->i_width_lowres = frame->i_width[0]/2; - frame->i_stride_lowres = ALIGN( frame->i_width_lowres + 2*PADH, align ); - frame->i_lines_lowres = frame->i_lines[0]/2; - - luma_plane_size = frame->i_stride_lowres * ( frame->i_lines[0]/2 + 2*i_padv ); - - CHECKED_MALLOC( frame->buffer_lowres[0], 4 * luma_plane_size ); - for( i = 0; i < 4; i++ ) - frame->lowres[i] = frame->buffer_lowres[0] + (frame->i_stride_lowres * i_padv + PADH) + i * luma_plane_size; - - for( j = 0; j <= !!h->param.i_bframe; j++ ) - for( i = 0; i <= h->param.i_bframe; i++ ) - { - CHECKED_MALLOCZERO( frame->lowres_mvs[j][i], 2*h->mb.i_mb_count*sizeof(int16_t) ); - CHECKED_MALLOC( frame->lowres_mv_costs[j][i], h->mb.i_mb_count*sizeof(int) ); - } - CHECKED_MALLOC( frame->i_propagate_cost, (i_mb_count+3) * sizeof(uint16_t) ); - for( j = 0; j <= h->param.i_bframe+1; j++ ) - for( i = 0; i <= h->param.i_bframe+1; i++ ) - { - CHECKED_MALLOC( frame->lowres_costs[j][i], (i_mb_count+3) * sizeof(uint16_t) ); - CHECKED_MALLOC( frame->lowres_inter_types[j][i], (i_mb_count+3)/4 * sizeof(uint8_t) ); - } - frame->i_intra_cost = frame->lowres_costs[0][0]; - memset( frame->i_intra_cost, -1, (i_mb_count+3) * sizeof(uint16_t) ); + CHECKED_MALLOC( frame->mb_type, i_mb_count * sizeof(int8_t)); + CHECKED_MALLOC( frame->mv[0], 2*16 * i_mb_count * sizeof(int16_t) ); + CHECKED_MALLOC( frame->ref[0], 4 * i_mb_count * sizeof(int8_t) ); + if( h->param.i_bframe ) + { + CHECKED_MALLOC( frame->mv[1], 2*16 * i_mb_count * sizeof(int16_t) ); + CHECKED_MALLOC( frame->ref[1], 4 * i_mb_count * sizeof(int8_t) ); + } + else + { + frame->mv[1] = NULL; + 
frame->ref[1] = NULL; + } + CHECKED_MALLOC( frame->i_row_bits, i_lines/16 * sizeof(int) ); + CHECKED_MALLOC( frame->i_row_qp, i_lines/16 * sizeof(int) ); + if( h->param.analyse.i_me_method >= X264_ME_ESA ) + { + CHECKED_MALLOC( frame->buffer[3], + frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv) * sizeof(uint16_t) << h->frames.b_have_sub8x8_esa ); + frame->integral = (uint16_t*)frame->buffer[3] + frame->i_stride[0] * i_padv + PADH; + } } - - if( h->param.analyse.i_me_method >= X264_ME_ESA ) + else /* fenc frame */ { - CHECKED_MALLOC( frame->buffer[3], - frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv) * sizeof(uint16_t) << h->frames.b_have_sub8x8_esa ); - frame->integral = (uint16_t*)frame->buffer[3] + frame->i_stride[0] * i_padv + PADH; - } - - frame->i_poc = -1; - frame->i_type = X264_TYPE_AUTO; - frame->i_qpplus1 = 0; - frame->i_pts = -1; - frame->i_frame = -1; - frame->i_frame_num = -1; - frame->i_lines_completed = -1; + if( h->frames.b_have_lowres ) + { + frame->i_width_lowres = frame->i_width[0]/2; + frame->i_stride_lowres = ALIGN( frame->i_width_lowres + 2*PADH, align ); + frame->i_lines_lowres = frame->i_lines[0]/2; - CHECKED_MALLOC( frame->mb_type, i_mb_count * sizeof(int8_t)); - CHECKED_MALLOC( frame->mv[0], 2*16 * i_mb_count * sizeof(int16_t) ); - CHECKED_MALLOC( frame->ref[0], 4 * i_mb_count * sizeof(int8_t) ); - if( h->param.i_bframe ) - { - CHECKED_MALLOC( frame->mv[1], 2*16 * i_mb_count * sizeof(int16_t) ); - CHECKED_MALLOC( frame->ref[1], 4 * i_mb_count * sizeof(int8_t) ); - } - else - { - frame->mv[1] = NULL; - frame->ref[1] = NULL; - } + luma_plane_size = frame->i_stride_lowres * ( frame->i_lines[0]/2 + 2*i_padv ); - CHECKED_MALLOC( frame->i_row_bits, i_lines/16 * sizeof(int) ); - CHECKED_MALLOC( frame->i_row_qp, i_lines/16 * sizeof(int) ); - for( i = 0; i < h->param.i_bframe + 2; i++ ) - for( j = 0; j < h->param.i_bframe + 2; j++ ) - CHECKED_MALLOC( frame->i_row_satds[i][j], i_lines/16 * sizeof(int) ); + CHECKED_MALLOC( 
frame->buffer_lowres[0], 4 * luma_plane_size ); + for( i = 0; i < 4; i++ ) + frame->lowres[i] = frame->buffer_lowres[0] + (frame->i_stride_lowres * i_padv + PADH) + i * luma_plane_size; - if( h->param.rc.i_aq_mode ) - { - CHECKED_MALLOC( frame->f_qp_offset, h->mb.i_mb_count * sizeof(float) ); - CHECKED_MALLOC( frame->f_qp_offset_aq, h->mb.i_mb_count * sizeof(float) ); - if( h->frames.b_have_lowres ) - /* shouldn't really be initialized, just silences a valgrind false-positive in x264_mbtree_propagate_cost_sse2 */ - CHECKED_MALLOCZERO( frame->i_inv_qscale_factor, (h->mb.i_mb_count+3) * sizeof(uint16_t) ); + for( j = 0; j <= !!h->param.i_bframe; j++ ) + for( i = 0; i <= h->param.i_bframe; i++ ) + { + CHECKED_MALLOCZERO( frame->lowres_mvs[j][i], 2*h->mb.i_mb_count*sizeof(int16_t) ); + CHECKED_MALLOC( frame->lowres_mv_costs[j][i], h->mb.i_mb_count*sizeof(int) ); + } + CHECKED_MALLOC( frame->i_propagate_cost, (i_mb_count+3) * sizeof(uint16_t) ); + for( j = 0; j <= h->param.i_bframe+1; j++ ) + for( i = 0; i <= h->param.i_bframe+1; i++ ) + { + CHECKED_MALLOC( frame->lowres_costs[j][i], (i_mb_count+3) * sizeof(uint16_t) ); + CHECKED_MALLOC( frame->lowres_inter_types[j][i], (i_mb_count+3)/4 * sizeof(uint8_t) ); + } + frame->i_intra_cost = frame->lowres_costs[0][0]; + memset( frame->i_intra_cost, -1, (i_mb_count+3) * sizeof(uint16_t) ); + } + if( h->param.rc.i_aq_mode ) + { + CHECKED_MALLOC( frame->f_qp_offset, h->mb.i_mb_count * sizeof(float) ); + CHECKED_MALLOC( frame->f_qp_offset_aq, h->mb.i_mb_count * sizeof(float) ); + if( h->frames.b_have_lowres ) + /* shouldn't really be initialized, just silences a valgrind false-positive in x264_mbtree_propagate_cost_sse2 */ + CHECKED_MALLOCZERO( frame->i_inv_qscale_factor, (h->mb.i_mb_count+3) * sizeof(uint16_t) ); + } } if( x264_pthread_mutex_init( &frame->mutex, NULL ) ) @@ -971,19 +976,19 @@ void x264_frame_push_unused( x264_t *h, x264_frame_t *frame ) assert( frame->i_reference_count > 0 ); frame->i_reference_count--; if( 
frame->i_reference_count == 0 ) - x264_frame_push( h->frames.unused, frame ); - assert( h->frames.unused[ sizeof(h->frames.unused) / sizeof(*h->frames.unused) - 1 ] == NULL ); + x264_frame_push( h->frames.unused[frame->b_fdec], frame ); } -x264_frame_t *x264_frame_pop_unused( x264_t *h ) +x264_frame_t *x264_frame_pop_unused( x264_t *h, int b_fdec ) { x264_frame_t *frame; - if( h->frames.unused[0] ) - frame = x264_frame_pop( h->frames.unused ); + if( h->frames.unused[b_fdec][0] ) + frame = x264_frame_pop( h->frames.unused[b_fdec] ); else - frame = x264_frame_new( h ); + frame = x264_frame_new( h, b_fdec ); if( !frame ) return NULL; + frame->b_last_minigop_bframe = 0; frame->i_reference_count = 1; frame->b_intra_calculated = 0; return frame; @@ -1008,3 +1013,54 @@ void x264_frame_sort( x264_frame_t **list, int b_dts ) } } while( !b_ok ); } + +void x264_frame_delete_list( x264_frame_t **list ) +{ + int i = 0; + while( list[i] ) + x264_frame_delete( list[i++] ); + x264_free( list ); +} + +int x264_synch_frame_list_init( x264_synch_frame_list_t *slist, int max_size ) +{ + if( max_size < 0 ) + return -1; + slist->i_max_size = max_size; + slist->i_size = 0; + CHECKED_MALLOCZERO( slist->list, (max_size+1) * sizeof(x264_frame_t*) ); + if( x264_pthread_mutex_init( &slist->mutex, NULL ) || + x264_pthread_cond_init( &slist->cv_fill, NULL ) || + x264_pthread_cond_init( &slist->cv_empty, NULL ) ) + return -1; + return 0; +fail: + return -1; +} + +void x264_synch_frame_list_delete( x264_synch_frame_list_t *slist ) +{ + x264_pthread_mutex_destroy( &slist->mutex ); + x264_pthread_cond_destroy( &slist->cv_fill ); + x264_pthread_cond_destroy( &slist->cv_empty ); + x264_frame_delete_list( slist->list ); +} + +void x264_synch_frame_list_push( x264_synch_frame_list_t *slist, x264_frame_t *frame ) +{ + x264_pthread_mutex_lock( &slist->mutex ); + while( slist->i_size == slist->i_max_size ) + x264_pthread_cond_wait( &slist->cv_empty, &slist->mutex ); + slist->list[ slist->i_size++ ] = 
frame; + x264_pthread_mutex_unlock( &slist->mutex ); + x264_pthread_cond_broadcast( &slist->cv_fill ); +} + +int x264_synch_frame_list_get_size( x264_synch_frame_list_t *slist ) +{ + int size; + x264_pthread_mutex_lock( &slist->mutex ); + size = slist->i_size; + x264_pthread_mutex_unlock( &slist->mutex ); + return size; +} diff --git a/common/frame.h b/common/frame.h index 9ca83f93..f6faa12b 100644 --- a/common/frame.h +++ b/common/frame.h @@ -40,6 +40,9 @@ typedef struct int i_frame; /* Presentation frame number */ int i_frame_num; /* Coded frame number */ int b_kept_as_ref; + uint8_t b_fdec; + uint8_t b_last_minigop_bframe; /* this frame is the last b in a sequence of bframes */ + uint8_t i_bframes; /* number of bframes following this nonb in coded order */ float f_qp_avg_rc; /* QPs as decided by ratecontrol */ float f_qp_avg_aq; /* QPs as decided by AQ in addition to ratecontrol */ @@ -104,6 +107,17 @@ typedef struct } x264_frame_t; +/* synchronized frame list */ +typedef struct +{ + x264_frame_t **list; + int i_max_size; + int i_size; + x264_pthread_mutex_t mutex; + x264_pthread_cond_t cv_fill; /* event signaling that the list became fuller */ + x264_pthread_cond_t cv_empty; /* event signaling that the list became emptier */ +} x264_synch_frame_list_t; + typedef void (*x264_deblock_inter_t)( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ); typedef void (*x264_deblock_intra_t)( uint8_t *pix, int stride, int alpha, int beta ); typedef struct @@ -118,7 +132,7 @@ typedef struct x264_deblock_intra_t deblock_h_chroma_intra; } x264_deblock_function_t; -x264_frame_t *x264_frame_new( x264_t *h ); +x264_frame_t *x264_frame_new( x264_t *h, int b_fdec ); void x264_frame_delete( x264_frame_t *frame ); int x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_picture_t *src ); @@ -144,8 +158,15 @@ x264_frame_t *x264_frame_pop( x264_frame_t **list ); void x264_frame_unshift( x264_frame_t **list, x264_frame_t *frame ); x264_frame_t *x264_frame_shift( 
x264_frame_t **list ); void x264_frame_push_unused( x264_t *h, x264_frame_t *frame ); -x264_frame_t *x264_frame_pop_unused( x264_t *h ); +x264_frame_t *x264_frame_pop_unused( x264_t *h, int b_fdec ); void x264_frame_sort( x264_frame_t **list, int b_dts ); +void x264_frame_delete_list( x264_frame_t **list ); + +int x264_synch_frame_list_init( x264_synch_frame_list_t *slist, int nelem ); +void x264_synch_frame_list_delete( x264_synch_frame_list_t *slist ); +void x264_synch_frame_list_push( x264_synch_frame_list_t *slist, x264_frame_t *frame ); +int x264_synch_frame_list_get_size( x264_synch_frame_list_t *slist ); + #define x264_frame_sort_dts(list) x264_frame_sort(list, 1) #define x264_frame_sort_pts(list) x264_frame_sort(list, 0) diff --git a/common/macroblock.c b/common/macroblock.c index 790dde22..6e866d4c 100644 --- a/common/macroblock.c +++ b/common/macroblock.c @@ -703,7 +703,7 @@ int x264_macroblock_cache_init( x264_t *h ) for( j=0; j<3; j++ ) { /* shouldn't really be initialized, just silences a valgrind false-positive in predict_8x8_filter_mmx */ - CHECKED_MALLOCZERO( h->mb.intra_border_backup[i][j], h->fdec->i_stride[j] ); + CHECKED_MALLOCZERO( h->mb.intra_border_backup[i][j], (h->sps->i_mb_width*16+32)>>!!j ); h->mb.intra_border_backup[i][j] += 8; } diff --git a/common/osdep.h b/common/osdep.h index 696bbc9d..9d6a1e63 100644 --- a/common/osdep.h +++ b/common/osdep.h @@ -137,6 +137,9 @@ static inline int x264_pthread_create( x264_pthread_t *t, void *a, void *(*f)(vo #define x264_pthread_cond_destroy pthread_cond_destroy #define x264_pthread_cond_broadcast pthread_cond_broadcast #define x264_pthread_cond_wait pthread_cond_wait +#define x264_pthread_attr_t pthread_attr_t +#define x264_pthread_attr_init pthread_attr_init +#define x264_pthread_attr_destroy pthread_attr_destroy #else #define x264_pthread_mutex_t int #define x264_pthread_mutex_init(m,f) 0 @@ -148,6 +151,9 @@ static inline int x264_pthread_create( x264_pthread_t *t, void *a, void *(*f)(vo #define 
x264_pthread_cond_destroy(c) #define x264_pthread_cond_broadcast(c) #define x264_pthread_cond_wait(c,m) +#define x264_pthread_attr_t int +#define x264_pthread_attr_init(a) 0 +#define x264_pthread_attr_destroy(a) #endif #define WORD_SIZE sizeof(void*) @@ -216,4 +222,11 @@ static int ALWAYS_INLINE x264_clz( uint32_t x ) } #endif +#if defined(SYS_LINUX) && defined(HAVE_PTHREAD) +#include <unistd.h> +#define x264_lower_thread_priority(p) { UNUSED int nice_ret = nice(p); } +#else +#define x264_lower_thread_priority(p) +#endif + #endif /* X264_OSDEP_H */ diff --git a/common/x86/cpu-a.asm b/common/x86/cpu-a.asm index 2df98fd4..285111a9 100644 --- a/common/x86/cpu-a.asm +++ b/common/x86/cpu-a.asm @@ -96,11 +96,13 @@ cglobal x264_cpu_cpuid, 0,6 cglobal x264_stack_align push ebp mov ebp, esp - sub esp, 4 + sub esp, 8 and esp, ~15 mov ecx, [ebp+8] mov edx, [ebp+12] mov [esp], edx + mov edx, [ebp+16] + mov [esp+4], edx call ecx leave ret diff --git a/encoder/analyse.h b/encoder/analyse.h index a2a04a55..05aae40d 100644 --- a/encoder/analyse.h +++ b/encoder/analyse.h @@ -28,4 +28,12 @@ int x264_macroblock_analyse( x264_t *h ); void x264_slicetype_decide( x264_t *h ); int x264_lowres_context_alloc( x264_t *h ); +void x264_slicetype_analyse( x264_t *h, int keyframe ); + +int x264_lookahead_init( x264_t *h, int i_slicetype_length ); +int x264_lookahead_is_empty( x264_t *h ); +void x264_lookahead_put_frame( x264_t *h, x264_frame_t *frame ); +void x264_lookahead_get_frames( x264_t *h ); +void x264_lookahead_delete( x264_t *h ); + #endif diff --git a/encoder/encoder.c b/encoder/encoder.c index eb6c4350..c6b33980 100644 --- a/encoder/encoder.c +++ b/encoder/encoder.c @@ -364,7 +364,7 @@ static int x264_validate_parameters( x264_t *h ) return -1; } - if( h->param.i_threads == 0 ) + if( h->param.i_threads == X264_THREADS_AUTO ) h->param.i_threads = x264_cpu_num_processors() * 3/2; h->param.i_threads = x264_clip3( h->param.i_threads, 1, X264_THREAD_MAX ); if( h->param.i_threads > 1 ) @@ -519,6 
+519,14 @@ static int x264_validate_parameters( x264_t *h ) h->param.rc.b_mb_tree = 0; if( h->param.rc.f_qcompress == 1 ) h->param.rc.b_mb_tree = 0; +#ifdef HAVE_PTHREAD + if( h->param.i_sync_lookahead ) + h->param.i_sync_lookahead = x264_clip3( h->param.i_sync_lookahead, h->param.i_threads + h->param.i_bframe, X264_LOOKAHEAD_MAX ); + if( h->param.rc.b_stat_read || h->param.i_threads == 1 ) + h->param.i_sync_lookahead = 0; +#else + h->param.i_sync_lookahead = 0; +#endif h->mb.b_direct_auto_write = h->param.analyse.i_direct_mv_pred == X264_DIRECT_PRED_AUTO && h->param.i_bframe @@ -740,7 +748,7 @@ x264_t *x264_encoder_open( x264_param_t *param ) { x264_t *h; char buf[1000], *p; - int i; + int i, i_slicetype_length; CHECKED_MALLOCZERO( h, sizeof(x264_t) ); @@ -793,8 +801,10 @@ x264_t *x264_encoder_open( x264_param_t *param ) h->frames.i_delay = h->param.i_bframe; if( h->param.rc.b_mb_tree || h->param.rc.i_vbv_buffer_size ) h->frames.i_delay = X264_MAX( h->frames.i_delay, h->param.rc.i_lookahead ); + i_slicetype_length = h->frames.i_delay; h->frames.i_delay += h->param.i_threads - 1; h->frames.i_delay = X264_MIN( h->frames.i_delay, X264_LOOKAHEAD_MAX ); + h->frames.i_delay += h->param.i_sync_lookahead; h->frames.i_max_ref0 = h->param.i_frame_reference; h->frames.i_max_ref1 = h->sps->vui.i_num_reorder_frames; @@ -810,7 +820,12 @@ x264_t *x264_encoder_open( x264_param_t *param ) h->frames.i_last_idr = - h->param.i_keyint_max; h->frames.i_input = 0; - h->frames.last_nonb = NULL; + + CHECKED_MALLOCZERO( h->frames.unused[0], (h->frames.i_delay + 3) * sizeof(x264_frame_t *) ); + /* Allocate room for max refs plus a few extra just in case. 
*/ + CHECKED_MALLOCZERO( h->frames.unused[1], (h->param.i_threads + 20) * sizeof(x264_frame_t *) ); + CHECKED_MALLOCZERO( h->frames.current, (h->param.i_sync_lookahead + h->param.i_bframe + + h->param.i_threads + 3) * sizeof(x264_frame_t *) ); h->i_ref0 = 0; h->i_ref1 = 0; @@ -861,14 +876,14 @@ x264_t *x264_encoder_open( x264_param_t *param ) h->thread[0] = h; h->i_thread_num = 0; - for( i = 1; i < h->param.i_threads; i++ ) + for( i = 1; i < h->param.i_threads + !!h->param.i_sync_lookahead; i++ ) CHECKED_MALLOC( h->thread[i], sizeof(x264_t) ); for( i = 0; i < h->param.i_threads; i++ ) { if( i > 0 ) *h->thread[i] = *h; - h->thread[i]->fdec = x264_frame_pop_unused( h ); + h->thread[i]->fdec = x264_frame_pop_unused( h, 1 ); if( !h->thread[i]->fdec ) goto fail; CHECKED_MALLOC( h->thread[i]->out.p_bitstream, h->out.i_bitstream ); @@ -879,6 +894,9 @@ x264_t *x264_encoder_open( x264_param_t *param ) goto fail; } + if( x264_lookahead_init( h, i_slicetype_length ) ) + goto fail; + if( x264_ratecontrol_new( h ) < 0 ) goto fail; @@ -1181,8 +1199,6 @@ static void x264_fdec_filter_row( x264_t *h, int mb_y ) static inline int x264_reference_update( x264_t *h ) { - int i; - if( h->fdec->i_frame >= 0 ) h->i_frame++; @@ -1191,29 +1207,18 @@ static inline int x264_reference_update( x264_t *h ) if( h->param.i_threads > 1 ) { x264_frame_push_unused( h, h->fdec ); - h->fdec = x264_frame_pop_unused( h ); + h->fdec = x264_frame_pop_unused( h, 1 ); if( !h->fdec ) return -1; } return 0; } - /* move lowres copy of the image to the ref frame */ - for( i = 0; i < 4; i++) - { - XCHG( uint8_t*, h->fdec->lowres[i], h->fenc->lowres[i] ); - XCHG( uint8_t*, h->fdec->buffer_lowres[i], h->fenc->buffer_lowres[i] ); - } - - /* adaptive B decision needs a pointer, since it can't use the ref lists */ - if( h->sh.i_type != SLICE_TYPE_B ) - h->frames.last_nonb = h->fdec; - /* move frame in the buffer */ x264_frame_push( h->frames.reference, h->fdec ); if( h->frames.reference[h->frames.i_max_dpb] ) 
x264_frame_push_unused( h, x264_frame_shift( h->frames.reference ) ); - h->fdec = x264_frame_pop_unused( h ); + h->fdec = x264_frame_pop_unused( h, 1 ); if( !h->fdec ) return -1; return 0; @@ -1516,6 +1521,8 @@ static void *x264_slices_write( x264_t *h ) { int i_frame_size = 0; int i_slice_num = 0; + if( h->param.i_sync_lookahead ) + x264_lower_thread_priority( 10 ); #ifdef HAVE_MMX /* Misalign mask has to be set separately for each thread. */ @@ -1619,7 +1626,7 @@ int x264_encoder_encode( x264_t *h, if( pic_in != NULL ) { /* 1: Copy the picture to a frame and move it to a buffer */ - x264_frame_t *fenc = x264_frame_pop_unused( h ); + x264_frame_t *fenc = x264_frame_pop_unused( h, 0 ); if( !fenc ) return -1; @@ -1632,8 +1639,6 @@ int x264_encoder_encode( x264_t *h, fenc->i_frame = h->frames.i_input++; - x264_frame_push( h->frames.next, fenc ); - if( h->frames.b_have_lowres ) x264_frame_init_lowres( h, fenc ); @@ -1645,55 +1650,33 @@ int x264_encoder_encode( x264_t *h, else if( h->param.rc.i_aq_mode ) x264_adaptive_quant_frame( h, fenc ); + /* 2: Place the frame into the queue for its slice type decision */ + x264_lookahead_put_frame( h, fenc ); + if( h->frames.i_input <= h->frames.i_delay + 1 - h->param.i_threads ) { - /* Nothing yet to encode */ - /* waiting for filling bframe buffer */ + /* Nothing yet to encode, waiting for filling of buffers */ pic_out->i_type = X264_TYPE_AUTO; return 0; } } - - if( h->frames.current[0] == NULL ) + else { - int bframes = 0; - /* 2: Select frame types */ - if( h->frames.next[0] == NULL ) - { - if( x264_encoder_frame_end( thread_oldest, thread_current, pp_nal, pi_nal, pic_out ) < 0 ) - return -1; - return 0; - } + /* signal kills for lookahead thread */ + h->lookahead->b_exit_thread = 1; + x264_pthread_cond_broadcast( &h->lookahead->ifbuf.cv_fill ); + } - x264_stack_align( x264_slicetype_decide, h ); + /* 3: The picture is analyzed in the lookahead */ + if( !h->frames.current[0] ) + x264_lookahead_get_frames( h ); - /* 3: move 
some B-frames and 1 non-B to encode queue */ - while( IS_X264_TYPE_B( h->frames.next[bframes]->i_type ) ) - bframes++; - x264_frame_push( h->frames.current, x264_frame_shift( &h->frames.next[bframes] ) ); - /* FIXME: when max B-frames > 3, BREF may no longer be centered after GOP closing */ - if( h->param.b_bframe_pyramid && bframes > 1 ) - { - x264_frame_t *mid = x264_frame_shift( &h->frames.next[bframes/2] ); - mid->i_type = X264_TYPE_BREF; - x264_frame_push( h->frames.current, mid ); - bframes--; - } - while( bframes-- ) - x264_frame_push( h->frames.current, x264_frame_shift( h->frames.next ) ); - } + if( !h->frames.current[0] && x264_lookahead_is_empty( h ) ) + return x264_encoder_frame_end( thread_oldest, thread_current, pp_nal, pi_nal, pic_out ); /* ------------------- Get frame to be encoded ------------------------- */ /* 4: get picture to encode */ h->fenc = x264_frame_shift( h->frames.current ); - if( h->fenc == NULL ) - { - /* Nothing yet to encode (ex: waiting for I/P with B frames) */ - /* waiting for filling bframe buffer */ - pic_out->i_type = X264_TYPE_AUTO; - return 0; - } - if( h->fenc->param ) { x264_encoder_reconfig( h, h->fenc->param ); @@ -1704,6 +1687,7 @@ int x264_encoder_encode( x264_t *h, if( h->fenc->i_type == X264_TYPE_IDR ) { h->frames.i_last_idr = h->fenc->i_frame; + h->i_frame_num = 0; } /* ------------------- Setup frame context ----------------------------- */ @@ -2029,6 +2013,8 @@ void x264_encoder_close ( x264_t *h ) || h->stat.i_mb_count[SLICE_TYPE_P][I_PCM] || h->stat.i_mb_count[SLICE_TYPE_B][I_PCM]; + x264_lookahead_delete( h ); + for( i=0; i<h->param.i_threads; i++ ) { // don't strictly have to wait for the other threads, but it's simpler than canceling them @@ -2248,21 +2234,9 @@ void x264_encoder_close ( x264_t *h ) h = h->thread[ h->i_thread_phase % h->param.i_threads ]; /* frames */ - for( i = 0; h->frames.current[i]; i++ ) - { - assert( h->frames.current[i]->i_reference_count == 1 ); - x264_frame_delete( h->frames.current[i] 
); - } - for( i = 0; h->frames.next[i]; i++ ) - { - assert( h->frames.next[i]->i_reference_count == 1 ); - x264_frame_delete( h->frames.next[i] ); - } - for( i = 0; h->frames.unused[i]; i++ ) - { - assert( h->frames.unused[i]->i_reference_count == 0 ); - x264_frame_delete( h->frames.unused[i] ); - } + x264_frame_delete_list( h->frames.unused[0] ); + x264_frame_delete_list( h->frames.unused[1] ); + x264_frame_delete_list( h->frames.current ); h = h->thread[0]; @@ -2302,7 +2276,8 @@ int x264_encoder_delayed_frames( x264_t *h ) h = h->thread[ h->i_thread_phase % h->param.i_threads ]; for( i=0; h->frames.current[i]; i++ ) delayed_frames++; - for( i=0; h->frames.next[i]; i++ ) - delayed_frames++; + delayed_frames += x264_synch_frame_list_get_size( &h->lookahead->ifbuf ); + delayed_frames += x264_synch_frame_list_get_size( &h->lookahead->next ); + delayed_frames += x264_synch_frame_list_get_size( &h->lookahead->ofbuf ); return delayed_frames; } diff --git a/encoder/lookahead.c b/encoder/lookahead.c new file mode 100644 index 00000000..9df0ce35 --- /dev/null +++ b/encoder/lookahead.c @@ -0,0 +1,278 @@ +/***************************************************************************** + * lookahead.c: Lookahead slicetype decisions for x264 + ***************************************************************************** + * Lookahead.c and associated modifications: + * Copyright (C) 2008 Avail Media + * + * Authors: Michael Kazmier + * Alex Giladi + * Steven Walters + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA. + *****************************************************************************/ + +/* LOOKAHEAD (threaded and non-threaded mode) + * + * Lookahead types: + * [1] Slice type / scene cut; + * + * In non-threaded mode, we run the existing slicetype decision code as it was. + * In threaded mode, we run in a separate thread, that lives between the calls + * to x264_encoder_open() and x264_encoder_close(), and performs lookahead for + * the number of frames specified in rc_lookahead. Recommended setting is + * # of bframes + # of threads. + */ +#include "common/common.h" +#include "common/cpu.h" +#include "analyse.h" + +static void x264_lookahead_shift( x264_synch_frame_list_t *dst, x264_synch_frame_list_t *src, int count ) +{ + int i = count; + while( i-- ) + { + assert( dst->i_size != dst->i_max_size ); + assert( src->i_size ); + dst->list[ dst->i_size++ ] = x264_frame_shift( src->list ); + src->i_size--; + } + if( count ) + { + x264_pthread_cond_broadcast( &dst->cv_fill ); + x264_pthread_cond_broadcast( &src->cv_empty ); + } +} + +static void x264_lookahead_update_last_nonb( x264_t *h, x264_frame_t *new_nonb ) +{ + if( h->lookahead->last_nonb ) + x264_frame_push_unused( h, h->lookahead->last_nonb ); + h->lookahead->last_nonb = new_nonb; + new_nonb->i_reference_count++; +} + +#ifdef HAVE_PTHREAD +static void x264_lookahead_slicetype_decide( x264_t *h ) +{ + int bframes = 0; + x264_stack_align( x264_slicetype_decide, h ); + + while( IS_X264_TYPE_B( h->lookahead->next.list[bframes]->i_type ) ) + bframes++; + x264_lookahead_update_last_nonb( h, h->lookahead->next.list[bframes] ); + + x264_pthread_mutex_lock( &h->lookahead->ofbuf.mutex ); + while( h->lookahead->ofbuf.i_size == h->lookahead->ofbuf.i_max_size ) + x264_pthread_cond_wait( 
&h->lookahead->ofbuf.cv_empty, &h->lookahead->ofbuf.mutex ); + + x264_pthread_mutex_lock( &h->lookahead->next.mutex ); + x264_lookahead_shift( &h->lookahead->ofbuf, &h->lookahead->next, bframes + 1 ); + x264_pthread_mutex_unlock( &h->lookahead->next.mutex ); + + /* For MB-tree and VBV lookahead, we have to perform propagation analysis on I-frames too. */ + if( h->lookahead->b_analyse_keyframe && IS_X264_TYPE_I( h->lookahead->last_nonb->i_type ) ) + x264_stack_align( x264_slicetype_analyse, h, 1 ); + + x264_pthread_mutex_unlock( &h->lookahead->ofbuf.mutex ); +} + +static void x264_lookahead_thread( x264_t *h ) +{ + int shift; +#ifdef HAVE_MMX + if( h->param.cpu&X264_CPU_SSE_MISALIGN ) + x264_cpu_mask_misalign_sse(); +#endif + h->lookahead->b_thread_active = 1; + while( !h->lookahead->b_exit_thread ) + { + x264_pthread_mutex_lock( &h->lookahead->ifbuf.mutex ); + x264_pthread_mutex_lock( &h->lookahead->next.mutex ); + shift = X264_MIN( h->lookahead->next.i_max_size - h->lookahead->next.i_size, h->lookahead->ifbuf.i_size ); + x264_lookahead_shift( &h->lookahead->next, &h->lookahead->ifbuf, shift ); + x264_pthread_mutex_unlock( &h->lookahead->next.mutex ); + if( h->lookahead->next.i_size <= h->lookahead->i_slicetype_length ) + { + while( !h->lookahead->ifbuf.i_size && !h->lookahead->b_exit_thread ) + x264_pthread_cond_wait( &h->lookahead->ifbuf.cv_fill, &h->lookahead->ifbuf.mutex ); + x264_pthread_mutex_unlock( &h->lookahead->ifbuf.mutex ); + } + else + { + x264_pthread_mutex_unlock( &h->lookahead->ifbuf.mutex ); + x264_lookahead_slicetype_decide( h ); + } + } /* end of input frames */ + x264_pthread_mutex_lock( &h->lookahead->next.mutex ); + x264_pthread_mutex_lock( &h->lookahead->ifbuf.mutex ); + x264_lookahead_shift( &h->lookahead->next, &h->lookahead->ifbuf, h->lookahead->ifbuf.i_size ); + x264_pthread_mutex_unlock( &h->lookahead->ifbuf.mutex ); + x264_pthread_mutex_unlock( &h->lookahead->next.mutex ); + while( h->lookahead->next.i_size ) + 
x264_lookahead_slicetype_decide( h ); + h->lookahead->b_thread_active = 0; +} +#endif + +int x264_lookahead_init( x264_t *h, int i_slicetype_length ) +{ + x264_lookahead_t *look; + CHECKED_MALLOCZERO( look, sizeof(x264_lookahead_t) ); + int i; + for( i = 0; i < h->param.i_threads; i++ ) + h->thread[i]->lookahead = look; + + look->i_last_idr = - h->param.i_keyint_max; + look->b_analyse_keyframe = (h->param.rc.b_mb_tree || (h->param.rc.i_vbv_buffer_size && h->param.rc.i_lookahead)) + && !h->param.rc.b_stat_read; + look->i_slicetype_length = i_slicetype_length; + + /* init frame lists */ + if( x264_synch_frame_list_init( &look->ifbuf, h->param.i_sync_lookahead+3 ) || + x264_synch_frame_list_init( &look->next, h->frames.i_delay+3 ) || + x264_synch_frame_list_init( &look->ofbuf, h->frames.i_delay+3 ) ) + goto fail; + + if( !h->param.i_sync_lookahead ) + return 0; + + x264_t *look_h = h->thread[h->param.i_threads]; + *look_h = *h; + if( x264_macroblock_cache_init( look_h ) ) + goto fail; + + UNUSED x264_pthread_attr_t attr; + if( x264_pthread_attr_init( &attr ) ) + goto fail; +#if defined(USE_REAL_PTHREAD) && !defined(SYS_LINUX) + int offset = sched_get_priority_max( SCHED_OTHER ); + x264_log( h, X264_LOG_DEBUG, "setting priority of lookahead thread to %d\n", offset ); + struct sched_param sp; + pthread_attr_getschedparam( &attr, &sp ); + sp.sched_priority = offset; + pthread_attr_setschedparam( &attr, &sp ); +#endif + + if( x264_pthread_create( &look_h->thread_handle, &attr, (void *)x264_lookahead_thread, look_h ) ) + goto fail; + + x264_pthread_attr_destroy( &attr ); + + return 0; +fail: + x264_free( look ); + return -1; +} + +void x264_lookahead_delete( x264_t *h ) +{ + if( h->param.i_sync_lookahead ) + { + h->lookahead->b_exit_thread = 1; + x264_pthread_cond_broadcast( &h->lookahead->ifbuf.cv_fill ); + x264_pthread_join( h->thread[h->param.i_threads]->thread_handle, NULL ); + x264_macroblock_cache_end( h->thread[h->param.i_threads] ); + x264_free( 
h->thread[h->param.i_threads] ); + } + x264_synch_frame_list_delete( &h->lookahead->ifbuf ); + x264_synch_frame_list_delete( &h->lookahead->next ); + x264_synch_frame_list_delete( &h->lookahead->ofbuf ); + if( h->lookahead->last_nonb ) + x264_frame_delete( h->lookahead->last_nonb ); + x264_free( h->lookahead ); +} + +void x264_lookahead_put_frame( x264_t *h, x264_frame_t *frame ) +{ + if( h->param.i_sync_lookahead ) + x264_synch_frame_list_push( &h->lookahead->ifbuf, frame ); + else + x264_synch_frame_list_push( &h->lookahead->next, frame ); +} + +int x264_lookahead_is_empty( x264_t *h ) +{ + return !x264_synch_frame_list_get_size( &h->lookahead->ofbuf ) && + !x264_synch_frame_list_get_size( &h->lookahead->next ); +} + +static void x264_lookahead_encoder_shift( x264_t *h ) +{ + int bframes = 0; + int i_frames = 0; + + while( h->lookahead->ofbuf.list[i_frames] ) + { + while( h->lookahead->b_thread_active && !h->lookahead->ofbuf.i_size ) + x264_pthread_cond_wait( &h->lookahead->ofbuf.cv_fill, &h->lookahead->ofbuf.mutex ); + if( IS_X264_TYPE_B( h->lookahead->ofbuf.list[bframes]->i_type ) ) + bframes++; + else + break; + i_frames++; + } + if( h->lookahead->ofbuf.list[i_frames] ) + { + x264_frame_push( h->frames.current, x264_frame_shift( &h->lookahead->ofbuf.list[bframes] ) ); + h->lookahead->ofbuf.i_size--; + if( h->param.b_bframe_pyramid && bframes > 1 ) + { + x264_frame_t *mid = x264_frame_shift( &h->lookahead->ofbuf.list[bframes/2] ); + h->lookahead->ofbuf.i_size--; + mid->i_type = X264_TYPE_BREF; + x264_frame_push( h->frames.current, mid ); + bframes--; + } + while( bframes-- ) + { + x264_frame_push( h->frames.current, x264_frame_shift( h->lookahead->ofbuf.list ) ); + h->lookahead->ofbuf.i_size--; + } + x264_pthread_cond_broadcast( &h->lookahead->ofbuf.cv_empty ); + } +} + +void x264_lookahead_get_frames( x264_t *h ) +{ + if( h->param.i_sync_lookahead ) + { /* We have a lookahead thread, so get frames from there */ + x264_pthread_mutex_lock( 
&h->lookahead->ofbuf.mutex ); + while( !h->lookahead->ofbuf.i_size && h->lookahead->b_thread_active ) + x264_pthread_cond_wait( &h->lookahead->ofbuf.cv_fill, &h->lookahead->ofbuf.mutex ); + x264_lookahead_encoder_shift( h ); + x264_pthread_mutex_unlock( &h->lookahead->ofbuf.mutex ); + } + else + { /* We are not running a lookahead thread, so perform all the slicetype decide on the fly */ + + if( h->frames.current[0] || !h->lookahead->next.i_size ) + return; + + x264_stack_align( x264_slicetype_decide, h ); + + int bframes=0; + while( IS_X264_TYPE_B( h->lookahead->next.list[bframes]->i_type ) ) + bframes++; + + x264_lookahead_update_last_nonb( h, h->lookahead->next.list[bframes] ); + x264_lookahead_shift( &h->lookahead->ofbuf, &h->lookahead->next, bframes + 1 ); + + /* For MB-tree and VBV lookahead, we have to perform propagation analysis on I-frames too. */ + if( h->lookahead->b_analyse_keyframe && IS_X264_TYPE_I( h->lookahead->last_nonb->i_type ) ) + x264_stack_align( x264_slicetype_analyse, h, 1 ); + + x264_lookahead_encoder_shift( h ); + } +} diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c index ca19d64a..cb7fd3b8 100644 --- a/encoder/ratecontrol.c +++ b/encoder/ratecontrol.c @@ -922,11 +922,7 @@ void x264_ratecontrol_start( x264_t *h, int i_force_qp ) } if( h->sh.i_type != SLICE_TYPE_B ) - { - rc->bframes = 0; - while( h->frames.current[rc->bframes] && IS_X264_TYPE_B(h->frames.current[rc->bframes]->i_type) ) - rc->bframes++; - } + rc->bframes = h->fenc->i_bframes; if( i_force_qp ) { @@ -1250,7 +1246,7 @@ int x264_ratecontrol_end( x264_t *h, int bits ) if( h->sh.i_type == SLICE_TYPE_B ) { rc->bframe_bits += bits; - if( !h->frames.current[0] || !IS_X264_TYPE_B(h->frames.current[0]->i_type) ) + if( h->fenc->b_last_minigop_bframe ) { update_predictor( rc->pred_b_from_p, qp2qscale(rc->qpa_rc), h->fref1[h->i_ref1-1]->i_satd, rc->bframe_bits / rc->bframes ); diff --git a/encoder/slicetype.c b/encoder/slicetype.c index af74427d..88aff91b 100644 --- 
a/encoder/slicetype.c +++ b/encoder/slicetype.c @@ -625,7 +625,7 @@ static int x264_slicetype_path_cost( x264_t *h, x264_mb_analysis_t *a, x264_fram /* Uses strings due to the fact that the speed of the control functions is negligable compared to the cost of running slicetype_frame_cost, and because it makes debugging easier. */ -static void x264_slicetype_path( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, int length, int max_bframes, int buffer_size, char (*best_paths)[X264_LOOKAHEAD_MAX] ) +static void x264_slicetype_path( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, int length, int max_bframes, char (*best_paths)[X264_LOOKAHEAD_MAX] ) { char paths[X264_BFRAME_MAX+2][X264_LOOKAHEAD_MAX] = {{0}}; int num_paths = X264_MIN(max_bframes+1, length); @@ -666,7 +666,7 @@ static int scenecut( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, in int icost = frame->i_cost_est[0][0]; int pcost = frame->i_cost_est[p1-p0][0]; float f_bias; - int i_gop_size = frame->i_frame - h->frames.i_last_idr; + int i_gop_size = frame->i_frame - h->lookahead->i_last_idr; float f_thresh_max = h->param.i_scenecut_threshold / 100.0; /* magic numbers pulled out of thin air */ float f_thresh_min = f_thresh_max * h->param.i_keyint_min @@ -700,33 +700,33 @@ static int scenecut( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, in return res; } -static void x264_slicetype_analyse( x264_t *h, int keyframe ) +void x264_slicetype_analyse( x264_t *h, int keyframe ) { x264_mb_analysis_t a; x264_frame_t *frames[X264_LOOKAHEAD_MAX+3] = { NULL, }; - int num_frames; - int keyint_limit; - int i,j; + int num_frames, keyint_limit, idr_frame_type, i, j; int i_mb_count = NUM_MBS; int cost1p0, cost2p0, cost1b1, cost2p1; - int idr_frame_type; + int i_max_search = X264_MIN( h->lookahead->next.i_size, X264_LOOKAHEAD_MAX ); + if( h->param.b_deterministic ) + i_max_search = X264_MIN( i_max_search, h->lookahead->i_slicetype_length + !keyframe ); assert( h->frames.b_have_lowres ); - 
if( !h->frames.last_nonb ) + if( !h->lookahead->last_nonb ) return; - frames[0] = h->frames.last_nonb; - for( j = 0; h->frames.next[j] && h->frames.next[j]->i_type == X264_TYPE_AUTO; j++ ) - frames[j+1] = h->frames.next[j]; + frames[0] = h->lookahead->last_nonb; + for( j = 0; j < i_max_search && h->lookahead->next.list[j]->i_type == X264_TYPE_AUTO; j++ ) + frames[j+1] = h->lookahead->next.list[j]; if( !j ) return; - keyint_limit = h->param.i_keyint_max - frames[0]->i_frame + h->frames.i_last_idr - 1; + keyint_limit = h->param.i_keyint_max - frames[0]->i_frame + h->lookahead->i_last_idr - 1; num_frames = X264_MIN( j, keyint_limit ); x264_lowres_context_init( h, &a ); - idr_frame_type = frames[1]->i_frame - h->frames.i_last_idr >= h->param.i_keyint_min ? X264_TYPE_IDR : X264_TYPE_I; + idr_frame_type = frames[1]->i_frame - h->lookahead->i_last_idr >= h->param.i_keyint_min ? X264_TYPE_IDR : X264_TYPE_I; /* This is important psy-wise: if we have a non-scenecut keyframe, * there will be significant visual artifacts if the frames just before @@ -765,7 +765,7 @@ static void x264_slicetype_analyse( x264_t *h, int keyframe ) { /* Perform the frametype analysis. 
*/ for( n = 2; n < num_frames-1; n++ ) - x264_slicetype_path( h, &a, frames, n, max_bframes, num_frames-max_bframes, best_paths ); + x264_slicetype_path( h, &a, frames, n, max_bframes, best_paths ); if( num_frames > 1 ) { num_bframes = strspn( best_paths[num_frames-2], "B" ); @@ -888,15 +888,15 @@ void x264_slicetype_decide( x264_t *h ) int bframes; int i; - if( h->frames.next[0] == NULL ) + if( !h->lookahead->next.i_size ) return; if( h->param.rc.b_stat_read ) { /* Use the frame types from the first pass */ - for( i = 0; h->frames.next[i] != NULL; i++ ) - h->frames.next[i]->i_type = - x264_ratecontrol_slice_type( h, h->frames.next[i]->i_frame ); + for( i = 0; i < h->lookahead->next.i_size; i++ ) + h->lookahead->next.list[i]->i_type = + x264_ratecontrol_slice_type( h, h->lookahead->next.list[i]->i_frame ); } else if( (h->param.i_bframe && h->param.i_bframe_adaptive) || h->param.i_scenecut_threshold @@ -906,10 +906,10 @@ void x264_slicetype_decide( x264_t *h ) for( bframes = 0;; bframes++ ) { - frm = h->frames.next[bframes]; + frm = h->lookahead->next.list[bframes]; /* Limit GOP size */ - if( frm->i_frame - h->frames.i_last_idr >= h->param.i_keyint_max ) + if( frm->i_frame - h->lookahead->i_last_idr >= h->param.i_keyint_max ) { if( frm->i_type == X264_TYPE_AUTO ) frm->i_type = X264_TYPE_IDR; @@ -919,19 +919,16 @@ void x264_slicetype_decide( x264_t *h ) if( frm->i_type == X264_TYPE_IDR ) { /* Close GOP */ + h->lookahead->i_last_idr = frm->i_frame; if( bframes > 0 ) { bframes--; - h->frames.next[bframes]->i_type = X264_TYPE_P; - } - else - { - h->i_frame_num = 0; + h->lookahead->next.list[bframes]->i_type = X264_TYPE_P; } } - if( bframes == h->param.i_bframe - || h->frames.next[bframes+1] == NULL ) + if( bframes == h->param.i_bframe || + !h->lookahead->next.list[bframes+1] ) { if( IS_X264_TYPE_B( frm->i_type ) ) x264_log( h, X264_LOG_WARNING, "specified frame type is not compatible with max B-frames\n" ); @@ -945,45 +942,47 @@ void x264_slicetype_decide( x264_t *h ) 
else if( !IS_X264_TYPE_B( frm->i_type ) ) break; } + + if( bframes ) + h->lookahead->next.list[bframes-1]->b_last_minigop_bframe = 1; + h->lookahead->next.list[bframes]->i_bframes = bframes; + + /* calculate the frame costs ahead of time for x264_rc_analyse_slice while we still have lowres */ + if( h->param.rc.i_rc_method != X264_RC_CQP ) + { + x264_mb_analysis_t a; + x264_frame_t *frames[X264_BFRAME_MAX+2] = { NULL, }; + int p0=0, p1, b; + + x264_lowres_context_init( h, &a ); + + if( IS_X264_TYPE_I( h->lookahead->next.list[bframes]->i_type ) ) + p1 = b = 0; + else // P + p1 = b = bframes + 1; + frames[p0] = h->lookahead->last_nonb; + frames[b] = h->lookahead->next.list[bframes]; + + x264_slicetype_frame_cost( h, &a, frames, p0, p1, b, 0 ); + } } int x264_rc_analyse_slice( x264_t *h ) { - x264_mb_analysis_t a; - x264_frame_t *frames[X264_LOOKAHEAD_MAX+2] = { NULL, }; + x264_frame_t *frames[X264_BFRAME_MAX+2] = { NULL, }; int p0=0, p1, b; int cost; - x264_lowres_context_init( h, &a ); - if( IS_X264_TYPE_I(h->fenc->i_type) ) - { p1 = b = 0; - /* For MB-tree and VBV lookahead, we have to perform propagation analysis on I-frames too. 
*/ - if( h->param.rc.b_mb_tree || (h->param.rc.i_vbv_buffer_size && h->param.rc.i_lookahead) ) - { - h->frames.last_nonb = h->fenc; - x264_slicetype_analyse( h, 1 ); - } - } - else if( X264_TYPE_P == h->fenc->i_type ) - { - p1 = 0; - while( h->frames.current[p1] && IS_X264_TYPE_B( h->frames.current[p1]->i_type ) ) - p1++; - p1++; - b = p1; - } - else //B - { - p1 = (h->fref1[0]->i_poc - h->fref0[0]->i_poc)/2; - b = (h->fref1[0]->i_poc - h->fenc->i_poc)/2; - frames[p1] = h->fref1[0]; - } + else // P + p1 = b = h->fenc->i_bframes + 1; frames[p0] = h->fref0[0]; frames[b] = h->fenc; - cost = x264_slicetype_frame_cost( h, &a, frames, p0, p1, b, 0 ); + /* cost should have been already calculated by x264_slicetype_decide */ + cost = frames[b]->i_cost_est[b-p0][p1-b]; + assert( cost >= 0 ); if( h->param.rc.b_mb_tree && !h->param.rc.b_stat_read ) cost = x264_slicetype_frame_cost_recalculate( h, frames, p0, p1, b ); diff --git a/x264.c b/x264.c index 76e53072..0ed538a0 100644 --- a/x264.c +++ b/x264.c @@ -355,6 +355,7 @@ static void Help( x264_param_t *defaults, int b_longhelp ) H0( " --ssim Enable SSIM computation\n" ); H0( " --threads Force a specific number of threads\n" ); H1( " --thread-input Run Avisynth in its own thread\n" ); + H1( " --sync-lookahead Number of buffer frames for threaded lookahead\n" ); H1( " --non-deterministic Slightly improve quality of SMP, at the cost of repeatability\n" ); H1( " --asm Override CPU detection\n" ); H1( " --no-asm Disable all CPU optimizations\n" ); @@ -467,6 +468,7 @@ static struct option long_options[] = { "slice-max-mbs", required_argument, NULL, 0 }, { "slices", required_argument, NULL, 0 }, { "thread-input", no_argument, NULL, OPT_THREAD_INPUT }, + { "sync-lookahead", required_argument, NULL, 0 }, { "non-deterministic", no_argument, NULL, 0 }, { "psnr", no_argument, NULL, 0 }, { "ssim", no_argument, NULL, 0 }, @@ -988,7 +990,7 @@ generic_option: #ifdef HAVE_PTHREAD if( b_thread_input || param->i_threads > 1 - || 
(param->i_threads == 0 && x264_cpu_num_processors() > 1) ) + || (param->i_threads == X264_THREADS_AUTO && x264_cpu_num_processors() > 1) ) { if( open_file_thread( NULL, &opt->hin, param ) ) { diff --git a/x264.h b/x264.h index 5e6d4117..66f4f282 100644 --- a/x264.h +++ b/x264.h @@ -35,7 +35,7 @@ #include -#define X264_BUILD 74 +#define X264_BUILD 75 /* x264_t: * opaque handler for encoder */ @@ -139,6 +139,7 @@ static const char * const x264_colmatrix_names[] = { "GBR", "bt709", "undef", "" /* Threading */ #define X264_THREADS_AUTO 0 /* Automatically select optimal number of threads */ +#define X264_SYNC_LOOKAHEAD_AUTO -1 /* Automatically select optimal lookahead thread buffer size */ /* Zones: override ratecontrol or other options for specific sections of the video. * See x264_encoder_reconfig() for which options can be changed. @@ -158,6 +159,7 @@ typedef struct x264_param_t unsigned int cpu; int i_threads; /* encode multiple frames in parallel */ int b_deterministic; /* whether to allow non-deterministic optimizations when threaded */ + int i_sync_lookahead; /* threaded lookahead buffer */ /* Video Properties */ int i_width; -- 2.40.0