From 71061e9332c05324007e7f6c900285273793366d Mon Sep 17 00:00:00 2001 From: Ranjit Kumar Tulabandu Date: Fri, 10 Feb 2017 16:25:50 +0530 Subject: [PATCH] Row based multi-threading of encoding stage (Yunqing Wang) This patch implements row-based multi-threading within tiles in the encoding pass and substantially speeds up the multi-threaded VP9 encoder. Speed tests at speed 1 on the STDHD set (using 4 tiles) show that the average speedup of the encoding pass (the second pass in 2-pass encoding) is 7% with 2 threads, 16% with 4 threads, 85% with 8 threads, and 116% with 16 threads. Change-Id: I12e41dbc171951958af9e6d098efd6e2c82827de --- vp9/encoder/vp9_bitstream.c | 5 +- vp9/encoder/vp9_block.h | 5 ++ vp9/encoder/vp9_encodeframe.c | 48 ++++++++++++---- vp9/encoder/vp9_encodeframe.h | 3 + vp9/encoder/vp9_encoder.c | 26 +++++---- vp9/encoder/vp9_encoder.h | 6 ++ vp9/encoder/vp9_ethread.c | 99 +++++++++++++++++++++++++++++++- vp9/encoder/vp9_ethread.h | 2 + vp9/encoder/vp9_mcomp.c | 45 +++++++++++++-- vp9/encoder/vp9_multi_thread.c | 40 +++++++++++++ vp9/encoder/vp9_pickmode.c | 10 +++- vp9/encoder/vp9_rd.c | 14 ++++- vp9/encoder/vp9_rd.h | 25 +++++++- vp9/encoder/vp9_rdopt.c | 48 ++++++++++++++-- vp9/encoder/vp9_speed_features.c | 18 ++++++ vp9/vp9_cx_iface.c | 3 + 16 files changed, 357 insertions(+), 40 deletions(-) diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c index 43c5eaed0..71f85bbe7 100644 --- a/vp9/encoder/vp9_bitstream.c +++ b/vp9/encoder/vp9_bitstream.c @@ -925,10 +925,11 @@ int vp9_get_refresh_mask(VP9_COMP *cpi) { static int encode_tile_worker(VP9_COMP *cpi, VP9BitstreamWorkerData *data) { MACROBLOCKD *const xd = &data->xd; + const int tile_row = 0; vpx_start_encode(&data->bit_writer, data->dest); write_modes(cpi, xd, &cpi->tile_data[data->tile_idx].tile_info, - &data->bit_writer, 0, data->tile_idx, &data->max_mv_magnitude, - data->interp_filter_selected); + &data->bit_writer, tile_row, data->tile_idx, + &data->max_mv_magnitude, data->interp_filter_selected); vpx_stop_encode(&data->bit_writer); return 1; } diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h index 91d07e3a0..c0c69f6b5 100644 --- a/vp9/encoder/vp9_block.h +++ b/vp9/encoder/vp9_block.h @@ -11,6 +11,8 @@ #ifndef VP9_ENCODER_VP9_BLOCK_H_ #define VP9_ENCODER_VP9_BLOCK_H_ +#include "vpx_util/vpx_thread.h" + #include "vp9/common/vp9_entropymv.h" #include "vp9/common/vp9_entropy.h" @@ -88,6 +90,9 @@ struct macroblock { int mb_energy; int *m_search_count_ptr; int *ex_search_count_ptr; +#if CONFIG_MULTITHREAD + pthread_mutex_t *search_count_mutex; +#endif // These are set to their default values at the beginning, and then adjusted // further in the encoding process.
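The vp9_encodeframe.c changes that follow gate each superblock on the progress of the SB row above it: encode_rd_sb_row() calls the row_mt_sync_read hook before running the partition search for a superblock column and the row_mt_sync_write hook after finishing it, reusing the row synchronization hooks already present in vp9_ethread.c. Below is a minimal sketch of that top-right dependency pattern; the struct layout and helper names are illustrative assumptions for this example, not the actual VP9RowMTSync implementation.

/* Illustrative sketch only -- not part of the patch. */
#include <pthread.h>

typedef struct {
  pthread_mutex_t *mutex_; /* one mutex per SB row */
  pthread_cond_t *cond_;   /* one condition variable per SB row */
  int *cur_col;            /* last SB column published by each row */
  int sync_range;          /* how far ahead the upper row must be */
} RowSyncSketch;

/* Block until row r-1 has encoded far enough past column c. */
static void row_sync_read(RowSyncSketch *s, int r, int c) {
  const int nsync = s->sync_range;
  if (r == 0) return; /* the first SB row of a tile has no dependency */
  pthread_mutex_lock(&s->mutex_[r - 1]);
  while (c > s->cur_col[r - 1] - nsync)
    pthread_cond_wait(&s->cond_[r - 1], &s->mutex_[r - 1]);
  pthread_mutex_unlock(&s->mutex_[r - 1]);
}

/* Publish that row r has finished SB column c out of cols columns. */
static void row_sync_write(RowSyncSketch *s, int r, int c, int cols) {
  const int nsync = s->sync_range;
  int cur = c;
  int sig = 1;
  if (c < cols - 1) {
    if (c % nsync) sig = 0; /* only signal every nsync columns */
  } else {
    cur = cols + nsync; /* row complete: release any remaining waiter */
  }
  if (sig) {
    pthread_mutex_lock(&s->mutex_[r]);
    s->cur_col[r] = cur;
    pthread_cond_signal(&s->cond_[r]);
    pthread_mutex_unlock(&s->mutex_[r]);
  }
}

With this kind of gating, a worker assigned SB row r can start as soon as row r-1 is sync_range superblocks ahead, which is what lets several rows of the same tile be encoded in parallel.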
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index 1bbdeece5..215f8b8f6 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -3095,13 +3095,18 @@ static void encode_rd_sb_row(VP9_COMP *cpi, ThreadData *td, const int mi_col_start = tile_info->mi_col_start; const int mi_col_end = tile_info->mi_col_end; int mi_col; + const int sb_row = mi_row >> MI_BLOCK_SIZE_LOG2; + const int num_sb_cols = + get_num_cols(tile_data->tile_info, MI_BLOCK_SIZE_LOG2); + int sb_col_in_tile; // Initialize the left context for the new SB row memset(&xd->left_context, 0, sizeof(xd->left_context)); memset(xd->left_seg_context, 0, sizeof(xd->left_seg_context)); // Code each SB in the row - for (mi_col = mi_col_start; mi_col < mi_col_end; mi_col += MI_BLOCK_SIZE) { + for (mi_col = mi_col_start, sb_col_in_tile = 0; mi_col < mi_col_end; + mi_col += MI_BLOCK_SIZE, sb_col_in_tile++) { const struct segmentation *const seg = &cm->seg; int dummy_rate; int64_t dummy_dist; @@ -3112,6 +3117,9 @@ static void encode_rd_sb_row(VP9_COMP *cpi, ThreadData *td, const int idx_str = cm->mi_stride * mi_row + mi_col; MODE_INFO **mi = cm->mi_grid_visible + idx_str; + (*(cpi->row_mt_sync_read_ptr))(&tile_data->row_mt_sync, sb_row, + sb_col_in_tile - 1); + if (sf->adaptive_pred_interp_filter) { for (i = 0; i < 64; ++i) td->leaf_tree[i].pred_interp_filter = SWITCHABLE; @@ -3163,6 +3171,8 @@ static void encode_rd_sb_row(VP9_COMP *cpi, ThreadData *td, rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, BLOCK_64X64, &dummy_rdc, INT64_MAX, td->pc_root); } + (*(cpi->row_mt_sync_write_ptr))(&tile_data->row_mt_sync, sb_row, + sb_col_in_tile, num_sb_cols); } } @@ -4109,13 +4119,17 @@ void vp9_init_tile_data(VP9_COMP *cpi) { tile_data->mode_map[i][j] = j; } } +#if CONFIG_MULTITHREAD + tile_data->search_count_mutex = NULL; + tile_data->enc_row_mt_mutex = NULL; +#endif } } for (tile_row = 0; tile_row < tile_rows; ++tile_row) { for (tile_col = 0; tile_col < tile_cols; ++tile_col) { - TileInfo *tile_info = - &cpi->tile_data[tile_row * tile_cols + tile_col].tile_info; + TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col]; + TileInfo *tile_info = &this_tile->tile_info; vp9_tile_init(tile_info, cm, tile_row, tile_col); cpi->tile_tok[tile_row][tile_col] = pre_tok + tile_tok; @@ -4125,6 +4139,10 @@ void vp9_init_tile_data(VP9_COMP *cpi) { cpi->tplist[tile_row][tile_col] = tplist + tplist_count; tplist = cpi->tplist[tile_row][tile_col]; tplist_count = get_num_vert_units(*tile_info, MI_BLOCK_SIZE_LOG2); + + // Set up pointers to per thread motion search counters. + this_tile->m_search_count = 0; // Count of motion search hits. + this_tile->ex_search_count = 0; // Exhaustive mesh search hits. } } } @@ -4170,10 +4188,11 @@ void vp9_encode_tile(VP9_COMP *cpi, ThreadData *td, int tile_row, int mi_row; // Set up pointers to per thread motion search counters. - this_tile->m_search_count = 0; // Count of motion search hits. - this_tile->ex_search_count = 0; // Exhaustive mesh search hits. td->mb.m_search_count_ptr = &this_tile->m_search_count; td->mb.ex_search_count_ptr = &this_tile->ex_search_count; +#if CONFIG_MULTITHREAD + td->mb.search_count_mutex = this_tile->search_count_mutex; +#endif for (mi_row = mi_row_start; mi_row < mi_row_end; mi_row += MI_BLOCK_SIZE) vp9_encode_sb_row(cpi, td, tile_row, tile_col, mi_row); @@ -4289,11 +4308,20 @@ static void encode_frame_internal(VP9_COMP *cpi) { } #endif - // If allowed, encoding tiles in parallel with one thread handling one tile. 
- if (VPXMIN(cpi->oxcf.max_threads, 1 << cm->log2_tile_cols) > 1) - vp9_encode_tiles_mt(cpi); - else - encode_tiles(cpi); + if (!cpi->new_mt) { + cpi->row_mt_sync_read_ptr = vp9_row_mt_sync_read_dummy; + cpi->row_mt_sync_write_ptr = vp9_row_mt_sync_write_dummy; + // If allowed, encoding tiles in parallel with one thread handling one + // tile when row based multi-threading is disabled. + if (VPXMIN(cpi->oxcf.max_threads, 1 << cm->log2_tile_cols) > 1) + vp9_encode_tiles_mt(cpi); + else + encode_tiles(cpi); + } else { + cpi->row_mt_sync_read_ptr = vp9_row_mt_sync_read; + cpi->row_mt_sync_write_ptr = vp9_row_mt_sync_write; + vp9_encode_tiles_row_mt(cpi); + } vpx_usec_timer_mark(&emr_timer); cpi->time_encode_sb_row += vpx_usec_timer_elapsed(&emr_timer); diff --git a/vp9/encoder/vp9_encodeframe.h b/vp9/encoder/vp9_encodeframe.h index aa5494785..2b9b65dcd 100644 --- a/vp9/encoder/vp9_encodeframe.h +++ b/vp9/encoder/vp9_encodeframe.h @@ -39,6 +39,9 @@ void vp9_init_tile_data(struct VP9_COMP *cpi); void vp9_encode_tile(struct VP9_COMP *cpi, struct ThreadData *td, int tile_row, int tile_col); +void vp9_encode_sb_row(struct VP9_COMP *cpi, struct ThreadData *td, + int tile_row, int tile_col, int mi_row); + void vp9_set_variance_partition_thresholds(struct VP9_COMP *cpi, int q); #ifdef __cplusplus diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 50fa8c682..2ce46c657 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -1575,17 +1575,7 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { highbd_set_var_fns(cpi); #endif - // Enable multi-threading for first pass. - cpi->new_mt = 0; - if (((cpi->oxcf.mode == GOOD || cpi->oxcf.mode == BEST) && - cpi->oxcf.speed < 5 && cpi->oxcf.pass == 1) && - cpi->oxcf.new_mt && !cpi->use_svc) - cpi->new_mt = 1; - - if (cpi->oxcf.mode == GOOD && cpi->oxcf.speed < 5 && - (cpi->oxcf.pass == 0 || cpi->oxcf.pass == 2) && cpi->oxcf.new_mt && - !cpi->use_svc) - cpi->new_mt = 1; + vp9_set_new_mt(cpi); } #ifndef M_LOG2_E @@ -5213,3 +5203,17 @@ void vp9_apply_encoding_flags(VP9_COMP *cpi, vpx_enc_frame_flags_t flags) { vp9_update_entropy(cpi, 0); } } + +void vp9_set_new_mt(VP9_COMP *cpi) { + // Enable row based multi-threading for supported modes of encoding + cpi->new_mt = 0; + if (((cpi->oxcf.mode == GOOD || cpi->oxcf.mode == BEST) && + cpi->oxcf.speed < 5 && cpi->oxcf.pass == 1) && + cpi->oxcf.new_mt && !cpi->use_svc) + cpi->new_mt = 1; + + if (cpi->oxcf.mode == GOOD && cpi->oxcf.speed < 5 && + (cpi->oxcf.pass == 0 || cpi->oxcf.pass == 2) && cpi->oxcf.new_mt && + !cpi->use_svc) + cpi->new_mt = 1; +} diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 65f3f86de..675512618 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -276,6 +276,10 @@ typedef struct TileDataEnc { int ex_search_count; FIRSTPASS_DATA fp_data; VP9RowMTSync row_mt_sync; +#if CONFIG_MULTITHREAD + pthread_mutex_t *search_count_mutex; + pthread_mutex_t *enc_row_mt_mutex; +#endif } TileDataEnc; typedef struct RowMTInfo { @@ -897,6 +901,8 @@ VP9_LEVEL vp9_get_level(const Vp9LevelSpec *const level_spec); void vp9_new_framerate(VP9_COMP *cpi, double framerate); +void vp9_set_new_mt(VP9_COMP *cpi); + #define LAYER_IDS_TO_IDX(sl, tl, num_tl) ((sl) * (num_tl) + (tl)) #ifdef __cplusplus diff --git a/vp9/encoder/vp9_ethread.c b/vp9/encoder/vp9_ethread.c index 1bffc4030..bf8108416 100644 --- a/vp9/encoder/vp9_ethread.c +++ b/vp9/encoder/vp9_ethread.c @@ -341,7 +341,7 @@ void vp9_row_mt_sync_write(VP9RowMTSync 
*const row_mt_sync, int r, int c, #if CONFIG_MULTITHREAD const int nsync = row_mt_sync->sync_range; int cur; - // Only signal when there are enough filtered SB for next row to run. + // Only signal when there are enough encoded blocks for next row to run. int sig = 1; if (c < cols - 1) { @@ -542,3 +542,100 @@ void vp9_temporal_filter_row_mt(VP9_COMP *cpi) { launch_enc_workers(cpi, (VPxWorkerHook)temporal_filter_worker_hook, multi_thread_ctxt, num_workers); } + +static int enc_row_mt_worker_hook(EncWorkerData *const thread_data, + MultiThreadHandle *multi_thread_ctxt) { + VP9_COMP *const cpi = thread_data->cpi; + const VP9_COMMON *const cm = &cpi->common; + const int tile_cols = 1 << cm->log2_tile_cols; + int tile_row, tile_col; + TileDataEnc *this_tile; + int end_of_frame; + int thread_id = thread_data->thread_id; + int cur_tile_id = multi_thread_ctxt->thread_id_to_tile_id[thread_id]; + JobNode *proc_job = NULL; + int mi_row; + + end_of_frame = 0; + while (0 == end_of_frame) { + // Get the next job in the queue + proc_job = + (JobNode *)vp9_enc_grp_get_next_job(multi_thread_ctxt, cur_tile_id); + if (NULL == proc_job) { + // Query for the status of other tiles + end_of_frame = vp9_get_tiles_proc_status( + multi_thread_ctxt, thread_data->tile_completion_status, &cur_tile_id, + tile_cols); + } else { + tile_col = proc_job->tile_col_id; + tile_row = proc_job->tile_row_id; + mi_row = proc_job->vert_unit_row_num * MI_BLOCK_SIZE; + + this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col]; + thread_data->td->mb.m_search_count_ptr = &this_tile->m_search_count; + thread_data->td->mb.ex_search_count_ptr = &this_tile->ex_search_count; +#if CONFIG_MULTITHREAD + thread_data->td->mb.search_count_mutex = this_tile->search_count_mutex; +#endif + + vp9_encode_sb_row(cpi, thread_data->td, tile_row, tile_col, mi_row); + } + } + return 0; +} + +void vp9_encode_tiles_row_mt(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + const int tile_cols = 1 << cm->log2_tile_cols; + const int tile_rows = 1 << cm->log2_tile_rows; + MultiThreadHandle *multi_thread_ctxt = &cpi->multi_thread_ctxt; + int num_workers = VPXMAX(cpi->oxcf.max_threads, 1); + int i; + + if (multi_thread_ctxt->allocated_tile_cols < tile_cols || + multi_thread_ctxt->allocated_tile_rows < tile_rows || + multi_thread_ctxt->allocated_vert_unit_rows < cm->mb_rows) { + vp9_row_mt_mem_dealloc(cpi); + vp9_init_tile_data(cpi); + vp9_row_mt_mem_alloc(cpi); + } else { + vp9_init_tile_data(cpi); + } + + create_enc_workers(cpi, num_workers); + + vp9_assign_tile_to_thread(multi_thread_ctxt, tile_cols, cpi->num_workers); + + vp9_prepare_job_queue(cpi, ENCODE_JOB); + + vp9_multi_thread_tile_init(cpi); + + for (i = 0; i < num_workers; i++) { + EncWorkerData *thread_data; + thread_data = &cpi->tile_thr_data[i]; + + // Before encoding a frame, copy the thread data from cpi. + if (thread_data->td != &cpi->td) { + thread_data->td->mb = cpi->td.mb; + thread_data->td->rd_counts = cpi->td.rd_counts; + } + if (thread_data->td->counts != &cpi->common.counts) { + memcpy(thread_data->td->counts, &cpi->common.counts, + sizeof(cpi->common.counts)); + } + } + + launch_enc_workers(cpi, (VPxWorkerHook)enc_row_mt_worker_hook, + multi_thread_ctxt, num_workers); + + for (i = 0; i < num_workers; i++) { + VPxWorker *const worker = &cpi->workers[i]; + EncWorkerData *const thread_data = (EncWorkerData *)worker->data1; + + // Accumulate counters. 
+ if (i < cpi->num_workers - 1) { + vp9_accumulate_frame_counts(&cm->counts, thread_data->td->counts, 0); + accumulate_rd_opt(&cpi->td, thread_data->td); + } + } +} diff --git a/vp9/encoder/vp9_ethread.h b/vp9/encoder/vp9_ethread.h index 908bb6ff6..a396e621d 100644 --- a/vp9/encoder/vp9_ethread.h +++ b/vp9/encoder/vp9_ethread.h @@ -44,6 +44,8 @@ typedef struct VP9RowMTSyncData { void vp9_encode_tiles_mt(struct VP9_COMP *cpi); +void vp9_encode_tiles_row_mt(struct VP9_COMP *cpi); + void vp9_encode_fp_row_mt(struct VP9_COMP *cpi); void vp9_row_mt_sync_read(VP9RowMTSync *const row_mt_sync, int r, int c); diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c index 03877e9aa..300cda648 100644 --- a/vp9/encoder/vp9_mcomp.c +++ b/vp9/encoder/vp9_mcomp.c @@ -1993,9 +1993,18 @@ static int full_pixel_exhaustive(VP9_COMP *cpi, MACROBLOCK *x, int range = sf->mesh_patterns[0].range; int baseline_interval_divisor; +#if CONFIG_MULTITHREAD + if (NULL != x->search_count_mutex) pthread_mutex_lock(x->search_count_mutex); +#endif + // Keep track of number of exhaustive calls (this frame in this thread). ++(*x->ex_search_count_ptr); +#if CONFIG_MULTITHREAD + if (NULL != x->search_count_mutex) + pthread_mutex_unlock(x->search_count_mutex); +#endif + // Trap illegal values for interval and range for this function. if ((range < MIN_RANGE) || (range > MAX_RANGE) || (interval < MIN_INTERVAL) || (interval > range)) @@ -2356,13 +2365,27 @@ int vp9_refining_search_8p_c(const MACROBLOCK *x, MV *ref_mv, int error_per_bit, #define MIN_EX_SEARCH_LIMIT 128 static int is_exhaustive_allowed(VP9_COMP *cpi, MACROBLOCK *x) { const SPEED_FEATURES *const sf = &cpi->sf; - const int max_ex = - VPXMAX(MIN_EX_SEARCH_LIMIT, - (*x->m_search_count_ptr * sf->max_exaustive_pct) / 100); + int is_exhaustive_allowed; + int max_ex; + +#if CONFIG_MULTITHREAD + if (NULL != x->search_count_mutex) pthread_mutex_lock(x->search_count_mutex); +#endif + + max_ex = VPXMAX(MIN_EX_SEARCH_LIMIT, + (*x->m_search_count_ptr * sf->max_exaustive_pct) / 100); - return sf->allow_exhaustive_searches && - (sf->exhaustive_searches_thresh < INT_MAX) && - (*x->ex_search_count_ptr <= max_ex) && !cpi->rc.is_src_frame_alt_ref; + is_exhaustive_allowed = sf->allow_exhaustive_searches && + (sf->exhaustive_searches_thresh < INT_MAX) && + (*x->ex_search_count_ptr <= max_ex) && + !cpi->rc.is_src_frame_alt_ref; + +#if CONFIG_MULTITHREAD + if (NULL != x->search_count_mutex) + pthread_mutex_unlock(x->search_count_mutex); +#endif + + return is_exhaustive_allowed; } int vp9_full_pixel_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, @@ -2407,9 +2430,19 @@ int vp9_full_pixel_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, MAX_MVSEARCH_STEPS - 1 - step_param, 1, cost_list, fn_ptr, ref_mv, tmp_mv); +#if CONFIG_MULTITHREAD + if (NULL != x->search_count_mutex) + pthread_mutex_lock(x->search_count_mutex); +#endif + // Keep track of number of searches (this frame in this thread). ++(*x->m_search_count_ptr); +#if CONFIG_MULTITHREAD + if (NULL != x->search_count_mutex) + pthread_mutex_unlock(x->search_count_mutex); +#endif + // Should we allow a follow on exhaustive search? 
if (is_exhaustive_allowed(cpi, x)) { int64_t exhuastive_thr = sf->exhaustive_searches_thresh; diff --git a/vp9/encoder/vp9_multi_thread.c b/vp9/encoder/vp9_multi_thread.c index 23b0b4276..e27b1ed3a 100644 --- a/vp9/encoder/vp9_multi_thread.c +++ b/vp9/encoder/vp9_multi_thread.c @@ -100,11 +100,32 @@ void vp9_row_mt_mem_alloc(VP9_COMP *cpi) { multi_thread_ctxt->num_tile_vert_sbs[tile_row] = get_num_vert_units(*tile_info, MI_BLOCK_SIZE_LOG2); } + +#if CONFIG_MULTITHREAD + for (tile_row = 0; tile_row < tile_rows; tile_row++) { + for (tile_col = 0; tile_col < tile_cols; tile_col++) { + TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col]; + + CHECK_MEM_ERROR(cm, this_tile->search_count_mutex, + vpx_malloc(sizeof(*this_tile->search_count_mutex))); + + pthread_mutex_init(this_tile->search_count_mutex, NULL); + + CHECK_MEM_ERROR(cm, this_tile->enc_row_mt_mutex, + vpx_malloc(sizeof(*this_tile->enc_row_mt_mutex))); + + pthread_mutex_init(this_tile->enc_row_mt_mutex, NULL); + } + } +#endif } void vp9_row_mt_mem_dealloc(VP9_COMP *cpi) { MultiThreadHandle *multi_thread_ctxt = &cpi->multi_thread_ctxt; int tile_col; +#if CONFIG_MULTITHREAD + int tile_row; +#endif // Deallocate memory for job queue if (multi_thread_ctxt->job_queue) vpx_free(multi_thread_ctxt->job_queue); @@ -124,6 +145,25 @@ void vp9_row_mt_mem_dealloc(VP9_COMP *cpi) { TileDataEnc *this_tile = &cpi->tile_data[tile_col]; vp9_row_mt_sync_mem_dealloc(&this_tile->row_mt_sync); } + +#if CONFIG_MULTITHREAD + for (tile_row = 0; tile_row < multi_thread_ctxt->allocated_tile_rows; + tile_row++) { + for (tile_col = 0; tile_col < multi_thread_ctxt->allocated_tile_cols; + tile_col++) { + TileDataEnc *this_tile = + &cpi->tile_data[tile_row * multi_thread_ctxt->allocated_tile_cols + + tile_col]; + pthread_mutex_destroy(this_tile->search_count_mutex); + vpx_free(this_tile->search_count_mutex); + this_tile->search_count_mutex = NULL; + + pthread_mutex_destroy(this_tile->enc_row_mt_mutex); + vpx_free(this_tile->enc_row_mt_mutex); + this_tile->enc_row_mt_mutex = NULL; + } + } +#endif } void vp9_multi_thread_tile_init(VP9_COMP *cpi) { diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c index cff8a3fa9..9f2e93adc 100644 --- a/vp9/encoder/vp9_pickmode.c +++ b/vp9/encoder/vp9_pickmode.c @@ -1657,7 +1657,10 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, mode_rd_thresh = mode_rd_thresh << 3; if (rd_less_than_thresh(best_rdc.rdcost, mode_rd_thresh, - rd_thresh_freq_fact[mode_index])) +#if CONFIG_MULTITHREAD + tile_data->enc_row_mt_mutex, +#endif + &rd_thresh_freq_fact[mode_index])) continue; if (this_mode == NEWMV) { @@ -2018,7 +2021,10 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, continue; if (rd_less_than_thresh(best_rdc.rdcost, mode_rd_thresh, - rd_thresh_freq_fact[mode_index])) +#if CONFIG_MULTITHREAD + tile_data->enc_row_mt_mutex, +#endif + &rd_thresh_freq_fact[mode_index])) continue; mi->mode = this_mode; diff --git a/vp9/encoder/vp9_rd.c b/vp9/encoder/vp9_rd.c index 3bbfa1aac..21e3b1f63 100644 --- a/vp9/encoder/vp9_rd.c +++ b/vp9/encoder/vp9_rd.c @@ -610,7 +610,15 @@ void vp9_set_rd_speed_thresholds_sub8x8(VP9_COMP *cpi) { } void vp9_update_rd_thresh_fact(int (*factor_buf)[MAX_MODES], int rd_thresh, - int bsize, int best_mode_index) { + int bsize, +#if CONFIG_MULTITHREAD + pthread_mutex_t *enc_row_mt_mutex, +#endif + int best_mode_index) { +#if CONFIG_MULTITHREAD + if (NULL != enc_row_mt_mutex) pthread_mutex_lock(enc_row_mt_mutex); +#endif + if 
(rd_thresh > 0) { const int top_mode = bsize < BLOCK_8X8 ? MAX_REFS : MAX_MODES; int mode; @@ -628,6 +636,10 @@ void vp9_update_rd_thresh_fact(int (*factor_buf)[MAX_MODES], int rd_thresh, } } } + +#if CONFIG_MULTITHREAD + if (NULL != enc_row_mt_mutex) pthread_mutex_unlock(enc_row_mt_mutex); +#endif } int vp9_get_intra_cost_penalty(int qindex, int qdelta, diff --git a/vp9/encoder/vp9_rd.h b/vp9/encoder/vp9_rd.h index 1c6831358..74a2f5d95 100644 --- a/vp9/encoder/vp9_rd.h +++ b/vp9/encoder/vp9_rd.h @@ -164,11 +164,32 @@ void vp9_set_rd_speed_thresholds(struct VP9_COMP *cpi); void vp9_set_rd_speed_thresholds_sub8x8(struct VP9_COMP *cpi); void vp9_update_rd_thresh_fact(int (*fact)[MAX_MODES], int rd_thresh, int bsize, +#if CONFIG_MULTITHREAD + pthread_mutex_t *enc_row_mt_mutex, +#endif int best_mode_index); static INLINE int rd_less_than_thresh(int64_t best_rd, int thresh, - int thresh_fact) { - return best_rd < ((int64_t)thresh * thresh_fact >> 5) || thresh == INT_MAX; +#if CONFIG_MULTITHREAD + pthread_mutex_t *enc_row_mt_mutex, +#endif + const int *const thresh_fact) { + int is_rd_less_than_thresh; + +#if CONFIG_MULTITHREAD + // Synchronize to ensure data coherency as thresh_freq_fact is maintained at + // tile level and not thread-safe with row based multi-threading + if (NULL != enc_row_mt_mutex) pthread_mutex_lock(enc_row_mt_mutex); +#endif + + is_rd_less_than_thresh = + best_rd < ((int64_t)thresh * (*thresh_fact) >> 5) || thresh == INT_MAX; + +#if CONFIG_MULTITHREAD + if (NULL != enc_row_mt_mutex) pthread_mutex_unlock(enc_row_mt_mutex); +#endif + + return is_rd_less_than_thresh; } static INLINE void set_error_per_bit(MACROBLOCK *x, int rdmult) { diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index 4e1ca328c..8d1006b6e 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -3043,7 +3043,10 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, const int *const rd_threshes = rd_opt->threshes[segment_id][bsize]; const int *const rd_thresh_freq_fact = tile_data->thresh_freq_fact[bsize]; int64_t mode_threshold[MAX_MODES]; - int *mode_map = tile_data->mode_map[bsize]; + int *tile_mode_map = tile_data->mode_map[bsize]; + int mode_map[MAX_MODES]; // Maintain mode_map information locally to avoid + // lock mechanism involved with reads from + // tile_mode_map const int mode_search_skip_flags = sf->mode_search_skip_flags; int64_t mask_filter = 0; int64_t filter_cache[SWITCHABLE_FILTER_CONTEXTS]; @@ -3155,10 +3158,19 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, ~(sf->intra_y_mode_mask[max_txsize_lookup[bsize]]); for (i = 0; i <= LAST_NEW_MV_INDEX; ++i) mode_threshold[i] = 0; + +#if CONFIG_MULTITHREAD + if (NULL != tile_data->enc_row_mt_mutex) + pthread_mutex_lock(tile_data->enc_row_mt_mutex); +#endif + for (i = LAST_NEW_MV_INDEX + 1; i < MAX_MODES; ++i) mode_threshold[i] = ((int64_t)rd_threshes[i] * rd_thresh_freq_fact[i]) >> 5; midx = sf->schedule_mode_search ? 
mode_skip_start : 0; + + memcpy(mode_map, tile_mode_map, sizeof(mode_map)); + while (midx > 4) { uint8_t end_pos = 0; for (i = 5; i < midx; ++i) { @@ -3172,6 +3184,13 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, midx = end_pos; } + memcpy(tile_mode_map, mode_map, sizeof(mode_map)); + +#if CONFIG_MULTITHREAD + if (NULL != tile_data->enc_row_mt_mutex) + pthread_mutex_unlock(tile_data->enc_row_mt_mutex); +#endif + for (midx = 0; midx < MAX_MODES; ++midx) { int mode_index = mode_map[midx]; int mode_excluded = 0; @@ -3573,6 +3592,9 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, } if (best_mode_index < 0 || best_rd >= best_rd_so_far) { + // If adaptive interp filter is enabled, then the current leaf node of 8x8 + // data is needed for sub8x8. Hence preserve the context. + if (cpi->new_mt && bsize == BLOCK_8X8) ctx->mic = *xd->mi[0]; rd_cost->rate = INT_MAX; rd_cost->rdcost = INT64_MAX; return; @@ -3599,7 +3621,11 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, if (!cpi->rc.is_src_frame_alt_ref) vp9_update_rd_thresh_fact(tile_data->thresh_freq_fact, - sf->adaptive_rd_thresh, bsize, best_mode_index); + sf->adaptive_rd_thresh, bsize, +#if CONFIG_MULTITHREAD + tile_data->enc_row_mt_mutex, +#endif + best_mode_index); // macroblock modes *mi = best_mbmode; @@ -3737,7 +3763,11 @@ void vp9_rd_pick_inter_mode_sb_seg_skip(VP9_COMP *cpi, TileDataEnc *tile_data, (cm->interp_filter == mi->interp_filter)); vp9_update_rd_thresh_fact(tile_data->thresh_freq_fact, - cpi->sf.adaptive_rd_thresh, bsize, THR_ZEROMV); + cpi->sf.adaptive_rd_thresh, bsize, +#if CONFIG_MULTITHREAD + tile_data->enc_row_mt_mutex, +#endif + THR_ZEROMV); vp9_zero(best_pred_diff); vp9_zero(best_filter_diff); @@ -3789,6 +3819,7 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data, int64_t filter_cache[SWITCHABLE_FILTER_CONTEXTS]; int internal_active_edge = vp9_active_edge_sb(cpi, mi_row, mi_col) && vp9_internal_image_edge(cpi); + const int *const rd_thresh_freq_fact = tile_data->thresh_freq_fact[bsize]; x->skip_encode = sf->skip_encode_frame && x->q_index < QIDX_SKIP_THRESH; memset(x->zcoeff_blk[TX_4X4], 0, 4); @@ -3880,7 +3911,10 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data, if (!internal_active_edge && rd_less_than_thresh(best_rd, rd_opt->threshes[segment_id][bsize][ref_index], - tile_data->thresh_freq_fact[bsize][ref_index])) +#if CONFIG_MULTITHREAD + tile_data->enc_row_mt_mutex, +#endif + &rd_thresh_freq_fact[ref_index])) continue; comp_pred = second_ref_frame > INTRA_FRAME; @@ -4324,7 +4358,11 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data, !is_inter_block(&best_mbmode)); vp9_update_rd_thresh_fact(tile_data->thresh_freq_fact, sf->adaptive_rd_thresh, - bsize, best_ref_index); + bsize, +#if CONFIG_MULTITHREAD + tile_data->enc_row_mt_mutex, +#endif + best_ref_index); // macroblock modes *mi = best_mbmode; diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c index 9ce756efe..09e91fc17 100644 --- a/vp9/encoder/vp9_speed_features.c +++ b/vp9/encoder/vp9_speed_features.c @@ -585,6 +585,15 @@ void vp9_set_speed_features_framesize_dependent(VP9_COMP *cpi) { rd->thresh_mult_sub8x8[i] = INT_MAX; } } + + // With row based multi-threading, the following speed features + // have to be disabled to guarantee that bitstreams encoded with single thread + // and multiple threads match + if (cpi->oxcf.ethread_bit_match) { + sf->adaptive_rd_thresh = 0; + 
sf->allow_exhaustive_searches = 0; + sf->adaptive_pred_interp_filter = 0; + } } void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) { @@ -747,4 +756,13 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) { if (!cpi->oxcf.frame_periodic_boost) { sf->max_delta_qindex = 0; } + + // With row based multi-threading, the following speed features + // have to be disabled to guarantee that bitstreams encoded with single thread + // and multiple threads match + if (cpi->oxcf.ethread_bit_match) { + sf->adaptive_rd_thresh = 0; + sf->allow_exhaustive_searches = 0; + sf->adaptive_pred_interp_filter = 0; + } } diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index 481189020..cc946dfd6 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -1459,6 +1459,9 @@ static vpx_codec_err_t ctrl_set_svc(vpx_codec_alg_priv_t *ctx, va_list args) { cfg->ss_number_layers > 1 && cfg->ts_number_layers > 1) { return VPX_CODEC_INVALID_PARAM; } + + vp9_set_new_mt(ctx->cpi); + return VPX_CODEC_OK; } -- 2.40.0
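The per-tile mutexes introduced by this patch (search_count_mutex and enc_row_mt_mutex) are allocated in vp9_row_mt_mem_alloc(), released in vp9_row_mt_mem_dealloc(), and left NULL otherwise, so every use site checks the pointer before locking and the single-threaded path pays no locking cost. Wherever several row workers of one tile can touch shared tile-level state (the motion-search counters in vp9_mcomp.c, thresh_freq_fact in vp9_rd.h/vp9_rd.c, the mode_map reordering in vp9_rdopt.c), the access is wrapped in that NULL-checked lock/unlock pair. A minimal sketch of the pattern, using hypothetical type and field names:

/* Illustrative sketch only -- not part of the patch. */
#include <pthread.h>
#include <stddef.h>

typedef struct {
  int m_search_count;                  /* shared by all row workers of a tile */
  pthread_mutex_t *search_count_mutex; /* NULL when row multi-threading is off */
} TileCountersSketch;

/* Bump a shared per-tile counter, locking only when a mutex was allocated. */
static void increment_search_count(TileCountersSketch *tile) {
  if (tile->search_count_mutex != NULL)
    pthread_mutex_lock(tile->search_count_mutex);

  ++tile->m_search_count;

  if (tile->search_count_mutex != NULL)
    pthread_mutex_unlock(tile->search_count_mutex);
}

Because the mutexes exist only after vp9_row_mt_mem_alloc() has run, the NULL check is what keeps the tile-threaded and single-threaded encode paths unchanged.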