From 4da98b0cc3a539d1be028d0c4c87d7cd3d718781 Mon Sep 17 00:00:00 2001 From: Supradeep T R Date: Tue, 12 Jun 2018 13:57:39 +0530 Subject: [PATCH] Loopfilter Multi-Thread Optimization Take the original loopfilter multi-thread optimization (dafe064289a917977439ab6f4f002b9946496084) along with the fixes for bugs 1558 and 1562. BUG=webm:1558 BUG=webm:1562 Change-Id: Ibbf6bd13f4ffff0e79184ccfd6b85a49e067a6d8 --- test/invalid_file_test.cc | 2 + test/test-data.mk | 4 + test/test-data.sha1 | 4 + vp9/common/vp9_onyxc_int.h | 2 + vp9/common/vp9_thread_common.c | 157 +++++++++++++++++++++++++++++++++ vp9/common/vp9_thread_common.h | 19 ++++ vp9/decoder/vp9_decodeframe.c | 100 ++++++++++++++++++--- vp9/decoder/vp9_decoder.h | 3 + vp9/vp9_dx_iface.c | 11 +++ vp9/vp9_dx_iface.h | 1 + vpx/vp8dx.h | 12 +++ vpx_util/vpx_thread.h | 17 ++++ vpxdec.c | 51 ++++++++--- 13 files changed, 361 insertions(+), 22 deletions(-) diff --git a/test/invalid_file_test.cc b/test/invalid_file_test.cc index 9cfaa1f1f..6f61d3bc2 100644 --- a/test/invalid_file_test.cc +++ b/test/invalid_file_test.cc @@ -204,6 +204,8 @@ const DecodeParam kMultiThreadedVP9InvalidFileTests[] = { { 2, "invalid-vp90-2-09-aq2.webm.ivf.s3984_r01-05_b6-.v2.ivf" }, { 4, "invalid-vp90-2-09-subpixel-00.ivf.s19552_r01-05_b6-.v2.ivf" }, { 2, "invalid-crbug-629481.webm" }, + { 3, "invalid-crbug-1558.ivf" }, + { 4, "invalid-crbug-1562.ivf" }, }; INSTANTIATE_TEST_CASE_P( diff --git a/test/test-data.mk b/test/test-data.mk index 83ce7b942..1d42b94e0 100644 --- a/test/test-data.mk +++ b/test/test-data.mk @@ -787,6 +787,10 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-07-frame_parallel-2.web LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-07-frame_parallel-3.webm LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-crbug-629481.webm LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-crbug-629481.webm.res +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-crbug-1558.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-crbug-1558.ivf.res +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-crbug-1562.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-crbug-1562.ivf.res LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-crbug-667044.webm LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-crbug-667044.webm.res LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += crbug-1539.rawfile diff --git a/test/test-data.sha1 b/test/test-data.sha1 index 839aa802b..771aeca01 100644 --- a/test/test-data.sha1 +++ b/test/test-data.sha1 @@ -859,3 +859,7 @@ e2f9e1e47a791b4e939a9bdc50bf7a25b3761f77 *vp90-2-22-svc_1280x720_1.webm.md5 a0fbbbc5dd50fd452096f4455a58c1a8c9f66697 *invalid-vp80-00-comprehensive-s17661_r01-05_b6-.ivf a61774cf03fc584bd9f0904fc145253bb8ea6c4c *invalid-vp80-00-comprehensive-s17661_r01-05_b6-.ivf.res 894fae3afee0290546590823974203ab4b8abd95 *crbug-1539.rawfile +f1026c03efd5da21b381c8eb21f0d64e6d7e4ba3 *invalid-crbug-1558.ivf +eb198c25f861c3fe2cbd310de11eb96843019345 *invalid-crbug-1558.ivf.res +c62b005a9fd32c36a1b3f67de6840330f9915e34 *invalid-crbug-1562.ivf +f0cd8389948ad16085714d96567612136f6a46c5 *invalid-crbug-1562.ivf.res diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h index 45d3b0f82..c5c63e476 100644 --- a/vp9/common/vp9_onyxc_int.h +++ b/vp9/common/vp9_onyxc_int.h @@ -259,6 +259,8 @@ typedef struct VP9Common { PARTITION_CONTEXT *above_seg_context; ENTROPY_CONTEXT *above_context; int above_context_alloc_cols; + + int lf_row; } VP9_COMMON; static INLINE YV12_BUFFER_CONFIG *get_buf_frame(VP9_COMMON *cm, int index) { diff --git a/vp9/common/vp9_thread_common.c b/vp9/common/vp9_thread_common.c index d4b076645..36530fae6 100644 --- a/vp9/common/vp9_thread_common.c +++ b/vp9/common/vp9_thread_common.c @@ -229,6 +229,28 @@ void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, VP9_COMMON *cm, workers, num_workers, lf_sync); } +void vp9_lpf_mt_init(VP9LfSync *lf_sync, VP9_COMMON *cm, int frame_filter_level, + int num_workers) { + const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2; + + if (!frame_filter_level) return; + + if (!lf_sync->sync_range || sb_rows != lf_sync->rows || + num_workers > lf_sync->num_workers) { + vp9_loop_filter_dealloc(lf_sync); + vp9_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width, num_workers); + } + + // Initialize cur_sb_col to -1 for all SB rows. + memset(lf_sync->cur_sb_col, -1, sizeof(*lf_sync->cur_sb_col) * sb_rows); + + lf_sync->corrupted = 0; + + memset(lf_sync->num_tiles_done, 0, + sizeof(*lf_sync->num_tiles_done) * sb_rows); + cm->lf_row = 0; +} + // Set up nsync by width. static INLINE int get_sync_range(int width) { // nsync numbers are picked by testing. For example, for 4k @@ -266,6 +288,25 @@ void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows, pthread_cond_init(&lf_sync->cond[i], NULL); } } + pthread_mutex_init(&lf_sync->lf_mutex, NULL); + + CHECK_MEM_ERROR(cm, lf_sync->recon_done_mutex, + vpx_malloc(sizeof(*lf_sync->recon_done_mutex) * rows)); + if (lf_sync->recon_done_mutex) { + int i; + for (i = 0; i < rows; ++i) { + pthread_mutex_init(&lf_sync->recon_done_mutex[i], NULL); + } + } + + CHECK_MEM_ERROR(cm, lf_sync->recon_done_cond, + vpx_malloc(sizeof(*lf_sync->recon_done_cond) * rows)); + if (lf_sync->recon_done_cond) { + int i; + for (i = 0; i < rows; ++i) { + pthread_cond_init(&lf_sync->recon_done_cond[i], NULL); + } + } } #endif // CONFIG_MULTITHREAD @@ -276,6 +317,11 @@ void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows, CHECK_MEM_ERROR(cm, lf_sync->cur_sb_col, vpx_malloc(sizeof(*lf_sync->cur_sb_col) * rows)); + CHECK_MEM_ERROR(cm, lf_sync->num_tiles_done, + vpx_malloc(sizeof(*lf_sync->num_tiles_done) * + mi_cols_aligned_to_sb(cm->mi_rows) >> + MI_BLOCK_SIZE_LOG2)); + // Set up nsync. lf_sync->sync_range = get_sync_range(width); } @@ -298,15 +344,126 @@ void vp9_loop_filter_dealloc(VP9LfSync *lf_sync) { } vpx_free(lf_sync->cond); } + if (lf_sync->recon_done_mutex != NULL) { + int i; + for (i = 0; i < lf_sync->rows; ++i) { + pthread_mutex_destroy(&lf_sync->recon_done_mutex[i]); + } + vpx_free(lf_sync->recon_done_mutex); + } + + pthread_mutex_destroy(&lf_sync->lf_mutex); + if (lf_sync->recon_done_cond != NULL) { + int i; + for (i = 0; i < lf_sync->rows; ++i) { + pthread_cond_destroy(&lf_sync->recon_done_cond[i]); + } + vpx_free(lf_sync->recon_done_cond); + } #endif // CONFIG_MULTITHREAD + vpx_free(lf_sync->lfdata); vpx_free(lf_sync->cur_sb_col); + vpx_free(lf_sync->num_tiles_done); // clear the structure as the source of this call may be a resize in which // case this call will be followed by an _alloc() which may fail. vp9_zero(*lf_sync); } } +static int get_next_row(VP9_COMMON *cm, VP9LfSync *lf_sync) { + int return_val = -1; + int cur_row; + const int max_rows = cm->mi_rows; + +#if CONFIG_MULTITHREAD + const int tile_cols = 1 << cm->log2_tile_cols; + + pthread_mutex_lock(&lf_sync->lf_mutex); + if (cm->lf_row < max_rows) { + cur_row = cm->lf_row >> MI_BLOCK_SIZE_LOG2; + return_val = cm->lf_row; + cm->lf_row += MI_BLOCK_SIZE; + if (cm->lf_row < max_rows) { + /* If this is not the last row, make sure the next row is also decoded. + * This is because the intra predict has to happen before loop filter */ + cur_row += 1; + } + } + pthread_mutex_unlock(&lf_sync->lf_mutex); + + if (return_val == -1) return return_val; + + pthread_mutex_lock(&lf_sync->recon_done_mutex[cur_row]); + if (lf_sync->num_tiles_done[cur_row] < tile_cols) { + pthread_cond_wait(&lf_sync->recon_done_cond[cur_row], + &lf_sync->recon_done_mutex[cur_row]); + } + pthread_mutex_unlock(&lf_sync->recon_done_mutex[cur_row]); + pthread_mutex_lock(&lf_sync->lf_mutex); + if (lf_sync->corrupted) { + return_val = -1; + } + pthread_mutex_unlock(&lf_sync->lf_mutex); +#else + (void)lf_sync; + if (cm->lf_row < max_rows) { + cur_row = cm->lf_row >> MI_BLOCK_SIZE_LOG2; + return_val = cm->lf_row; + cm->lf_row += MI_BLOCK_SIZE; + if (cm->lf_row < max_rows) { + /* If this is not the last row, make sure the next row is also decoded. + * This is because the intra predict has to happen before loop filter */ + cur_row += 1; + } + } +#endif // CONFIG_MULTITHREAD + + return return_val; +} + +void vp9_loopfilter_rows(LFWorkerData *lf_data, VP9LfSync *lf_sync) { + int mi_row; + VP9_COMMON *cm = lf_data->cm; + + while ((mi_row = get_next_row(cm, lf_sync)) != -1 && mi_row < cm->mi_rows) { + lf_data->start = mi_row; + lf_data->stop = mi_row + MI_BLOCK_SIZE; + + thread_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes, + lf_data->start, lf_data->stop, lf_data->y_only, + lf_sync); + } +} + +void vp9_set_row(VP9LfSync *lf_sync, int num_tiles, int row, int is_last_row, + int corrupted) { +#if CONFIG_MULTITHREAD + pthread_mutex_lock(&lf_sync->lf_mutex); + lf_sync->corrupted |= corrupted; + pthread_mutex_unlock(&lf_sync->lf_mutex); + pthread_mutex_lock(&lf_sync->recon_done_mutex[row]); + lf_sync->num_tiles_done[row] += 1; + if (num_tiles == lf_sync->num_tiles_done[row]) { + if (is_last_row) { + /* The last 2 rows wait on the last row to be done. + * So, we have to broadcast the signal in this case. + */ + pthread_cond_broadcast(&lf_sync->recon_done_cond[row]); + } else { + pthread_cond_signal(&lf_sync->recon_done_cond[row]); + } + } + pthread_mutex_unlock(&lf_sync->recon_done_mutex[row]); +#else + (void)lf_sync; + (void)num_tiles; + (void)row; + (void)is_last_row; + (void)corrupted; +#endif // CONFIG_MULTITHREAD +} + // Accumulate frame counts. void vp9_accumulate_frame_counts(FRAME_COUNTS *accum, const FRAME_COUNTS *counts, int is_dec) { diff --git a/vp9/common/vp9_thread_common.h b/vp9/common/vp9_thread_common.h index f92df5bd6..b97e9ee13 100644 --- a/vp9/common/vp9_thread_common.h +++ b/vp9/common/vp9_thread_common.h @@ -37,6 +37,14 @@ typedef struct VP9LfSyncData { // Row-based parallel loopfilter data LFWorkerData *lfdata; int num_workers; + +#if CONFIG_MULTITHREAD + pthread_mutex_t lf_mutex; + pthread_mutex_t *recon_done_mutex; + pthread_cond_t *recon_done_cond; +#endif + int *num_tiles_done; + int corrupted; } VP9LfSync; // Allocate memory for loopfilter row synchronization. @@ -53,6 +61,17 @@ void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, struct VP9Common *cm, int partial_frame, VPxWorker *workers, int num_workers, VP9LfSync *lf_sync); +// Multi-threaded loopfilter initialisations +void vp9_lpf_mt_init(VP9LfSync *lf_sync, struct VP9Common *cm, + int frame_filter_level, int num_workers); + +void vp9_loopfilter_rows(LFWorkerData *lf_data, VP9LfSync *lf_sync); + +void vp9_set_row(VP9LfSync *lf_sync, int num_tiles, int row, int is_last_row, + int corrupted); + +void vp9_set_last_decoded_row(struct VP9Common *cm, int tile_col, int mi_row); + void vp9_accumulate_frame_counts(struct FRAME_COUNTS *accum, const struct FRAME_COUNTS *counts, int is_dec); diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c index 48c49e2f5..95e376d04 100644 --- a/vp9/decoder/vp9_decodeframe.c +++ b/vp9/decoder/vp9_decodeframe.c @@ -1451,6 +1451,25 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi, const uint8_t *data, return vpx_reader_find_end(&tile_data->bit_reader); } +static void set_rows_after_error(VP9LfSync *lf_sync, int start_row, int mi_rows, + int num_tiles_left, int total_num_tiles) { + do { + int mi_row; + const int aligned_rows = mi_cols_aligned_to_sb(mi_rows); + const int sb_rows = (aligned_rows >> MI_BLOCK_SIZE_LOG2); + const int corrupted = 1; + for (mi_row = start_row; mi_row < mi_rows; mi_row += MI_BLOCK_SIZE) { + const int is_last_row = (sb_rows - 1 == mi_row >> MI_BLOCK_SIZE_LOG2); + vp9_set_row(lf_sync, total_num_tiles, mi_row >> MI_BLOCK_SIZE_LOG2, + is_last_row, corrupted); + } + /* If there are multiple tiles, the second tile should start marking row + * progress from row 0. + */ + start_row = 0; + } while (num_tiles_left--); +} + // On entry 'tile_data->data_end' points to the end of the input frame, on exit // it is updated to reflect the bitreader position of the final tile column if // present in the tile buffer group or NULL otherwise. @@ -1461,6 +1480,12 @@ static int tile_worker_hook(void *arg1, void *arg2) { TileInfo *volatile tile = &tile_data->xd.tile; const int final_col = (1 << pbi->common.log2_tile_cols) - 1; const uint8_t *volatile bit_reader_end = NULL; + VP9_COMMON *cm = &pbi->common; + + LFWorkerData *lf_data = tile_data->lf_data; + VP9LfSync *lf_sync = tile_data->lf_sync; + + volatile int mi_row = 0; volatile int n = tile_data->buf_start; tile_data->error_info.setjmp = 1; @@ -1468,14 +1493,26 @@ static int tile_worker_hook(void *arg1, void *arg2) { tile_data->error_info.setjmp = 0; tile_data->xd.corrupted = 1; tile_data->data_end = NULL; + if (pbi->lpf_mt_opt && cm->lf.filter_level && !cm->skip_loop_filter) { + const int num_tiles_left = tile_data->buf_end - n; + const int mi_row_start = mi_row; + set_rows_after_error(lf_sync, mi_row_start, cm->mi_rows, num_tiles_left, + 1 << cm->log2_tile_cols); + } return 0; } tile_data->xd.corrupted = 0; do { - int mi_row, mi_col; + int mi_col; const TileBuffer *const buf = pbi->tile_buffers + n; + + /* Initialize to 0 is safe since we do not deal with streams that have + * more than one row of tiles. (So tile->mi_row_start will be 0) + */ + assert(cm->log2_tile_rows == 0); + mi_row = 0; vp9_zero(tile_data->dqcoeff); vp9_tile_init(tile, &pbi->common, 0, buf->col); setup_token_decoder(buf->data, tile_data->data_end, buf->size, @@ -1493,6 +1530,14 @@ static int tile_worker_hook(void *arg1, void *arg2) { mi_col += MI_BLOCK_SIZE) { decode_partition(tile_data, pbi, mi_row, mi_col, BLOCK_64X64, 4); } + if (pbi->lpf_mt_opt && cm->lf.filter_level && !cm->skip_loop_filter) { + const int aligned_rows = mi_cols_aligned_to_sb(cm->mi_rows); + const int sb_rows = (aligned_rows >> MI_BLOCK_SIZE_LOG2); + const int is_last_row = (sb_rows - 1 == mi_row >> MI_BLOCK_SIZE_LOG2); + vp9_set_row(lf_sync, 1 << cm->log2_tile_cols, + mi_row >> MI_BLOCK_SIZE_LOG2, is_last_row, + tile_data->xd.corrupted); + } } if (buf->col == final_col) { @@ -1500,6 +1545,21 @@ static int tile_worker_hook(void *arg1, void *arg2) { } } while (!tile_data->xd.corrupted && ++n <= tile_data->buf_end); + if (pbi->lpf_mt_opt && n < tile_data->buf_end && cm->lf.filter_level && + !cm->skip_loop_filter) { + /* This was not incremented in the tile loop, so increment before tiles left + * calculation + */ + ++n; + set_rows_after_error(lf_sync, 0, cm->mi_rows, tile_data->buf_end - n, + 1 << cm->log2_tile_cols); + } + + if (pbi->lpf_mt_opt && !tile_data->xd.corrupted && cm->lf.filter_level && + !cm->skip_loop_filter) { + vp9_loopfilter_rows(lf_data, lf_sync); + } + tile_data->data_end = bit_reader_end; return !tile_data->xd.corrupted; } @@ -1516,6 +1576,8 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi, const uint8_t *data, VP9_COMMON *const cm = &pbi->common; const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); const uint8_t *bit_reader_end = NULL; + VP9LfSync *lf_row_sync = &pbi->lf_row_sync; + YV12_BUFFER_CONFIG *const new_fb = get_frame_new_buffer(cm); const int aligned_mi_cols = mi_cols_aligned_to_sb(cm->mi_cols); const int tile_cols = 1 << cm->log2_tile_cols; const int tile_rows = 1 << cm->log2_tile_rows; @@ -1542,12 +1604,26 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi, const uint8_t *data, } } + // Initialize LPF + if (pbi->lpf_mt_opt && cm->lf.filter_level && !cm->skip_loop_filter) { + vp9_lpf_mt_init(lf_row_sync, cm, cm->lf.filter_level, + pbi->num_tile_workers); + } + // Reset tile decoding hook for (n = 0; n < num_workers; ++n) { VPxWorker *const worker = &pbi->tile_workers[n]; TileWorkerData *const tile_data = &pbi->tile_worker_data[n + pbi->total_tiles]; winterface->sync(worker); + + if (pbi->lpf_mt_opt && cm->lf.filter_level && !cm->skip_loop_filter) { + tile_data->lf_sync = lf_row_sync; + tile_data->lf_data = &tile_data->lf_sync->lfdata[n]; + vp9_loop_filter_data_reset(tile_data->lf_data, new_fb, cm, pbi->mb.plane); + tile_data->lf_data->y_only = 0; + } + tile_data->xd = pbi->mb; tile_data->xd.counts = cm->frame_parallel_decoding_mode ? NULL : &tile_data->counts; @@ -2069,17 +2145,19 @@ void vp9_decode_frame(VP9Decoder *pbi, const uint8_t *data, if (pbi->max_threads > 1 && tile_rows == 1 && tile_cols > 1) { // Multi-threaded tile decoder *p_data_end = decode_tiles_mt(pbi, data + first_partition_size, data_end); - if (!xd->corrupted) { - if (!cm->skip_loop_filter) { - // If multiple threads are used to decode tiles, then we use those - // threads to do parallel loopfiltering. - vp9_loop_filter_frame_mt(new_fb, cm, pbi->mb.plane, cm->lf.filter_level, - 0, 0, pbi->tile_workers, pbi->num_tile_workers, - &pbi->lf_row_sync); + if (!pbi->lpf_mt_opt) { + if (!xd->corrupted) { + if (!cm->skip_loop_filter) { + // If multiple threads are used to decode tiles, then we use those + // threads to do parallel loopfiltering. + vp9_loop_filter_frame_mt(new_fb, cm, pbi->mb.plane, + cm->lf.filter_level, 0, 0, pbi->tile_workers, + pbi->num_tile_workers, &pbi->lf_row_sync); + } + } else { + vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, + "Decode failed. Frame data is corrupted."); } - } else { - vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, - "Decode failed. Frame data is corrupted."); } } else { *p_data_end = decode_tiles(pbi, data + first_partition_size, data_end); diff --git a/vp9/decoder/vp9_decoder.h b/vp9/decoder/vp9_decoder.h index 1c488961a..425c8964c 100644 --- a/vp9/decoder/vp9_decoder.h +++ b/vp9/decoder/vp9_decoder.h @@ -37,6 +37,8 @@ typedef struct TileWorkerData { int buf_start, buf_end; // pbi->tile_buffers to decode, inclusive vpx_reader bit_reader; FRAME_COUNTS counts; + LFWorkerData *lf_data; + VP9LfSync *lf_sync; DECLARE_ALIGNED(16, MACROBLOCKD, xd); /* dqcoeff are shared by all the planes. So planes must be decoded serially */ DECLARE_ALIGNED(16, tran_low_t, dqcoeff[32 * 32]); @@ -74,6 +76,7 @@ typedef struct VP9Decoder { int hold_ref_buf; // hold the reference buffer. int row_mt; + int lpf_mt_opt; } VP9Decoder; int vp9_receive_compressed_data(struct VP9Decoder *pbi, size_t size, diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c index fdff87768..6a4cb9acf 100644 --- a/vp9/vp9_dx_iface.c +++ b/vp9/vp9_dx_iface.c @@ -270,6 +270,9 @@ static vpx_codec_err_t init_decoder(vpx_codec_alg_priv_t *ctx) { RANGE_CHECK(ctx, row_mt, 0, 1); ctx->pbi->row_mt = ctx->row_mt; + RANGE_CHECK(ctx, lpf_opt, 0, 1); + ctx->pbi->lpf_mt_opt = ctx->lpf_opt; + // If postprocessing was enabled by the application and a // configuration has not been provided, default it. if (!ctx->postproc_cfg_set && (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC)) @@ -658,6 +661,13 @@ static vpx_codec_err_t ctrl_set_row_mt(vpx_codec_alg_priv_t *ctx, return VPX_CODEC_OK; } +static vpx_codec_err_t ctrl_enable_lpf_opt(vpx_codec_alg_priv_t *ctx, + va_list args) { + ctx->lpf_opt = va_arg(args, int); + + return VPX_CODEC_OK; +} + static vpx_codec_ctrl_fn_map_t decoder_ctrl_maps[] = { { VP8_COPY_REFERENCE, ctrl_copy_reference }, @@ -670,6 +680,7 @@ static vpx_codec_ctrl_fn_map_t decoder_ctrl_maps[] = { { VP9_SET_SKIP_LOOP_FILTER, ctrl_set_skip_loop_filter }, { VP9_DECODE_SVC_SPATIAL_LAYER, ctrl_set_spatial_layer_svc }, { VP9D_SET_ROW_MT, ctrl_set_row_mt }, + { VP9D_SET_LOOP_FILTER_OPT, ctrl_enable_lpf_opt }, // Getters { VPXD_GET_LAST_QUANTIZER, ctrl_get_quantizer }, diff --git a/vp9/vp9_dx_iface.h b/vp9/vp9_dx_iface.h index a1c335278..f60688c4d 100644 --- a/vp9/vp9_dx_iface.h +++ b/vp9/vp9_dx_iface.h @@ -46,6 +46,7 @@ struct vpx_codec_alg_priv { int svc_decoding; int svc_spatial_layer; int row_mt; + int lpf_opt; }; #endif // VPX_VP9_VP9_DX_IFACE_H_ diff --git a/vpx/vp8dx.h b/vpx/vp8dx.h index fd6030107..c31afc1e6 100644 --- a/vpx/vp8dx.h +++ b/vpx/vp8dx.h @@ -132,6 +132,16 @@ enum vp8_dec_control_id { */ VP9D_SET_ROW_MT, + /*!\brief Codec control function to set loopfilter optimization. + * + * 0 : off, Loop filter is done after all tiles have been decoded + * 1 : on, Loop filter is done immediately after decode without + * waiting for all threads to sync. + * + * Supported in codecs: VP9 + */ + VP9D_SET_LOOP_FILTER_OPT, + VP8_DECODER_CTRL_ID_MAX }; @@ -191,6 +201,8 @@ VPX_CTRL_USE_TYPE(VP9_DECODE_SVC_SPATIAL_LAYER, int) VPX_CTRL_USE_TYPE(VP9_SET_SKIP_LOOP_FILTER, int) #define VPX_CTRL_VP9_DECODE_SET_ROW_MT VPX_CTRL_USE_TYPE(VP9D_SET_ROW_MT, int) +#define VPX_CTRL_VP9_SET_LOOP_FILTER_OPT +VPX_CTRL_USE_TYPE(VP9D_SET_LOOP_FILTER_OPT, int) /*!\endcond */ /*! @} - end defgroup vp8_decoder */ diff --git a/vpx_util/vpx_thread.h b/vpx_util/vpx_thread.h index 43a978007..19a8bfe95 100644 --- a/vpx_util/vpx_thread.h +++ b/vpx_util/vpx_thread.h @@ -159,6 +159,23 @@ static INLINE int pthread_cond_init(pthread_cond_t *const condition, return 0; } +static INLINE int pthread_cond_broadcast(pthread_cond_t *const condition) { + int ok = 1; +#ifdef USE_WINDOWS_CONDITION_VARIABLE + WakeAllConditionVariable(condition); +#else + while (WaitForSingleObject(condition->waiting_sem_, 0) == WAIT_OBJECT_0) { + // a thread is waiting in pthread_cond_wait: allow it to be notified + ok &= SetEvent(condition->signal_event_); + // wait until the event is consumed so the signaler cannot consume + // the event via its own pthread_cond_wait. + ok &= (WaitForSingleObject(condition->received_sem_, INFINITE) != + WAIT_OBJECT_0); + } +#endif + return !ok; +} + static INLINE int pthread_cond_signal(pthread_cond_t *const condition) { int ok = 1; #ifdef USE_WINDOWS_CONDITION_VARIABLE diff --git a/vpxdec.c b/vpxdec.c index 522eda126..eaa28bd84 100644 --- a/vpxdec.c +++ b/vpxdec.c @@ -100,19 +100,39 @@ static const arg_def_t framestatsarg = ARG_DEF(NULL, "framestats", 1, "Output per-frame stats (.csv format)"); static const arg_def_t rowmtarg = ARG_DEF(NULL, "row-mt", 1, "Enable multi-threading to run row-wise in VP9"); - -static const arg_def_t *all_args[] = { - &help, &codecarg, &use_yv12, &use_i420, - &flipuvarg, &rawvideo, &noblitarg, &progressarg, - &limitarg, &skiparg, &postprocarg, &summaryarg, - &outputfile, &threadsarg, &frameparallelarg, &verbosearg, - &scalearg, &fb_arg, &md5arg, &error_concealment, - &continuearg, +static const arg_def_t lpfoptarg = + ARG_DEF(NULL, "lpf-opt", 1, + "Do loopfilter without waiting for all threads to sync."); + +static const arg_def_t *all_args[] = { &help, + &codecarg, + &use_yv12, + &use_i420, + &flipuvarg, + &rawvideo, + &noblitarg, + &progressarg, + &limitarg, + &skiparg, + &postprocarg, + &summaryarg, + &outputfile, + &threadsarg, + &frameparallelarg, + &verbosearg, + &scalearg, + &fb_arg, + &md5arg, + &error_concealment, + &continuearg, #if CONFIG_VP9_HIGHBITDEPTH - &outbitdeptharg, + &outbitdeptharg, #endif - &svcdecodingarg, &framestatsarg, &rowmtarg, NULL -}; + &svcdecodingarg, + &framestatsarg, + &rowmtarg, + &lpfoptarg, + NULL }; #if CONFIG_VP8_DECODER static const arg_def_t addnoise_level = @@ -509,6 +529,7 @@ static int main_loop(int argc, const char **argv_) { int ec_enabled = 0; int keep_going = 0; int enable_row_mt = 0; + int enable_lpf_opt = 0; const VpxInterface *interface = NULL; const VpxInterface *fourcc_interface = NULL; uint64_t dx_time = 0; @@ -633,6 +654,8 @@ static int main_loop(int argc, const char **argv_) { } } else if (arg_match(&arg, &rowmtarg, argi)) { enable_row_mt = arg_parse_uint(&arg); + } else if (arg_match(&arg, &lpfoptarg, argi)) { + enable_lpf_opt = arg_parse_uint(&arg); } #if CONFIG_VP8_DECODER else if (arg_match(&arg, &addnoise_level, argi)) { @@ -764,6 +787,12 @@ static int main_loop(int argc, const char **argv_) { vpx_codec_error(&decoder)); goto fail; } + if (interface->fourcc == VP9_FOURCC && + vpx_codec_control(&decoder, VP9D_SET_LOOP_FILTER_OPT, enable_lpf_opt)) { + fprintf(stderr, "Failed to set decoder in optimized loopfilter mode: %s\n", + vpx_codec_error(&decoder)); + goto fail; + } if (!quiet) fprintf(stderr, "%s\n", decoder.name); #if CONFIG_VP8_DECODER -- 2.40.0