From 5deffa1175420c09b89e998b60bf3e26f8379397 Mon Sep 17 00:00:00 2001
From: Vignesh Venkatasubramanian
Date: Wed, 19 Oct 2016 11:07:41 -0700
Subject: [PATCH] vp9_bitstream: Encode tiles in parallel

Re-use the tile worker threads to pack the bitstream in parallel
on a per-tile basis. Restricting this to real-time only for now
(further testing is needed to ensure this does not make 2-pass
worse in any case).

BUG=webm:1309

Change-Id: I8a80da7c5089b837d0df79a5c49d5e3022dfc8ec
---
 test/codec_factory.h        |   6 ++
 test/vp9_ethread_test.cc    |  13 ++--
 vp9/encoder/vp9_bitstream.c | 147 +++++++++++++++++++++++++++++++++++-
 vp9/encoder/vp9_bitstream.h |  20 +++++
 vp9/encoder/vp9_encoder.c   |   5 +-
 vp9/encoder/vp9_encoder.h   |   1 +
 6 files changed, 186 insertions(+), 6 deletions(-)

diff --git a/test/codec_factory.h b/test/codec_factory.h
index 3415284ab..d5882ed9c 100644
--- a/test/codec_factory.h
+++ b/test/codec_factory.h
@@ -65,6 +65,12 @@ class CodecTestWith3Params
     : public ::testing::TestWithParam<
           std::tr1::tuple<const libvpx_test::CodecFactory *, T1, T2, T3> > {};
 
+template <class T1, class T2, class T3, class T4>
+class CodecTestWith4Params
+    : public ::testing::TestWithParam<
+          std::tr1::tuple<const libvpx_test::CodecFactory *, T1, T2, T3, T4> > {
+};
+
 /*
  * VP8 Codec Definitions
  */
diff --git a/test/vp9_ethread_test.cc b/test/vp9_ethread_test.cc
index 0590487fc..804dc8956 100644
--- a/test/vp9_ethread_test.cc
+++ b/test/vp9_ethread_test.cc
@@ -20,10 +20,12 @@ namespace {
 
 class VPxEncoderThreadTest
     : public ::libvpx_test::EncoderTest,
-      public ::libvpx_test::CodecTestWith2Params<libvpx_test::TestMode, int> {
+      public ::libvpx_test::CodecTestWith4Params<libvpx_test::TestMode, int,
+                                                 int, int> {
  protected:
   VPxEncoderThreadTest()
-      : EncoderTest(GET_PARAM(0)), encoder_initialized_(false), tiles_(2),
+      : EncoderTest(GET_PARAM(0)), encoder_initialized_(false),
+        tiles_(GET_PARAM(3)), threads_(GET_PARAM(4)),
         encoding_mode_(GET_PARAM(1)), set_cpu_used_(GET_PARAM(2)) {
     init_flags_ = VPX_CODEC_USE_PSNR;
     md5_.clear();
@@ -91,6 +93,7 @@ class VPxEncoderThreadTest
 
   bool encoder_initialized_;
   int tiles_;
+  int threads_;
   ::libvpx_test::TestMode encoding_mode_;
   int set_cpu_used_;
   std::vector<std::string> md5_;
@@ -111,7 +114,7 @@ TEST_P(VPxEncoderThreadTest, EncoderResultTest) {
   md5_.clear();
 
   // Encode using multiple threads.
-  cfg_.g_threads = 4;
+  cfg_.g_threads = threads_;
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
   multi_thr_md5 = md5_;
   md5_.clear();
@@ -124,5 +127,7 @@ VP9_INSTANTIATE_TEST_CASE(VPxEncoderThreadTest,
                           ::testing::Values(::libvpx_test::kTwoPassGood,
                                             ::libvpx_test::kOnePassGood,
                                             ::libvpx_test::kRealTime),
-                          ::testing::Range(1, 9));
+                          ::testing::Range(1, 9),   // cpu_used
+                          ::testing::Range(0, 3),   // tile_columns
+                          ::testing::Range(2, 5));  // threads
 }  // namespace
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index 22b28de83..49aea69eb 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -915,6 +915,144 @@ int vp9_get_refresh_mask(VP9_COMP *cpi) {
   }
 }
 
+// Worker hook: writes tile data->tile_idx into data->dest via write_modes().
+// Always returns 1.
+static int encode_tile_worker(VP9_COMP *cpi, VP9BitstreamWorkerData *data) {
+  MACROBLOCKD *const xd = &data->xd;
+  vpx_start_encode(&data->bit_writer, data->dest);
+  write_modes(cpi, xd, &cpi->tile_data[data->tile_idx].tile_info,
+              &data->bit_writer, &data->tok, data->tok_end,
+              &data->max_mv_magnitude, data->interp_filter_selected);
+  assert(data->tok == data->tok_end);
+  vpx_stop_encode(&data->bit_writer);
+  return 1;
+}
+
+// Frees the per-worker output buffers and the worker data array, if any.
+void vp9_bitstream_encode_tiles_buffer_dealloc(VP9_COMP *const cpi) {
+  if (cpi->vp9_bitstream_worker_data) {
+    int i;
+    for (i = 1; i < cpi->num_workers; ++i) {
+      vpx_free(cpi->vp9_bitstream_worker_data[i].dest);
+    }
+    vpx_free(cpi->vp9_bitstream_worker_data);
+    cpi->vp9_bitstream_worker_data = NULL;
+  }
+}
+
+// Allocates the worker data array plus one width * height byte buffer for
+// every worker except worker 0. Returns 0 on success and 1 on failure, in
+// which case nothing is left allocated.
+static int encode_tiles_buffer_alloc(VP9_COMP *const cpi) {
+  int i;
+  const size_t worker_data_size =
+      cpi->num_workers * sizeof(*cpi->vp9_bitstream_worker_data);
+  cpi->vp9_bitstream_worker_data = vpx_memalign(16, worker_data_size);
+  // The allocation must be checked before the memory is touched.
+  if (!cpi->vp9_bitstream_worker_data) return 1;
+  memset(cpi->vp9_bitstream_worker_data, 0, worker_data_size);
+  for (i = 1; i < cpi->num_workers; ++i) {
+    cpi->vp9_bitstream_worker_data[i].dest_size =
+        cpi->oxcf.width * cpi->oxcf.height;
+    cpi->vp9_bitstream_worker_data[i].dest =
+        vpx_malloc(cpi->vp9_bitstream_worker_data[i].dest_size);
+    if (!cpi->vp9_bitstream_worker_data[i].dest) {
+      // Drop the partial allocation so that a later call starts over
+      // instead of reusing a worker whose dest is NULL.
+      vp9_bitstream_encode_tiles_buffer_dealloc(cpi);
+      return 1;
+    }
+  }
+  return 0;
+}
+
+// Packs all tile columns using the tile worker threads, one tile per
+// worker per pass. Worker 0 writes straight into data_ptr; the other
+// workers write into their own buffers, which are copied into data_ptr
+// in tile order once each worker has been synced. Returns the number of
+// bytes written, or 0 if buffer allocation or winterface->sync() fails.
+static size_t encode_tiles_mt(VP9_COMP *cpi, uint8_t *data_ptr) {
+  const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
+  VP9_COMMON *const cm = &cpi->common;
+  const int tile_cols = 1 << cm->log2_tile_cols;
+  const int num_workers = cpi->num_workers;
+  size_t total_size = 0;
+  int tile_col = 0;
+
+  // NOTE(review): only a larger-than-needed buffer triggers reallocation;
+  // confirm that the frame size can never grow past dest_size.
+  if (!cpi->vp9_bitstream_worker_data ||
+      cpi->vp9_bitstream_worker_data[1].dest_size >
+          (cpi->oxcf.width * cpi->oxcf.height)) {
+    vp9_bitstream_encode_tiles_buffer_dealloc(cpi);
+    if (encode_tiles_buffer_alloc(cpi)) return 0;
+  }
+
+  while (tile_col < tile_cols) {
+    int i, j;
+    for (i = 0; i < num_workers && tile_col < tile_cols; ++i) {
+      VPxWorker *const worker = &cpi->workers[i];
+      VP9BitstreamWorkerData *const data = &cpi->vp9_bitstream_worker_data[i];
+
+      // Populate the worker data.
+      data->xd = cpi->td.mb.e_mbd;
+      data->tile_idx = tile_col;
+      data->tok = cpi->tile_tok[0][tile_col];
+      data->tok_end = cpi->tile_tok[0][tile_col] + cpi->tok_count[0][tile_col];
+      data->max_mv_magnitude = cpi->max_mv_magnitude;
+      memset(data->interp_filter_selected, 0,
+             sizeof(data->interp_filter_selected[0][0]) * SWITCHABLE);
+
+      // First thread can directly write into the output buffer.
+      if (i == 0) {
+        // If this worker happens to be for the last tile, then do not offset it
+        // by 4 for the tile size.
+        data->dest =
+            data_ptr + total_size + (tile_col == tile_cols - 1 ? 0 : 4);
+      }
+      worker->data1 = cpi;
+      worker->data2 = data;
+      worker->hook = (VPxWorkerHook)encode_tile_worker;
+      worker->had_error = 0;
+
+      if (i < num_workers - 1) {
+        winterface->launch(worker);
+      } else {
+        winterface->execute(worker);
+      }
+      ++tile_col;
+    }
+    for (j = 0; j < i; ++j) {
+      VPxWorker *const worker = &cpi->workers[j];
+      VP9BitstreamWorkerData *const data =
+          (VP9BitstreamWorkerData *)worker->data2;
+      uint32_t tile_size;
+      int k;
+
+      if (!winterface->sync(worker)) return 0;
+      tile_size = data->bit_writer.pos;
+
+      // Aggregate per-thread bitstream stats.
+      cpi->max_mv_magnitude =
+          VPXMAX(cpi->max_mv_magnitude, data->max_mv_magnitude);
+      for (k = 0; k < SWITCHABLE; ++k) {
+        cpi->interp_filter_selected[0][k] += data->interp_filter_selected[0][k];
+      }
+
+      // Prefix the size of the tile on all but the last.
+      if (tile_col != tile_cols || j < i - 1) {
+        mem_put_be32(data_ptr + total_size, tile_size);
+        total_size += 4;
+      }
+      if (j > 0) {
+        memcpy(data_ptr + total_size, data->dest, tile_size);
+      }
+      total_size += tile_size;
+    }
+  }
+  return total_size;
+}
+
 static size_t encode_tiles(VP9_COMP *cpi, uint8_t *data_ptr) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
@@ -928,6 +1066,14 @@ static size_t encode_tiles(VP9_COMP *cpi, uint8_t *data_ptr) {
   memset(cm->above_seg_context, 0,
          sizeof(*cm->above_seg_context) * mi_cols_aligned_to_sb(cm->mi_cols));
 
+  // Encoding tiles in parallel is done only for realtime mode now. In other
+  // modes the speed up is insignificant and requires further testing to ensure
+  // that it does not make the overall process worse in any case.
+  if (cpi->oxcf.mode == REALTIME && cpi->num_workers > 1 && tile_rows == 1 &&
+      tile_cols > 1) {
+    return encode_tiles_mt(cpi, data_ptr);
+  }
+
   for (tile_row = 0; tile_row < tile_rows; tile_row++) {
     for (tile_col = 0; tile_col < tile_cols; tile_col++) {
       int tile_idx = tile_row * tile_cols + tile_col;
@@ -955,7 +1101,6 @@ static size_t encode_tiles(VP9_COMP *cpi, uint8_t *data_ptr) {
       total_size += residual_bc.pos;
     }
   }
-
   return total_size;
 }
 
diff --git a/vp9/encoder/vp9_bitstream.h b/vp9/encoder/vp9_bitstream.h
index 8c97d37f7..044a3bbc7 100644
--- a/vp9/encoder/vp9_bitstream.h
+++ b/vp9/encoder/vp9_bitstream.h
@@ -17,8 +17,28 @@ extern "C" {
 
 #include "vp9/encoder/vp9_encoder.h"
 
+// Per-worker state used when packing tiles in parallel.
+typedef struct VP9BitstreamWorkerData {
+  uint8_t *dest;  // Buffer the worker writes its tile into.
+  int dest_size;  // Size of the buffer allocated for dest, in bytes.
+  TOKENEXTRA *tok;
+  TOKENEXTRA *tok_end;
+  vpx_writer bit_writer;
+  int tile_idx;
+  unsigned int max_mv_magnitude;
+  // The size of interp_filter_selected in VP9_COMP is actually
+  // MAX_REFERENCE_FRAMES x SWITCHABLE. But when encoding tiles, all we ever do
+  // is increment the very first index (index 0) for the first dimension. Hence
+  // this is sufficient.
+  int interp_filter_selected[1][SWITCHABLE];
+  DECLARE_ALIGNED(16, MACROBLOCKD, xd);
+} VP9BitstreamWorkerData;
+
 int vp9_get_refresh_mask(VP9_COMP *cpi);
 
+// Frees the buffers used for packing tiles in parallel, if any.
+void vp9_bitstream_encode_tiles_buffer_dealloc(VP9_COMP *const cpi);
+
 void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, size_t *size);
 
 static INLINE int vp9_preserve_existing_gf(VP9_COMP *cpi) {
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index d98c4938e..43b708be8 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -2030,7 +2030,10 @@ void vp9_remove_compressor(VP9_COMP *cpi) {
   vpx_free(cpi->tile_thr_data);
   vpx_free(cpi->workers);
 
-  if (cpi->num_workers > 1) vp9_loop_filter_dealloc(&cpi->lf_row_sync);
+  if (cpi->num_workers > 1) {
+    vp9_loop_filter_dealloc(&cpi->lf_row_sync);
+    vp9_bitstream_encode_tiles_buffer_dealloc(cpi);
+  }
 
   vp9_alt_ref_aq_destroy(cpi->alt_ref_aq);
 
diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h
index 77eb31cb3..e353d4779 100644
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h
@@ -601,6 +601,7 @@ typedef struct VP9_COMP {
   VPxWorker *workers;
   struct EncWorkerData *tile_thr_data;
   VP9LfSync lf_row_sync;
+  struct VP9BitstreamWorkerData *vp9_bitstream_worker_data;
 
   int keep_level_stats;
   Vp9LevelInfo level_info;
-- 
2.40.0