From a0ffa2794b9d3d831332f3956c2f88f3f3345aab Mon Sep 17 00:00:00 2001
From: James Zern
Date: Wed, 31 Jul 2013 16:15:10 -0700
Subject: [PATCH] vp9/decoder: threaded row-based loop filter

Currently the only threaded option for vp9 decode. Enabled when the
decoder config thread count is > 1.

Change-Id: I082959abac9e31aa4a38ed9fd68b94680e57f4df
---
 test/vp9_thread_test.cc      | 29 +++++++++++++++++++++++++++++
 vp9/common/vp9_loopfilter.c  |  8 ++++++++
 vp9/common/vp9_loopfilter.h  | 14 ++++++++++++++
 vp9/decoder/vp9_decodframe.c | 31 ++++++++++++++++++++++++++++---
 vp9/decoder/vp9_onyxd_if.c   | 12 ++++++++++++
 vp9/decoder/vp9_onyxd_int.h  |  3 ++-
 6 files changed, 93 insertions(+), 4 deletions(-)

diff --git a/test/vp9_thread_test.cc b/test/vp9_thread_test.cc
index 308ba9438..41d22dd3a 100644
--- a/test/vp9_thread_test.cc
+++ b/test/vp9_thread_test.cc
@@ -11,6 +11,10 @@
 #include "vp9/decoder/vp9_thread.h"
 
 #include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/decode_test_driver.h"
+#include "test/md5_helper.h"
+#include "test/webm_video_source.h"
 
 namespace {
 
@@ -77,4 +81,29 @@ TEST_F(VP9WorkerThreadTest, HookFailure) {
   EXPECT_FALSE(worker_.had_error);
 }
 
+TEST(VP9DecodeMTTest, MTDecode) {
+  libvpx_test::WebMVideoSource video("vp90-2-03-size-226x226.webm");
+  video.Init();
+
+  vpx_codec_dec_cfg_t cfg = {0};
+  cfg.threads = 2;
+  libvpx_test::VP9Decoder decoder(cfg, 0);
+
+  libvpx_test::MD5 md5;
+  for (video.Begin(); video.cxdata(); video.Next()) {
+    const vpx_codec_err_t res =
+        decoder.DecodeFrame(video.cxdata(), video.frame_size());
+    ASSERT_EQ(VPX_CODEC_OK, res) << decoder.DecodeError();
+
+    libvpx_test::DxDataIterator dec_iter = decoder.GetDxData();
+    const vpx_image_t *img = NULL;
+
+    // Get decompressed data
+    while ((img = dec_iter.Next())) {
+      md5.Add(img);
+    }
+  }
+  EXPECT_STREQ("b35a1b707b28e82be025d960aba039bc", md5.Get());
+}
+
 }  // namespace
diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c
index 5498b1717..c57f0a55d 100644
--- a/vp9/common/vp9_loopfilter.c
+++ b/vp9/common/vp9_loopfilter.c
@@ -376,3 +376,11 @@ void vp9_loop_filter_frame(VP9_COMMON *cm, MACROBLOCKD *xd,
   vp9_loop_filter_rows(cm->frame_to_show, cm, xd,
                        0, cm->mi_rows, y_only);
 }
+
+int vp9_loop_filter_worker(void *arg1, void *arg2) {
+  LFWorkerData *const lf_data = (LFWorkerData*)arg1;
+  (void)arg2;
+  vp9_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, &lf_data->xd,
+                       lf_data->start, lf_data->stop, lf_data->y_only);
+  return 1;
+}
diff --git a/vp9/common/vp9_loopfilter.h b/vp9/common/vp9_loopfilter.h
index e59cc6485..c6fe112ec 100644
--- a/vp9/common/vp9_loopfilter.h
+++ b/vp9/common/vp9_loopfilter.h
@@ -64,4 +64,18 @@ void vp9_loop_filter_frame(struct VP9Common *cm,
 void vp9_loop_filter_rows(const YV12_BUFFER_CONFIG *frame_buffer,
                           struct VP9Common *cm, struct macroblockd *xd,
                           int start, int stop, int y_only);
+
+typedef struct LoopFilterWorkerData {
+  const YV12_BUFFER_CONFIG *frame_buffer;
+  struct VP9Common *cm;
+  struct macroblockd xd;  // TODO(jzern): most of this is unnecessary to the
+                          // loopfilter. the planes are necessary as their state
+                          // is changed during decode.
+  int start;
+  int stop;
+  int y_only;
+} LFWorkerData;
+
+// Operates on the rows described by LFWorkerData passed as 'arg1'.
+int vp9_loop_filter_worker(void *arg1, void *arg2);
 #endif  // VP9_COMMON_VP9_LOOPFILTER_H_
diff --git a/vp9/decoder/vp9_decodframe.c b/vp9/decoder/vp9_decodframe.c
index ff7cb8d34..2b6f5a9c6 100644
--- a/vp9/decoder/vp9_decodframe.c
+++ b/vp9/decoder/vp9_decodframe.c
@@ -34,6 +34,7 @@
 #include "vp9/decoder/vp9_idct_blk.h"
 #include "vp9/decoder/vp9_onyxd_int.h"
 #include "vp9/decoder/vp9_read_bit_buffer.h"
+#include "vp9/decoder/vp9_thread.h"
 #include "vp9/decoder/vp9_treereader.h"
 
 static int read_be32(const uint8_t *p) {
@@ -585,10 +586,18 @@ static void setup_frame_size_with_refs(VP9D_COMP *pbi,
 }
 
 static void decode_tile(VP9D_COMP *pbi, vp9_reader *r) {
+  const int num_threads = pbi->oxcf.max_threads;
   VP9_COMMON *const pc = &pbi->common;
   int mi_row, mi_col;
 
   if (pbi->do_loopfilter_inline) {
+    if (num_threads > 1) {
+      LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1;
+      lf_data->frame_buffer = &pbi->common.yv12_fb[pbi->common.new_fb_idx];
+      lf_data->cm = pc;
+      lf_data->xd = pbi->mb;
+      lf_data->y_only = 0;
+    }
     vp9_loop_filter_frame_init(pc, &pbi->mb, pbi->mb.lf.filter_level);
   }
 
@@ -603,17 +612,33 @@ static void decode_tile(VP9D_COMP *pbi, vp9_reader *r) {
     }
 
     if (pbi->do_loopfilter_inline) {
-      YV12_BUFFER_CONFIG *const fb =
-          &pbi->common.yv12_fb[pbi->common.new_fb_idx];
       // delay the loopfilter by 1 macroblock row.
       const int lf_start = mi_row - MI_BLOCK_SIZE;
       if (lf_start < 0) continue;
-      vp9_loop_filter_rows(fb, pc, &pbi->mb, lf_start, mi_row, 0);
+
+      if (num_threads > 1) {
+        LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1;
+
+        vp9_worker_sync(&pbi->lf_worker);
+        lf_data->start = lf_start;
+        lf_data->stop = mi_row;
+        pbi->lf_worker.hook = vp9_loop_filter_worker;
+        vp9_worker_launch(&pbi->lf_worker);
+      } else {
+        YV12_BUFFER_CONFIG *const fb =
+            &pbi->common.yv12_fb[pbi->common.new_fb_idx];
+        vp9_loop_filter_rows(fb, pc, &pbi->mb, lf_start, mi_row, 0);
+      }
     }
   }
 
   if (pbi->do_loopfilter_inline) {
     YV12_BUFFER_CONFIG *const fb = &pbi->common.yv12_fb[pbi->common.new_fb_idx];
+    if (num_threads > 1) {
+      // TODO(jzern): since the loop filter is delayed one mb row, this will be
+      // forced to wait for the last row scheduled in the for loop.
+      vp9_worker_sync(&pbi->lf_worker);
+    }
     vp9_loop_filter_rows(fb, pc, &pbi->mb,
                          mi_row - MI_BLOCK_SIZE, pc->mi_rows, 0);
   }
diff --git a/vp9/decoder/vp9_onyxd_if.c b/vp9/decoder/vp9_onyxd_if.c
index aefb56f9a..5a01dd790 100644
--- a/vp9/decoder/vp9_onyxd_if.c
+++ b/vp9/decoder/vp9_onyxd_if.c
@@ -141,6 +141,16 @@ VP9D_PTR vp9_create_decompressor(VP9D_CONFIG *oxcf) {
   pbi->common.error.setjmp = 0;
   pbi->decoded_key_frame = 0;
 
+  if (pbi->oxcf.max_threads > 1) {
+    vp9_worker_init(&pbi->lf_worker);
+    pbi->lf_worker.data1 = vpx_malloc(sizeof(LFWorkerData));
+    pbi->lf_worker.hook = (VP9WorkerHook)vp9_loop_filter_worker;
+    if (pbi->lf_worker.data1 == NULL || !vp9_worker_reset(&pbi->lf_worker)) {
+      vp9_remove_decompressor(pbi);
+      return NULL;
+    }
+  }
+
   return pbi;
 }
 
@@ -154,6 +164,8 @@ void vp9_remove_decompressor(VP9D_PTR ptr) {
   vpx_free(pbi->common.last_frame_seg_map);
 
   vp9_remove_common(&pbi->common);
+  vp9_worker_end(&pbi->lf_worker);
+  vpx_free(pbi->lf_worker.data1);
   vpx_free(pbi);
 }
 
diff --git a/vp9/decoder/vp9_onyxd_int.h b/vp9/decoder/vp9_onyxd_int.h
index 607d14c13..a051971a1 100644
--- a/vp9/decoder/vp9_onyxd_int.h
+++ b/vp9/decoder/vp9_onyxd_int.h
@@ -14,8 +14,8 @@
 #include "./vpx_config.h"
 
 #include "vp9/common/vp9_onyxc_int.h"
-
 #include "vp9/decoder/vp9_onyxd.h"
+#include "vp9/decoder/vp9_thread.h"
 
 typedef struct VP9Decompressor {
   DECLARE_ALIGNED(16, MACROBLOCKD, mb);
@@ -38,6 +38,7 @@ typedef struct VP9Decompressor {
   int initial_height;
 
   int do_loopfilter_inline;  // apply loopfilter to available rows immediately
+  VP9Worker lf_worker;
 } VP9D_COMP;
 
 #endif  // VP9_DECODER_VP9_TREEREADER_H_
-- 
2.40.0