From 20c1edf6124fde58b9f755e39129289918a5fd1a Mon Sep 17 00:00:00 2001 From: hkuang Date: Thu, 15 May 2014 10:51:55 -0700 Subject: [PATCH] Refactor decode_tiles and loopfilter code. The current decode_tiles decodes the frame tile by tile and then either loopfilters the whole frame or uses another worker thread to do the loopfiltering. |------|------|------|------| |Tile1-|Tile2-|Tile3-|Tile4-| |------|------|------|------| For example, if a tiled video has one tile row and four tile cols, decode_tiles will decode Tile1, then Tile2, then Tile3, then Tile4. While decoding each tile, decode_tile will decode row by row within that tile. For frame parallel decoding, decode_tiles will decode video in row order across the tiles. So the order will be: "Decode 1st row of Tile1" -> "Decode 1st row of Tile2" -> "Decode 1st row of Tile3" -> "Decode 1st row of Tile4" -> "Decode 2nd row of Tile1" -> "Decode 2nd row of Tile2" -> "Decode 2nd row of Tile3" -> "Decode 2nd row of Tile4" -> "loopfilter 1st row" Change-Id: I2211f9adc6d142fbf411d491031203cb8a6dbf6b --- vp9/common/vp9_tile_common.c | 10 +- vp9/common/vp9_tile_common.h | 3 + vp9/decoder/vp9_decodeframe.c | 192 ++++++++++++++++++---------------- vp9/decoder/vp9_decoder.c | 1 + vp9/decoder/vp9_decoder.h | 11 +- 5 files changed, 123 insertions(+), 94 deletions(-) diff --git a/vp9/common/vp9_tile_common.c b/vp9/common/vp9_tile_common.c index 78909dd9b..8c4a30353 100644 --- a/vp9/common/vp9_tile_common.c +++ b/vp9/common/vp9_tile_common.c @@ -21,13 +21,21 @@ static int get_tile_offset(int idx, int mis, int log2) { return MIN(offset, mis); } -void vp9_tile_init(TileInfo *tile, const VP9_COMMON *cm, int row, int col) { +void vp9_tile_set_row(TileInfo *tile, const VP9_COMMON *cm, int row) { tile->mi_row_start = get_tile_offset(row, cm->mi_rows, cm->log2_tile_rows); tile->mi_row_end = get_tile_offset(row + 1, cm->mi_rows, cm->log2_tile_rows); +} + +void vp9_tile_set_col(TileInfo *tile, const VP9_COMMON *cm, int col) { tile->mi_col_start = 
get_tile_offset(col, cm->mi_cols, cm->log2_tile_cols); tile->mi_col_end = get_tile_offset(col + 1, cm->mi_cols, cm->log2_tile_cols); } +void vp9_tile_init(TileInfo *tile, const VP9_COMMON *cm, int row, int col) { + vp9_tile_set_row(tile, cm, row); + vp9_tile_set_col(tile, cm, col); +} + void vp9_get_tile_n_bits(int mi_cols, int *min_log2_tile_cols, int *max_log2_tile_cols) { const int sb_cols = mi_cols_aligned_to_sb(mi_cols) >> MI_BLOCK_SIZE_LOG2; diff --git a/vp9/common/vp9_tile_common.h b/vp9/common/vp9_tile_common.h index a97719e29..ae58805de 100644 --- a/vp9/common/vp9_tile_common.h +++ b/vp9/common/vp9_tile_common.h @@ -27,6 +27,9 @@ typedef struct TileInfo { void vp9_tile_init(TileInfo *tile, const struct VP9Common *cm, int row, int col); +void vp9_tile_set_row(TileInfo *tile, const struct VP9Common *cm, int row); +void vp9_tile_set_col(TileInfo *tile, const struct VP9Common *cm, int col); + void vp9_get_tile_n_bits(int mi_cols, int *min_log2_tile_cols, int *max_log2_tile_cols); diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c index 3124158bd..de58939fc 100644 --- a/vp9/decoder/vp9_decodeframe.c +++ b/vp9/decoder/vp9_decodeframe.c @@ -675,64 +675,6 @@ static void setup_frame_size_with_refs(VP9_COMMON *cm, setup_display_size(cm, rb); } -static void decode_tile(VP9Decoder *pbi, const TileInfo *const tile, - int do_loopfilter_inline, vp9_reader *r) { - const int num_threads = pbi->max_threads; - VP9_COMMON *const cm = &pbi->common; - int mi_row, mi_col; - MACROBLOCKD *xd = &pbi->mb; - - if (do_loopfilter_inline) { - LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1; - lf_data->frame_buffer = get_frame_new_buffer(cm); - lf_data->cm = cm; - vp9_copy(lf_data->planes, pbi->mb.plane); - lf_data->stop = 0; - lf_data->y_only = 0; - vp9_loop_filter_frame_init(cm, cm->lf.filter_level); - } - - for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end; - mi_row += MI_BLOCK_SIZE) { - // For a SB there are 2 left contexts, each 
pertaining to a MB row within - vp9_zero(xd->left_context); - vp9_zero(xd->left_seg_context); - for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end; - mi_col += MI_BLOCK_SIZE) { - decode_partition(cm, xd, tile, mi_row, mi_col, r, BLOCK_64X64); - } - - if (do_loopfilter_inline) { - const int lf_start = mi_row - MI_BLOCK_SIZE; - LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1; - - // delay the loopfilter by 1 macroblock row. - if (lf_start < 0) continue; - - // decoding has completed: finish up the loop filter in this thread. - if (mi_row + MI_BLOCK_SIZE >= tile->mi_row_end) continue; - - vp9_worker_sync(&pbi->lf_worker); - lf_data->start = lf_start; - lf_data->stop = mi_row; - if (num_threads > 1) { - vp9_worker_launch(&pbi->lf_worker); - } else { - vp9_worker_execute(&pbi->lf_worker); - } - } - } - - if (do_loopfilter_inline) { - LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1; - - vp9_worker_sync(&pbi->lf_worker); - lf_data->start = lf_data->stop; - lf_data->stop = cm->mi_rows; - vp9_worker_execute(&pbi->lf_worker); - } -} - static void setup_tile_info(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) { int min_log2_tile_cols, max_log2_tile_cols, max_ones; vp9_get_tile_n_bits(cm->mi_cols, &min_log2_tile_cols, &max_log2_tile_cols); @@ -811,16 +753,35 @@ static void get_tile_buffers(VP9Decoder *pbi, static const uint8_t *decode_tiles(VP9Decoder *pbi, const uint8_t *data, - const uint8_t *data_end, - int do_loopfilter_inline) { + const uint8_t *data_end) { VP9_COMMON *const cm = &pbi->common; const int aligned_cols = mi_cols_aligned_to_sb(cm->mi_cols); const int tile_cols = 1 << cm->log2_tile_cols; const int tile_rows = 1 << cm->log2_tile_rows; TileBuffer tile_buffers[4][1 << 6]; int tile_row, tile_col; - const uint8_t *end = NULL; - vp9_reader r; + int mi_row, mi_col; + TileData *tile_data = NULL; + + if (cm->lf.filter_level && pbi->lf_worker.data1 == NULL) { + CHECK_MEM_ERROR(cm, pbi->lf_worker.data1, + vpx_memalign(32, 
sizeof(LFWorkerData))); + pbi->lf_worker.hook = (VP9WorkerHook)vp9_loop_filter_worker; + if (pbi->max_threads > 1 && !vp9_worker_reset(&pbi->lf_worker)) { + vpx_internal_error(&cm->error, VPX_CODEC_ERROR, + "Loop filter thread creation failed"); + } + } + + if (cm->lf.filter_level) { + LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1; + lf_data->frame_buffer = get_frame_new_buffer(cm); + lf_data->cm = cm; + vp9_copy(lf_data->planes, pbi->mb.plane); + lf_data->stop = 0; + lf_data->y_only = 0; + vp9_loop_filter_frame_init(cm, cm->lf.filter_level); + } assert(tile_rows <= 4); assert(tile_cols <= (1 << 6)); @@ -835,26 +796,88 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi, get_tile_buffers(pbi, data, data_end, tile_cols, tile_rows, tile_buffers); - // Decode tiles using data from tile_buffers + if (pbi->tile_data == NULL || + (tile_cols * tile_rows) != pbi->total_tiles) { + vpx_free(pbi->tile_data); + CHECK_MEM_ERROR( + cm, + pbi->tile_data, + vpx_malloc(tile_cols * tile_rows * (sizeof(*pbi->tile_data)))); + pbi->total_tiles = tile_rows * tile_cols; + } + + // Load all tile information into tile_data. for (tile_row = 0; tile_row < tile_rows; ++tile_row) { for (tile_col = 0; tile_col < tile_cols; ++tile_col) { - const int col = pbi->inv_tile_order ? 
tile_cols - tile_col - 1 : tile_col; - const int last_tile = tile_row == tile_rows - 1 && - col == tile_cols - 1; - const TileBuffer *const buf = &tile_buffers[tile_row][col]; TileInfo tile; + const TileBuffer *const buf = &tile_buffers[tile_row][tile_col]; + tile_data = pbi->tile_data + tile_cols * tile_row + tile_col; + tile_data->cm = cm; + tile_data->xd = pbi->mb; + tile_data->xd.corrupted = 0; + vp9_tile_init(&tile, tile_data->cm, tile_row, tile_col); + setup_token_decoder(buf->data, data_end, buf->size, &cm->error, + &tile_data->bit_reader, pbi->decrypt_cb, + pbi->decrypt_state); + init_macroblockd(cm, &tile_data->xd); + vp9_zero(tile_data->xd.dqcoeff); + } + } - vp9_tile_init(&tile, cm, tile_row, col); - setup_token_decoder(buf->data, data_end, buf->size, &cm->error, &r, - pbi->decrypt_cb, pbi->decrypt_state); - decode_tile(pbi, &tile, do_loopfilter_inline, &r); - - if (last_tile) - end = vp9_reader_find_end(&r); + for (tile_row = 0; tile_row < tile_rows; ++tile_row) { + TileInfo tile; + vp9_tile_set_row(&tile, cm, tile_row); + for (mi_row = tile.mi_row_start; mi_row < tile.mi_row_end; + mi_row += MI_BLOCK_SIZE) { + for (tile_col = 0; tile_col < tile_cols; ++tile_col) { + const int col = pbi->inv_tile_order ? + tile_cols - tile_col - 1 : tile_col; + tile_data = pbi->tile_data + tile_cols * tile_row + col; + vp9_tile_set_col(&tile, tile_data->cm, col); + vp9_zero(tile_data->xd.left_context); + vp9_zero(tile_data->xd.left_seg_context); + for (mi_col = tile.mi_col_start; mi_col < tile.mi_col_end; + mi_col += MI_BLOCK_SIZE) { + decode_partition(tile_data->cm, &tile_data->xd, &tile, mi_row, mi_col, + &tile_data->bit_reader, BLOCK_64X64); + } + } + // Loopfilter one row. + if (cm->lf.filter_level) { + const int lf_start = mi_row - MI_BLOCK_SIZE; + LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1; + + // delay the loopfilter by 1 macroblock row. + if (lf_start < 0) continue; + + // decoding has completed: finish up the loop filter in this thread. 
+ if (mi_row + MI_BLOCK_SIZE >= cm->mi_rows) continue; + + vp9_worker_sync(&pbi->lf_worker); + lf_data->start = lf_start; + lf_data->stop = mi_row; + if (pbi->max_threads > 1) { + vp9_worker_launch(&pbi->lf_worker); + } else { + vp9_worker_execute(&pbi->lf_worker); + } + } } } - return end; + // Loopfilter remaining rows in the frame. + if (cm->lf.filter_level) { + LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1; + vp9_worker_sync(&pbi->lf_worker); + lf_data->start = lf_data->stop; + lf_data->stop = cm->mi_rows; + vp9_worker_execute(&pbi->lf_worker); + } + + // Get last tile data. + tile_data = pbi->tile_data + tile_cols * tile_rows - 1; + + return vp9_reader_find_end(&tile_data->bit_reader); } static int tile_worker_hook(void *arg1, void *arg2) { @@ -1279,7 +1302,6 @@ static struct vp9_read_bit_buffer* init_read_bit_buffer( const uint8_t *data, const uint8_t *data_end, uint8_t *clear_data /* buffer size MAX_VP9_HEADER_SIZE */) { - vp9_zero(*rb); rb->bit_offset = 0; rb->error_handler = error_handler; rb->error_handler_data = &pbi->common; @@ -1300,7 +1322,7 @@ int vp9_decode_frame(VP9Decoder *pbi, const uint8_t **p_data_end) { VP9_COMMON *const cm = &pbi->common; MACROBLOCKD *const xd = &pbi->mb; - struct vp9_read_bit_buffer rb; + struct vp9_read_bit_buffer rb = { 0 }; uint8_t clear_data[MAX_VP9_HEADER_SIZE]; const size_t first_partition_size = read_uncompressed_header(pbi, init_read_bit_buffer(pbi, &rb, data, data_end, clear_data)); @@ -1308,8 +1330,6 @@ int vp9_decode_frame(VP9Decoder *pbi, const int tile_rows = 1 << cm->log2_tile_rows; const int tile_cols = 1 << cm->log2_tile_cols; YV12_BUFFER_CONFIG *const new_fb = get_frame_new_buffer(cm); - const int do_loopfilter_inline = tile_rows == 1 && tile_cols == 1 && - cm->lf.filter_level; xd->cur_buf = new_fb; if (!first_partition_size) { @@ -1352,19 +1372,7 @@ int vp9_decode_frame(VP9Decoder *pbi, // to do parallel loopfiltering. 
vp9_loop_filter_frame_mt(new_fb, pbi, cm, cm->lf.filter_level, 0); } else { - if (do_loopfilter_inline && pbi->lf_worker.data1 == NULL) { - CHECK_MEM_ERROR(cm, pbi->lf_worker.data1, - vpx_memalign(32, sizeof(LFWorkerData))); - pbi->lf_worker.hook = (VP9WorkerHook)vp9_loop_filter_worker; - if (pbi->max_threads > 1 && !vp9_worker_reset(&pbi->lf_worker)) { - vpx_internal_error(&cm->error, VPX_CODEC_ERROR, - "Loop filter thread creation failed"); - } - } - *p_data_end = decode_tiles(pbi, data + first_partition_size, data_end, - do_loopfilter_inline); - if (!do_loopfilter_inline) - vp9_loop_filter_frame(new_fb, cm, &pbi->mb, cm->lf.filter_level, 0, 0); + *p_data_end = decode_tiles(pbi, data + first_partition_size, data_end); } new_fb->corrupted |= xd->corrupted; diff --git a/vp9/decoder/vp9_decoder.c b/vp9/decoder/vp9_decoder.c index 98b890b63..e1292c222 100644 --- a/vp9/decoder/vp9_decoder.c +++ b/vp9/decoder/vp9_decoder.c @@ -90,6 +90,7 @@ void vp9_decoder_remove(VP9Decoder *pbi) { vp9_remove_common(cm); vp9_worker_end(&pbi->lf_worker); vpx_free(pbi->lf_worker.data1); + vpx_free(pbi->tile_data); for (i = 0; i < pbi->num_tile_workers; ++i) { VP9Worker *const worker = &pbi->tile_workers[i]; vp9_worker_end(worker); diff --git a/vp9/decoder/vp9_decoder.h b/vp9/decoder/vp9_decoder.h index d6cb5071d..36fb7ea94 100644 --- a/vp9/decoder/vp9_decoder.h +++ b/vp9/decoder/vp9_decoder.h @@ -27,6 +27,13 @@ extern "C" { #endif +// TODO(hkuang): combine this with TileWorkerData. +typedef struct TileData { + VP9_COMMON *cm; + vp9_reader bit_reader; + DECLARE_ALIGNED(16, MACROBLOCKD, xd); +} TileData; + typedef struct VP9Decoder { DECLARE_ALIGNED(16, MACROBLOCKD, mb); @@ -40,10 +47,12 @@ typedef struct VP9Decoder { int decoded_key_frame; VP9Worker lf_worker; - VP9Worker *tile_workers; int num_tile_workers; + TileData *tile_data; + int total_tiles; + VP9LfSync lf_row_sync; vpx_decrypt_cb decrypt_cb; -- 2.40.0