From 3a833ea38f1231dd940eb129b0f1caccd370ff10 Mon Sep 17 00:00:00 2001 From: Deb Mukherjee Date: Mon, 4 Nov 2013 16:34:18 -0800 Subject: [PATCH] token_cache changes in decoder Removes stack-allocation of token_cache in decode_coefs function Seems to achieve about 1% decode speed improvement as tested on 25 480p videos. Change-Id: I8e7eb3361fa09d9654dfad0677a6d606701fdc6e --- vp9/decoder/vp9_decodframe.c | 55 +++++++++++++++++++++++------------- vp9/decoder/vp9_detokenize.c | 12 ++++---- vp9/decoder/vp9_detokenize.h | 3 +- vp9/decoder/vp9_onyxd_int.h | 2 ++ 4 files changed, 46 insertions(+), 26 deletions(-) diff --git a/vp9/decoder/vp9_decodframe.c b/vp9/decoder/vp9_decodframe.c index 63b889dcc..1fd9e979a 100644 --- a/vp9/decoder/vp9_decodframe.c +++ b/vp9/decoder/vp9_decodframe.c @@ -41,6 +41,7 @@ typedef struct TileWorkerData { VP9_COMMON *cm; vp9_reader bit_reader; DECLARE_ALIGNED(16, MACROBLOCKD, xd); + DECLARE_ALIGNED(16, unsigned char, token_cache[1024]); } TileWorkerData; static int read_be32(const uint8_t *p) { @@ -297,6 +298,7 @@ struct intra_args { VP9_COMMON *cm; MACROBLOCKD *xd; vp9_reader *r; + unsigned char* token_cache; }; static void predict_and_reconstruct_intra_block(int plane, int block, @@ -326,7 +328,7 @@ static void predict_and_reconstruct_intra_block(int plane, int block, if (!mi->mbmi.skip_coeff) { vp9_decode_block_tokens(cm, xd, plane, block, plane_bsize, tx_size, - args->r); + args->r, args->token_cache); inverse_transform_block(xd, plane, block, plane_bsize, tx_size); } } @@ -336,6 +338,7 @@ struct inter_args { MACROBLOCKD *xd; vp9_reader *r; int *eobtotal; + unsigned char* token_cache; }; static void reconstruct_inter_block(int plane, int block, @@ -346,7 +349,8 @@ static void reconstruct_inter_block(int plane, int block, MACROBLOCKD *const xd = args->xd; *args->eobtotal += vp9_decode_block_tokens(cm, xd, plane, block, - plane_bsize, tx_size, args->r); + plane_bsize, tx_size, + args->r, args->token_cache); inverse_transform_block(xd, 
plane, block, plane_bsize, tx_size); } @@ -398,7 +402,8 @@ static void set_ref(VP9_COMMON *const cm, MACROBLOCKD *const xd, static void decode_modes_b(VP9_COMMON *const cm, MACROBLOCKD *const xd, const TileInfo *const tile, int mi_row, int mi_col, - vp9_reader *r, BLOCK_SIZE bsize) { + vp9_reader *r, BLOCK_SIZE bsize, + unsigned char *token_cache) { const int less8x8 = bsize < BLOCK_8X8; MB_MODE_INFO *mbmi; @@ -420,7 +425,7 @@ static void decode_modes_b(VP9_COMMON *const cm, MACROBLOCKD *const xd, } if (!is_inter_block(mbmi)) { - struct intra_args arg = { cm, xd, r }; + struct intra_args arg = { cm, xd, r, token_cache }; foreach_transformed_block(xd, bsize, predict_and_reconstruct_intra_block, &arg); } else { @@ -438,7 +443,7 @@ static void decode_modes_b(VP9_COMMON *const cm, MACROBLOCKD *const xd, // Reconstruction if (!mbmi->skip_coeff) { int eobtotal = 0; - struct inter_args arg = { cm, xd, r, &eobtotal }; + struct inter_args arg = { cm, xd, r, &eobtotal, token_cache }; foreach_transformed_block(xd, bsize, reconstruct_inter_block, &arg); if (!less8x8 && eobtotal == 0) mbmi->skip_coeff = 1; // skip loopfilter @@ -477,7 +482,8 @@ static PARTITION_TYPE read_partition(VP9_COMMON *cm, MACROBLOCKD *xd, int hbs, static void decode_modes_sb(VP9_COMMON *const cm, MACROBLOCKD *const xd, const TileInfo *const tile, int mi_row, int mi_col, - vp9_reader* r, BLOCK_SIZE bsize) { + vp9_reader* r, BLOCK_SIZE bsize, + unsigned char *token_cache) { const int hbs = num_8x8_blocks_wide_lookup[bsize] / 2; PARTITION_TYPE partition; BLOCK_SIZE subsize; @@ -488,27 +494,33 @@ static void decode_modes_sb(VP9_COMMON *const cm, MACROBLOCKD *const xd, partition = read_partition(cm, xd, hbs, mi_row, mi_col, bsize, r); subsize = get_subsize(bsize, partition); if (subsize < BLOCK_8X8) { - decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize); + decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, token_cache); } else { switch (partition) { case PARTITION_NONE: - decode_modes_b(cm, xd, 
tile, mi_row, mi_col, r, subsize); + decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, token_cache); break; case PARTITION_HORZ: - decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize); + decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, token_cache); if (mi_row + hbs < cm->mi_rows) - decode_modes_b(cm, xd, tile, mi_row + hbs, mi_col, r, subsize); + decode_modes_b(cm, xd, tile, mi_row + hbs, mi_col, r, subsize, + token_cache); break; case PARTITION_VERT: - decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize); + decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, token_cache); if (mi_col + hbs < cm->mi_cols) - decode_modes_b(cm, xd, tile, mi_row, mi_col + hbs, r, subsize); + decode_modes_b(cm, xd, tile, mi_row, mi_col + hbs, r, subsize, + token_cache); break; case PARTITION_SPLIT: - decode_modes_sb(cm, xd, tile, mi_row, mi_col, r, subsize); - decode_modes_sb(cm, xd, tile, mi_row, mi_col + hbs, r, subsize); - decode_modes_sb(cm, xd, tile, mi_row + hbs, mi_col, r, subsize); - decode_modes_sb(cm, xd, tile, mi_row + hbs, mi_col + hbs, r, subsize); + decode_modes_sb(cm, xd, tile, mi_row, mi_col, r, subsize, + token_cache); + decode_modes_sb(cm, xd, tile, mi_row, mi_col + hbs, r, subsize, + token_cache); + decode_modes_sb(cm, xd, tile, mi_row + hbs, mi_col, r, subsize, + token_cache); + decode_modes_sb(cm, xd, tile, mi_row + hbs, mi_col + hbs, r, subsize, + token_cache); break; default: assert(!"Invalid partition type"); @@ -791,7 +803,8 @@ static void decode_tile(VP9D_COMP *pbi, const TileInfo *const tile, vp9_zero(xd->left_seg_context); for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end; mi_col += MI_BLOCK_SIZE) - decode_modes_sb(cm, xd, tile, mi_row, mi_col, r, BLOCK_64X64); + decode_modes_sb(cm, xd, tile, mi_row, mi_col, r, BLOCK_64X64, + pbi->token_cache); if (pbi->do_loopfilter_inline) { const int lf_start = mi_row - MI_BLOCK_SIZE; @@ -935,7 +948,7 @@ static const uint8_t *decode_tiles(VP9D_COMP *pbi, const uint8_t *data) { } 
static int tile_worker_hook(void *arg1, void *arg2) { - TileWorkerData *const tile_data = (TileWorkerData*)arg1; + TileWorkerData *tile_data = (TileWorkerData*)arg1; const TileInfo *const tile = (TileInfo*)arg2; int mi_row, mi_col; @@ -944,9 +957,11 @@ static int tile_worker_hook(void *arg1, void *arg2) { vp9_zero(tile_data->xd.left_context); vp9_zero(tile_data->xd.left_seg_context); for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end; - mi_col += MI_BLOCK_SIZE) + mi_col += MI_BLOCK_SIZE) { decode_modes_sb(tile_data->cm, &tile_data->xd, tile, - mi_row, mi_col, &tile_data->bit_reader, BLOCK_64X64); + mi_row, mi_col, &tile_data->bit_reader, BLOCK_64X64, + tile_data->token_cache); + } } return !tile_data->xd.corrupted; } diff --git a/vp9/decoder/vp9_detokenize.c b/vp9/decoder/vp9_detokenize.c index 6ecce2867..70d0d74ef 100644 --- a/vp9/decoder/vp9_detokenize.c +++ b/vp9/decoder/vp9_detokenize.c @@ -70,7 +70,6 @@ static const vp9_prob cat6_prob[15] = { DCT_EOB_MODEL_TOKEN : TWO_TOKEN) : \ token]; \ } \ - token_cache[scan[c]] = vp9_pt_energy_class[token]; \ } while (0) #define WRITE_COEF_CONTINUE(val, token) \ @@ -78,6 +77,7 @@ static const vp9_prob cat6_prob[15] = { qcoeff_ptr[scan[c]] = vp9_read_and_apply_sign(r, val) * \ dq[c > 0] / (1 + (tx_size == TX_32X32)); \ INCREMENT_COUNT(token); \ + token_cache[scan[c]] = vp9_pt_energy_class[token]; \ c++; \ continue; \ } @@ -91,7 +91,8 @@ static const vp9_prob cat6_prob[15] = { static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd, vp9_reader *r, int block_idx, PLANE_TYPE type, int seg_eob, int16_t *qcoeff_ptr, - TX_SIZE tx_size, const int16_t *dq, int pt) { + TX_SIZE tx_size, const int16_t *dq, int pt, + uint8_t *token_cache) { const FRAME_CONTEXT *const fc = &cm->fc; FRAME_COUNTS *const counts = &cm->counts; const int ref = is_inter_block(&xd->mi_8x8[0]->mbmi); @@ -104,7 +105,6 @@ static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd, vp9_coeff_count_model *coef_counts = counts->coef[tx_size]; const 
int16_t *scan, *nb; const uint8_t *const band_translate = get_band_translate(tx_size); - uint8_t token_cache[1024]; get_scan(xd, tx_size, type, block_idx, &scan, &nb); while (1) { @@ -131,6 +131,7 @@ static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd, if (!vp9_read(r, prob[ZERO_CONTEXT_NODE])) { INCREMENT_COUNT(ZERO_TOKEN); + token_cache[scan[c]] = vp9_pt_energy_class[ZERO_TOKEN]; ++c; goto SKIP_START; } @@ -212,7 +213,8 @@ static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd, int vp9_decode_block_tokens(VP9_COMMON *cm, MACROBLOCKD *xd, int plane, int block, BLOCK_SIZE plane_bsize, - TX_SIZE tx_size, vp9_reader *r) { + TX_SIZE tx_size, vp9_reader *r, + uint8_t *token_cache) { struct macroblockd_plane *const pd = &xd->plane[plane]; const int seg_eob = get_tx_eob(&cm->seg, xd->mi_8x8[0]->mbmi.segment_id, tx_size); @@ -223,7 +225,7 @@ int vp9_decode_block_tokens(VP9_COMMON *cm, MACROBLOCKD *xd, eob = decode_coefs(cm, xd, r, block, pd->plane_type, seg_eob, BLOCK_OFFSET(pd->qcoeff, block), - tx_size, pd->dequant, pt); + tx_size, pd->dequant, pt, token_cache); set_contexts(xd, pd, plane_bsize, tx_size, eob > 0, aoff, loff); diff --git a/vp9/decoder/vp9_detokenize.h b/vp9/decoder/vp9_detokenize.h index 94dd8e46e..04939ead3 100644 --- a/vp9/decoder/vp9_detokenize.h +++ b/vp9/decoder/vp9_detokenize.h @@ -17,6 +17,7 @@ int vp9_decode_block_tokens(VP9_COMMON *cm, MACROBLOCKD *xd, int plane, int block, BLOCK_SIZE plane_bsize, - TX_SIZE tx_size, vp9_reader *r); + TX_SIZE tx_size, vp9_reader *r, + uint8_t *token_cache); #endif // VP9_DECODER_VP9_DETOKENIZE_H_ diff --git a/vp9/decoder/vp9_onyxd_int.h b/vp9/decoder/vp9_onyxd_int.h index 83ea96771..7c4c9db36 100644 --- a/vp9/decoder/vp9_onyxd_int.h +++ b/vp9/decoder/vp9_onyxd_int.h @@ -49,6 +49,8 @@ typedef struct VP9Decompressor { ENTROPY_CONTEXT *above_context[MAX_MB_PLANE]; PARTITION_CONTEXT *above_seg_context; + + DECLARE_ALIGNED(16, unsigned char, token_cache[1024]); } VP9D_COMP; #endif // 
VP9_DECODER_VP9_ONYXD_INT_H_ -- 2.40.0