From faff6ed0fbb01ece1331021b749ec2f9114332ff Mon Sep 17 00:00:00 2001 From: Jingning Han Date: Mon, 8 Jul 2013 16:48:47 -0700 Subject: [PATCH] Skip duplicate block encoding in the rd loop This speed feature allows the encoder to largely remove the spatial dependency between blocks inside a 64x64 superblock, thereby removing the need to repeatedly encode superblocks per partition type in the rate-distortion optimization loop. A major challenge lies in the intra modes tested in the rate-distortion optimization loop. The subsequent blocks do not have access to the reconstructed boundary pixels without the intermediate coding steps. This was resolved by using the original pixels for intra prediction in the rd loop, followed by an appropriately designed distortion model based on the quantization parameters. Experiments also suggested that the performance impact is more discernible at lower bit-rate/psnr settings. Hence a quantizer-dependent threshold is applied to deactivate the skipping of block encoding. For bus_cif at 2000 kbps, speed 0: runtime 269854ms -> 237774ms (12% speed-up) at 0.05dB performance loss. speed 1: runtime 65312ms -> 61536ms (7% speed-up) at 0.04dB performance loss. This operation is currently turned on in the speed 1 settings. 
Change-Id: Ib689741dfff8dd38365d8c1b92860a3e176f56ec --- vp9/encoder/vp9_block.h | 1 + vp9/encoder/vp9_encodeframe.c | 18 +++++++++++++++ vp9/encoder/vp9_encodeintra.c | 1 + vp9/encoder/vp9_encodemb.c | 6 ++++- vp9/encoder/vp9_onyx_if.c | 4 ++++ vp9/encoder/vp9_onyx_int.h | 2 +- vp9/encoder/vp9_rdopt.c | 43 +++++++++++++++++++++++++---------- vp9/encoder/vp9_rdopt.h | 2 ++ 8 files changed, 63 insertions(+), 14 deletions(-) diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h index d575647ea..ae9f0aaa7 100644 --- a/vp9/encoder/vp9_block.h +++ b/vp9/encoder/vp9_block.h @@ -141,6 +141,7 @@ struct macroblock { // indicate if it is in the rd search loop or encoding process int rd_search; + int skip_encode; // TODO(jingning): Need to refactor the structure arrays that buffers the // coding mode decisions of each partition type. diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index 9a3d94b5c..8ac46c023 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -1866,6 +1866,20 @@ static void encode_frame_internal(VP9_COMP *cpi) { cpi->time_encode_mb_row += vpx_usec_timer_elapsed(&emr_timer); } + if (cpi->sf.skip_encode_sb) { + int j; + unsigned int intra_count = 0, inter_count = 0; + for (j = 0; j < INTRA_INTER_CONTEXTS; ++j) { + intra_count += cpi->intra_inter_count[j][0]; + inter_count += cpi->intra_inter_count[j][1]; + } + cpi->sf.skip_encode_frame = ((intra_count << 2) < inter_count); + cpi->sf.skip_encode_frame &= (cm->frame_type != KEY_FRAME); + cpi->sf.skip_encode_frame &= cm->show_frame; + } else { + cpi->sf.skip_encode_frame = 0; + } + // 256 rate units to the bit, // projected_frame_size in units of BYTES cpi->projected_frame_size = totalrate >> 8; @@ -2276,6 +2290,10 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled, const int bwl = mi_width_log2(bsize); const int bw = 1 << bwl, bh = 1 << mi_height_log2(bsize); x->rd_search = 0; + x->skip_encode = (!output_enabled && 
cpi->sf.skip_encode_frame && + xd->q_index < QIDX_SKIP_THRESH); + if (x->skip_encode) + return; if (cm->frame_type == KEY_FRAME) { if (cpi->oxcf.tuning == VP8_TUNE_SSIM) { diff --git a/vp9/encoder/vp9_encodeintra.c b/vp9/encoder/vp9_encodeintra.c index 5cab867a8..d49e53258 100644 --- a/vp9/encoder/vp9_encodeintra.c +++ b/vp9/encoder/vp9_encodeintra.c @@ -18,6 +18,7 @@ int vp9_encode_intra(VP9_COMP *cpi, MACROBLOCK *x, int use_16x16_pred) { MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi; (void) cpi; + x->skip_encode = 0; mbmi->mode = DC_PRED; mbmi->ref_frame[0] = INTRA_FRAME; mbmi->txfm_size = use_16x16_pred ? (mbmi->sb_type >= BLOCK_SIZE_MB16X16 ? diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c index eb8f2aa6e..6ca8e6eb2 100644 --- a/vp9/encoder/vp9_encodemb.c +++ b/vp9/encoder/vp9_encodemb.c @@ -610,7 +610,8 @@ void encode_block_intra(int plane, int block, BLOCK_SIZE_TYPE bsize, plane_b_size = b_width_log2(bsize) - pd->subsampling_x; vp9_predict_intra_block(xd, tx_ib, plane_b_size, tx_size, b_mode, - dst, pd->dst.stride, + x->skip_encode ? src : dst, + x->skip_encode ? 
p->src.stride : pd->dst.stride, dst, pd->dst.stride); vp9_subtract_block(txfm_b_size, txfm_b_size, src_diff, bw, src, p->src.stride, dst, pd->dst.stride); @@ -618,6 +619,9 @@ void encode_block_intra(int plane, int block, BLOCK_SIZE_TYPE bsize, xform_quant(plane, block, bsize, ss_txfrm_size, arg); + if (x->skip_encode) + return; + // if (x->optimize) // vp9_optimize_b(plane, block, bsize, ss_txfrm_size, // args->cm, x, args->ctx); diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c index 1edd1ebec..14d36bdf6 100644 --- a/vp9/encoder/vp9_onyx_if.c +++ b/vp9/encoder/vp9_onyx_if.c @@ -721,6 +721,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) { sf->mode_search_skip_flags = 0; sf->last_chroma_intra_mode = TM_PRED; sf->use_rd_breakout = 0; + sf->skip_encode_sb = 0; // Skip any mode not chosen at size < X for all sizes > X // Hence BLOCK_SIZE_SB64X64 (skip is off) @@ -769,6 +770,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) { FLAG_SKIP_COMP_BESTINTRA; sf->last_chroma_intra_mode = H_PRED; sf->use_rd_breakout = 1; + sf->skip_encode_sb = 1; } if (speed == 2) { sf->adjust_thresholds_by_speed = 1; @@ -790,6 +792,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) { FLAG_SKIP_COMP_REFMISMATCH; sf->last_chroma_intra_mode = DC_PRED; sf->use_rd_breakout = 1; + sf->skip_encode_sb = 1; } if (speed == 3) { sf->comp_inter_joint_search_thresh = BLOCK_SIZE_TYPES; @@ -804,6 +807,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) { FLAG_SKIP_COMP_BESTINTRA | FLAG_SKIP_COMP_REFMISMATCH; sf->use_rd_breakout = 1; + sf->skip_encode_sb = 1; } if (speed == 4) { sf->comp_inter_joint_search_thresh = BLOCK_SIZE_TYPES; diff --git a/vp9/encoder/vp9_onyx_int.h b/vp9/encoder/vp9_onyx_int.h index d3822c201..48f5a12e4 100644 --- a/vp9/encoder/vp9_onyx_int.h +++ b/vp9/encoder/vp9_onyx_int.h @@ -247,6 +247,7 @@ typedef struct { int comp_inter_joint_search_thresh; int adaptive_rd_thresh; int skip_encode_sb; + int skip_encode_frame; int use_lastframe_partitioning; TX_SIZE_SEARCH_METHOD 
tx_size_search_method; int use_8tap_always; @@ -277,7 +278,6 @@ typedef struct { } SPEED_FEATURES; typedef struct VP9_COMP { - DECLARE_ALIGNED(16, int16_t, y_quant[QINDEX_RANGE][8]); DECLARE_ALIGNED(16, int16_t, y_quant_shift[QINDEX_RANGE][8]); DECLARE_ALIGNED(16, int16_t, y_zbin[QINDEX_RANGE][8]); diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index a70c90447..ff671aeea 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -685,6 +685,15 @@ static void dist_block(int plane, int block, BLOCK_SIZE_TYPE bsize, args->dist += vp9_block_error(coeff, dqcoeff, 16 << ss_txfrm_size, &this_sse) >> shift; args->sse += this_sse >> shift; + + if (x->skip_encode && + xd->mode_info_context->mbmi.ref_frame[0] == INTRA_FRAME) { + // TODO(jingning): tune the model to better capture the distortion. + int64_t p = (pd->dequant[1] * pd->dequant[1] * + (1 << ss_txfrm_size)) >> shift; + args->dist += p; + args->sse += p; + } } static void rate_block(int plane, int block, BLOCK_SIZE_TYPE bsize, @@ -1169,6 +1178,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib, struct macroblock_plane *p = &x->plane[0]; struct macroblockd_plane *pd = &xd->plane[0]; const int src_stride = p->src.stride; + const int dst_stride = pd->dst.stride; uint8_t *src, *dst; int16_t *src_diff, *coeff; @@ -1215,15 +1225,15 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib, p->src_diff); coeff = BLOCK_OFFSET(x->plane[0].coeff, block, 16); dst = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, block, - pd->dst.buf, - pd->dst.stride); + pd->dst.buf, dst_stride); vp9_predict_intra_block(xd, block, b_width_log2(BLOCK_SIZE_SB8X8), TX_4X4, mode, - dst, pd->dst.stride, - dst, pd->dst.stride); + x->skip_encode ? src : dst, + x->skip_encode ? 
src_stride : dst_stride, + dst, dst_stride); vp9_subtract_block(4, 4, src_diff, 8, src, src_stride, - dst, pd->dst.stride); + dst, dst_stride); tx_type = get_tx_type_4x4(xd, block); if (tx_type != DCT_DCT) { @@ -1272,24 +1282,30 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib, } } + if (x->skip_encode) + return best_rd; + for (idy = 0; idy < bh; ++idy) { for (idx = 0; idx < bw; ++idx) { block = ib + idy * 2 + idx; xd->mode_info_context->bmi[block].as_mode = *best_mode; + src = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, block, + p->src.buf, src_stride); dst = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, block, - pd->dst.buf, - pd->dst.stride); + pd->dst.buf, dst_stride); vp9_predict_intra_block(xd, block, b_width_log2(BLOCK_SIZE_SB8X8), TX_4X4, - *best_mode, dst, pd->dst.stride, - dst, pd->dst.stride); + *best_mode, + x->skip_encode ? src : dst, + x->skip_encode ? src_stride : dst_stride, + dst, dst_stride); // inverse transform if (best_tx_type != DCT_DCT) vp9_short_iht4x4_add(best_dqcoeff[idy * 2 + idx], dst, - pd->dst.stride, best_tx_type); + dst_stride, best_tx_type); else xd->inv_txm4x4_add(best_dqcoeff[idy * 2 + idx], dst, - pd->dst.stride); + dst_stride); } } @@ -2897,6 +2913,7 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, int64_t dist4x4_y; int64_t err4x4 = INT64_MAX; + x->skip_encode = 0; vpx_memset(&txfm_cache,0,sizeof(txfm_cache)); ctx->skip = 0; xd->mode_info_context->mbmi.mode = DC_PRED; @@ -3006,9 +3023,11 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, int bhs = (1 << bhsl) / 4; // mode_info step for subsize int best_skip2 = 0; + x->skip_encode = (cpi->sf.skip_encode_frame && + xd->q_index < QIDX_SKIP_THRESH); + for (i = 0; i < 4; i++) { int j; - for (j = 0; j < MAX_REF_FRAMES; j++) seg_mvs[i][j].as_int = INVALID_MV; } diff --git a/vp9/encoder/vp9_rdopt.h b/vp9/encoder/vp9_rdopt.h index 67ef73db7..22d0a950a 100644 --- a/vp9/encoder/vp9_rdopt.h +++ b/vp9/encoder/vp9_rdopt.h 
@@ -15,6 +15,8 @@ #define RDCOST(RM,DM,R,D) ( ((128+((int64_t)R)*(RM)) >> 8) + ((int64_t)DM)*(D) ) #define RDCOST_8x8(RM,DM,R,D) ( ((128+((int64_t)R)*(RM)) >> 8) + ((int64_t)DM)*(D) ) +#define QIDX_SKIP_THRESH 115 + void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex); void vp9_initialize_me_consts(VP9_COMP *cpi, int qindex); -- 2.40.0