From bacc67f4a808e488d24fda2e93cfd5fbe0b806a4 Mon Sep 17 00:00:00 2001 From: jackychen Date: Fri, 20 May 2016 13:45:46 -0700 Subject: [PATCH] vp9: Skip some modes when variance is low for big blocks, for 1 pass real-time. Skip intra-mode and some inter-modes (newmv, nearmv, nearestmv) for golden frame if the variance obtained from choose_partitioning is very low. This applies only to 1 pass real-time CBR mode and bsize >= 32x32; it gives a ~2.5% speed up with less than 0.1% PSNR drop on the rtc test set. No visual regression observed. Change-Id: I70efbc95a1007231ae36f02c5b2fbf6cd35077ad --- vp9/encoder/vp9_block.h | 5 ++ vp9/encoder/vp9_encodeframe.c | 33 +++++++++++ vp9/encoder/vp9_pickmode.c | 98 +++++++++++++++++++++++++------- vp9/encoder/vp9_speed_features.c | 6 ++ vp9/encoder/vp9_speed_features.h | 4 ++ 5 files changed, 125 insertions(+), 21 deletions(-) diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h index 147743e8d..bbdfbb823 100644 --- a/vp9/encoder/vp9_block.h +++ b/vp9/encoder/vp9_block.h @@ -145,6 +145,11 @@ struct macroblock { uint8_t sb_is_skin; + // Used to save the status of whether a block has a low variance in + // choose_partitioning. 0 for 64x64, 1 2 for 64x32, 3 4 for 32x64, 5~8 for + // 32x32. + uint8_t variance_low[9]; + void (*fwd_txm4x4)(const int16_t *input, tran_low_t *output, int stride); void (*itxm_add)(const tran_low_t *input, uint8_t *dest, int stride, int eob); #if CONFIG_VP9_HIGHBITDEPTH diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index 67069e7c1..2eac53989 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -747,6 +747,8 @@ static int choose_partitioning(VP9_COMP *cpi, const uint8_t *d; int sp; int dp; + // Ref frame used in partitioning. 
+ MV_REFERENCE_FRAME ref_frame_partition = LAST_FRAME; int pixels_wide = 64, pixels_high = 64; int64_t thresholds[4] = {cpi->vbp_thresholds[0], cpi->vbp_thresholds[1], cpi->vbp_thresholds[2], cpi->vbp_thresholds[3]}; @@ -771,6 +773,10 @@ static int choose_partitioning(VP9_COMP *cpi, } } + for (i = 0; i < 9; i++) { + x->variance_low[i] = 0; + } + if (xd->mb_to_right_edge < 0) pixels_wide += (xd->mb_to_right_edge >> 3); if (xd->mb_to_bottom_edge < 0) @@ -831,8 +837,10 @@ static int choose_partitioning(VP9_COMP *cpi, mi->ref_frame[0] = GOLDEN_FRAME; mi->mv[0].as_int = 0; y_sad = y_sad_g; + ref_frame_partition = GOLDEN_FRAME; } else { x->pred_mv[LAST_FRAME] = mi->mv[0].as_mv; + ref_frame_partition = LAST_FRAME; } set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]); @@ -1019,6 +1027,31 @@ static int choose_partitioning(VP9_COMP *cpi, force_split[0] = 1; } + if (cpi->sf.short_circuit_low_temp_var) { + // Set low variance flag, only for blocks >= 32x32 and if LAST_FRAME was + // selected. + if (ref_frame_partition == LAST_FRAME) { + // 64x64 + if (vt.part_variances.none.variance < (thresholds[0] >> 1)) + x->variance_low[0] = 1; + // 64x32 + if (vt.part_variances.horz[0].variance < (thresholds[0] >> 2)) + x->variance_low[1] = 1; + if (vt.part_variances.horz[1].variance < (thresholds[0] >> 2)) + x->variance_low[2] = 1; + // 32x64 + if (vt.part_variances.vert[0].variance < (thresholds[0] >> 2)) + x->variance_low[3] = 1; + if (vt.part_variances.vert[1].variance < (thresholds[0] >> 2)) + x->variance_low[4] = 1; + // 32x32 + for (i = 0; i < 4; i++) { + if (vt.split[i].part_variances.none.variance < (thresholds[1] >> 1)) + x->variance_low[i + 5] = 1; + } + } + } + // Now go through the entire structure, splitting every block size until // we get to one that's got a variance lower than our threshold. 
if ( mi_col + 8 > cm->mi_cols || mi_row + 8 > cm->mi_rows || diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c index f9c5fb340..554409b74 100644 --- a/vp9/encoder/vp9_pickmode.c +++ b/vp9/encoder/vp9_pickmode.c @@ -1126,34 +1126,38 @@ static INLINE void find_predictors(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, int mi_row, int mi_col, struct buf_2d yv12_mb[4][MAX_MB_PLANE], - BLOCK_SIZE bsize) { + BLOCK_SIZE bsize, + int force_skip_low_temp_var) { VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame); TileInfo *const tile_info = &tile_data->tile_info; -// TODO(jingning) placeholder for inter-frame non-RD mode decision. + // TODO(jingning) placeholder for inter-frame non-RD mode decision. x->pred_mv_sad[ref_frame] = INT_MAX; frame_mv[NEWMV][ref_frame].as_int = INVALID_MV; frame_mv[ZEROMV][ref_frame].as_int = 0; -// this needs various further optimizations. to be continued.. + // this needs various further optimizations. to be continued.. 
if ((cpi->ref_frame_flags & flag_list[ref_frame]) && (yv12 != NULL)) { int_mv *const candidates = x->mbmi_ext->ref_mvs[ref_frame]; const struct scale_factors *const sf = &cm->frame_refs[ref_frame - 1].sf; vp9_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf, sf); - if (cm->use_prev_frame_mvs) + if (cm->use_prev_frame_mvs) { vp9_find_mv_refs(cm, xd, xd->mi[0], ref_frame, candidates, mi_row, mi_col, x->mbmi_ext->mode_context); - else - const_motion[ref_frame] = - mv_refs_rt(cpi, cm, x, xd, tile_info, xd->mi[0], ref_frame, - candidates, &frame_mv[NEWMV][ref_frame], mi_row, mi_col, - (int)(cpi->svc.use_base_mv && cpi->svc.spatial_layer_id)); + } else { + const_motion[ref_frame] = + mv_refs_rt(cpi, cm, x, xd, tile_info, xd->mi[0], ref_frame, + candidates, &frame_mv[NEWMV][ref_frame], mi_row, mi_col, + (int)(cpi->svc.use_base_mv && cpi->svc.spatial_layer_id)); + } vp9_find_best_ref_mvs(xd, cm->allow_high_precision_mv, candidates, &frame_mv[NEARESTMV][ref_frame], &frame_mv[NEARMV][ref_frame]); - if (!vp9_is_scaled(sf) && bsize >= BLOCK_8X8) { + // Early exit for golden frame if force_skip_low_temp_var is set. + if (!vp9_is_scaled(sf) && bsize >= BLOCK_8X8 && + !(force_skip_low_temp_var && ref_frame == GOLDEN_FRAME)) { vp9_mv_pred(cpi, x, yv12_mb[ref_frame][0].buf, yv12->y_stride, ref_frame, bsize); } @@ -1266,6 +1270,39 @@ static void recheck_zeromv_after_denoising( } #endif // CONFIG_VP9_TEMPORAL_DENOISING +static INLINE int set_force_skip_low_temp_var(uint8_t *variance_low, + int mi_row, int mi_col, + BLOCK_SIZE bsize) { + int force_skip_low_temp_var = 0; + // Set force_skip_low_temp_var based on the block size and block offset. 
+ if (bsize == BLOCK_64X64) { + force_skip_low_temp_var = variance_low[0]; + } else if (bsize == BLOCK_64X32) { + if (!(mi_col & 0x7) && !(mi_row & 0x7)) { + force_skip_low_temp_var = variance_low[1]; + } else if (!(mi_col & 0x7) && (mi_row & 0x7)) { + force_skip_low_temp_var = variance_low[2]; + } + } else if (bsize == BLOCK_32X64) { + if (!(mi_col & 0x7) && !(mi_row & 0x7)) { + force_skip_low_temp_var = variance_low[3]; + } else if ((mi_col & 0x7) && !(mi_row & 0x7)) { + force_skip_low_temp_var = variance_low[4]; + } + } else if (bsize == BLOCK_32X32) { + if (!(mi_col & 0x7) && !(mi_row & 0x7)) { + force_skip_low_temp_var = variance_low[5]; + } else if ((mi_col & 0x7) && !(mi_row & 0x7)) { + force_skip_low_temp_var = variance_low[6]; + } else if (!(mi_col & 0x7) && (mi_row & 0x7)) { + force_skip_low_temp_var = variance_low[7]; + } else if ((mi_col & 0x7) && (mi_row & 0x7)) { + force_skip_low_temp_var = variance_low[8]; + } + } + return force_skip_low_temp_var; +} + void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, int mi_row, int mi_col, RD_COST *rd_cost, @@ -1324,6 +1361,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, int svc_force_zero_mode[3] = {0}; int perform_intra_pred = 1; int use_golden_nonzeromv = 1; + int force_skip_low_temp_var = 0; #if CONFIG_VP9_TEMPORAL_DENOISING VP9_PICKMODE_CTX_DEN ctx_den; int64_t zero_last_cost_orig = INT64_MAX; @@ -1410,14 +1448,19 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, } } + if (cpi->sf.short_circuit_low_temp_var) { + force_skip_low_temp_var = + set_force_skip_low_temp_var(&x->variance_low[0], mi_row, mi_col, bsize); + } + if (!((cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) && - !svc_force_zero_mode[GOLDEN_FRAME - 1])) + !svc_force_zero_mode[GOLDEN_FRAME - 1] && !force_skip_low_temp_var)) use_golden_nonzeromv = 0; for (ref_frame = LAST_FRAME; ref_frame <= usable_ref_frame; ++ref_frame) { find_predictors(cpi, x, ref_frame, frame_mv, const_motion, 
&ref_frame_skip_mask, flag_list, tile_data, mi_row, mi_col, - yv12_mb, bsize); + yv12_mb, bsize, force_skip_low_temp_var); } for (idx = 0; idx < RT_INTER_MODES; ++idx) { @@ -1429,6 +1472,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, int is_skippable; int this_early_term = 0; PREDICTION_MODE this_mode = ref_mode_set[idx].pred_mode; + if (cpi->use_svc) this_mode = ref_mode_set_svc[idx].pred_mode; @@ -1447,17 +1491,27 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, if (!(cpi->ref_frame_flags & flag_list[ref_frame])) continue; + if (const_motion[ref_frame] && this_mode == NEARMV) continue; + // Skip non-zeromv mode search for golden frame if force_skip_low_temp_var + // is set. If nearestmv for golden frame is 0, zeromv mode will be skipped + // later. + if (force_skip_low_temp_var && ref_frame == GOLDEN_FRAME && + frame_mv[this_mode][ref_frame].as_int != 0) { + continue; + } + if (cpi->use_svc) { if (svc_force_zero_mode[ref_frame - 1] && frame_mv[this_mode][ref_frame].as_int != 0) continue; } - if (!(frame_mv[this_mode][ref_frame].as_int == 0 && - ref_frame == LAST_FRAME)) { + if (!force_skip_low_temp_var && + !(frame_mv[this_mode][ref_frame].as_int == 0 && + ref_frame == LAST_FRAME)) { i = (ref_frame == LAST_FRAME) ? GOLDEN_FRAME : LAST_FRAME; if ((cpi->ref_frame_flags & flag_list[i]) && sf->reference_masking) if (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[i] << 1)) @@ -1548,8 +1602,10 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, } } - if (use_golden_nonzeromv && - this_mode == NEWMV && ref_frame == LAST_FRAME && + // If use_golden_nonzeromv is false, NEWMV mode is skipped for golden, no + // need to compute best_pred_sad which is only used to skip golden NEWMV. 
+ if (use_golden_nonzeromv && this_mode == NEWMV && + ref_frame == LAST_FRAME && frame_mv[NEWMV][LAST_FRAME].as_int != INVALID_MV) { const int pre_stride = xd->plane[0].pre[0].stride; const uint8_t * const pre_buf = xd->plane[0].pre[0].buf + @@ -1786,11 +1842,11 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, inter_mode_thresh = (inter_mode_thresh << 1) + inter_mode_thresh; } // Perform intra prediction search, if the best SAD is above a certain - // threshold. - if (perform_intra_pred && - ((best_rdc.rdcost == INT64_MAX || - (!x->skip && best_rdc.rdcost > inter_mode_thresh && - bsize <= cpi->sf.max_intra_bsize)))) { + // threshold. Skip intra prediction if force_skip_low_temp_var is set. + if (!force_skip_low_temp_var && perform_intra_pred && + (best_rdc.rdcost == INT64_MAX || + (!x->skip && best_rdc.rdcost > inter_mode_thresh && + bsize <= cpi->sf.max_intra_bsize))) { struct estimate_block_intra_args args = { cpi, x, DC_PRED, 1, 0, 0 }; int i; TX_SIZE best_intra_tx_size = TX_SIZES; diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c index 02be3c3f9..c3a71feed 100644 --- a/vp9/encoder/vp9_speed_features.c +++ b/vp9/encoder/vp9_speed_features.c @@ -429,6 +429,11 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf, sf->mv.search_method = NSTEP; sf->mv.reduce_first_step_size = 1; sf->skip_encode_sb = 0; + if (!cpi->use_svc && cpi->oxcf.rc_mode == VPX_CBR && cpi->oxcf.pass == 0 && + content != VP9E_CONTENT_SCREEN) { + // Enable short circuit when temporal variance is very low. + sf->short_circuit_low_temp_var = 1; + } } if (speed >= 7) { @@ -554,6 +559,7 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) { sf->default_interp_filter = SWITCHABLE; sf->simple_model_rd_from_var = 0; sf->short_circuit_flat_blocks = 0; + sf->short_circuit_low_temp_var = 0; // Some speed-up features even for best quality as minimal impact on quality. 
sf->adaptive_rd_thresh = 1; diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h index 90b32164b..71ff0ac10 100644 --- a/vp9/encoder/vp9_speed_features.h +++ b/vp9/encoder/vp9_speed_features.h @@ -446,6 +446,10 @@ typedef struct SPEED_FEATURES { // Skip a number of expensive mode evaluations for blocks with zero source // variance. int short_circuit_flat_blocks; + + // Skip a number of expensive mode evaluations for blocks with very low + // temporal variance. + int short_circuit_low_temp_var; } SPEED_FEATURES; struct VP9_COMP; -- 2.40.0