From fdfec4c7be6a0cf61806a099ce7df4ec16dd1a01 Mon Sep 17 00:00:00 2001
From: Jingning Han
Date: Fri, 6 Jul 2018 14:51:32 -0700
Subject: [PATCH] Change the tpl model operating block size to 32x32

Increase the temporal dependency model operating block size from 8x8
to 32x32.

Change-Id: I26b13493fe957d67c8646575370e651584b56ea5
---
 vp9/encoder/vp9_encoder.c | 131 ++++++++++++++++++++------------------
 vp9/encoder/vp9_encoder.h |  10 +--
 2 files changed, 75 insertions(+), 66 deletions(-)

diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index 8afd87694..b17ea9fce 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -5603,7 +5603,7 @@ uint32_t motion_compensated_prediction(VP9_COMP *cpi, ThreadData *td,
 
   vp9_set_mv_search_range(&x->mv_limits, &best_ref_mv1);
 
-  vp9_full_pixel_search(cpi, x, BLOCK_8X8, &best_ref_mv1_full, step_param,
+  vp9_full_pixel_search(cpi, x, BLOCK_32X32, &best_ref_mv1_full, step_param,
                         search_method, sadpb, cond_cost_list(cpi, cost_list),
                         &best_ref_mv1, mv, 0, 0);
 
@@ -5613,7 +5613,7 @@ uint32_t motion_compensated_prediction(VP9_COMP *cpi, ThreadData *td,
   // Ignore mv costing by sending NULL pointer instead of cost array
   bestsme = cpi->find_fractional_mv_step(
       x, mv, &best_ref_mv1, cpi->common.allow_high_precision_mv, x->errorperbit,
-      &cpi->fn_ptr[BLOCK_8X8], 0, mv_sf->subpel_iters_per_step,
+      &cpi->fn_ptr[BLOCK_32X32], 0, mv_sf->subpel_iters_per_step,
       cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL, 0,
       0);
 
@@ -5626,20 +5626,20 @@ int get_overlap_area(int grid_pos_row, int grid_pos_col, int ref_pos_row,
   int width = 0, height = 0;
   switch (block) {
     case 0:
-      width = grid_pos_col + MI_SIZE - ref_pos_col;
-      height = grid_pos_row + MI_SIZE - ref_pos_row;
+      width = grid_pos_col + 4 * MI_SIZE - ref_pos_col;
+      height = grid_pos_row + 4 * MI_SIZE - ref_pos_row;
       break;
     case 1:
-      width = ref_pos_col + MI_SIZE - grid_pos_col;
-      height = grid_pos_row + MI_SIZE - ref_pos_row;
+      width = ref_pos_col + 4 * MI_SIZE - grid_pos_col;
+      height = grid_pos_row + 4 * MI_SIZE - ref_pos_row;
       break;
     case 2:
-      width = grid_pos_col + MI_SIZE - ref_pos_col;
-      height = ref_pos_row + MI_SIZE - grid_pos_row;
+      width = grid_pos_col + 4 * MI_SIZE - ref_pos_col;
+      height = ref_pos_row + 4 * MI_SIZE - grid_pos_row;
       break;
     case 3:
-      width = ref_pos_col + MI_SIZE - grid_pos_col;
-      height = ref_pos_row + MI_SIZE - grid_pos_row;
+      width = ref_pos_col + 4 * MI_SIZE - grid_pos_col;
+      height = ref_pos_row + 4 * MI_SIZE - grid_pos_row;
       break;
     default: assert(0);
   }
@@ -5647,18 +5647,18 @@ int get_overlap_area(int grid_pos_row, int grid_pos_col, int ref_pos_row,
   return overlap_area = width * height;
 }
 
-int round_floor(int ref_pos) {
+int round_floor(int ref_pos, int bsize_pix) {
   int round;
   if (ref_pos < 0)
-    round = -(1 + (-ref_pos - 1) / MI_SIZE);
+    round = -(1 + (-ref_pos - 1) / bsize_pix);
   else
-    round = ref_pos / MI_SIZE;
+    round = ref_pos / bsize_pix;
 
   return round;
 }
 
 void tpl_model_update(TplDepFrame *tpl_frame, TplDepStats *tpl_stats,
-                      int mi_row, int mi_col) {
+                      int mi_row, int mi_col, const BLOCK_SIZE bsize) {
   TplDepFrame *ref_tpl_frame = &tpl_frame[tpl_stats->ref_frame_index];
   TplDepStats *ref_stats = ref_tpl_frame->tpl_stats_ptr;
   MV mv = tpl_stats->mv.as_mv;
@@ -5668,32 +5668,38 @@ void tpl_model_update(TplDepFrame *tpl_frame, TplDepStats *tpl_stats,
   int ref_pos_row = mi_row * MI_SIZE + mv_row;
   int ref_pos_col = mi_col * MI_SIZE + mv_col;
 
-  // top-left on grid block location
-  int grid_pos_row_base = round_floor(ref_pos_row) * MI_SIZE;
-  int grid_pos_col_base = round_floor(ref_pos_col) * MI_SIZE;
+  const int bw = 4 << b_width_log2_lookup[bsize];
+  const int bh = 4 << b_height_log2_lookup[bsize];
+  const int mi_height = num_8x8_blocks_high_lookup[bsize];
+  const int mi_width = num_8x8_blocks_wide_lookup[bsize];
+  const int pix_num = bw * bh;
+
+  // top-left on grid block location in pixel
+  int grid_pos_row_base = round_floor(ref_pos_row, bh) * bh;
+  int grid_pos_col_base = round_floor(ref_pos_col, bw) * bw;
   int block;
 
   for (block = 0; block < 4; ++block) {
-    int grid_pos_row = grid_pos_row_base + MI_SIZE * (block >> 1);
-    int grid_pos_col = grid_pos_col_base + MI_SIZE * (block & 0x01);
+    int grid_pos_row = grid_pos_row_base + bh * (block >> 1);
+    int grid_pos_col = grid_pos_col_base + bw * (block & 0x01);
 
     if (grid_pos_row >= 0 && grid_pos_row < ref_tpl_frame->mi_rows * MI_SIZE &&
         grid_pos_col >= 0 && grid_pos_col < ref_tpl_frame->mi_cols * MI_SIZE) {
       int overlap_area = get_overlap_area(grid_pos_row, grid_pos_col,
                                           ref_pos_row, ref_pos_col, block);
-      int ref_mi_row = round_floor(grid_pos_row);
-      int ref_mi_col = round_floor(grid_pos_col);
+      int ref_mi_row = round_floor(grid_pos_row, bh) * mi_height;
+      int ref_mi_col = round_floor(grid_pos_col, bw) * mi_width;
       int64_t mc_flow = tpl_stats->mc_dep_cost -
                         (tpl_stats->mc_dep_cost * tpl_stats->inter_cost) /
                             tpl_stats->intra_cost;
 
       ref_stats[ref_mi_row * ref_tpl_frame->stride + ref_mi_col].mc_flow +=
-          (mc_flow * overlap_area) >> (MI_SIZE_LOG2 * 2);
+          (mc_flow * overlap_area) / pix_num;
 
       ref_stats[ref_mi_row * ref_tpl_frame->stride + ref_mi_col].mc_ref_cost +=
-          ((tpl_stats->intra_cost - tpl_stats->inter_cost) * overlap_area) >>
-          (MI_SIZE_LOG2 * 2);
+          ((tpl_stats->intra_cost - tpl_stats->inter_cost) * overlap_area) /
+          pix_num;
       assert(overlap_area >= 0);
     }
  }
 }
@@ -5713,20 +5719,25 @@ void mc_flow_dispenser(VP9_COMP *cpi, GF_PICTURE *gf_picture, int frame_idx) {
   int mi_row, mi_col;
 
   const InterpKernel *const kernel = vp9_filter_kernels[EIGHTTAP_SHARP];
 
-  // TODO(jingning): Let's keep the buffer size to support 16x16 pixel block,
-  // in case we would like to increase the operating block size.
 #if CONFIG_VP9_HIGHBITDEPTH
-  DECLARE_ALIGNED(16, uint16_t, predictor16[16 * 16 * 3]);
-  DECLARE_ALIGNED(16, uint8_t, predictor8[16 * 16 * 3]);
+  DECLARE_ALIGNED(16, uint16_t, predictor16[32 * 32 * 3]);
+  DECLARE_ALIGNED(16, uint8_t, predictor8[32 * 32 * 3]);
   uint8_t *predictor;
 #else
-  DECLARE_ALIGNED(16, uint8_t, predictor[16 * 16 * 3]);
+  DECLARE_ALIGNED(16, uint8_t, predictor[32 * 32 * 3]);
 #endif
-  DECLARE_ALIGNED(16, int16_t, src_diff[16 * 16]);
-  DECLARE_ALIGNED(16, tran_low_t, coeff[16 * 16]);
+  DECLARE_ALIGNED(16, int16_t, src_diff[32 * 32]);
+  DECLARE_ALIGNED(16, tran_low_t, coeff[32 * 32]);
   MODE_INFO mi_above, mi_left;
 
+  const BLOCK_SIZE bsize = BLOCK_32X32;
+  const int bw = 4 << b_width_log2_lookup[bsize];
+  const int bh = 4 << b_height_log2_lookup[bsize];
+  const int mi_height = num_8x8_blocks_high_lookup[bsize];
+  const int mi_width = num_8x8_blocks_wide_lookup[bsize];
+  const int pix_num = bw * bh;
+
   // Setup scaling factor
 #if CONFIG_VP9_HIGHBITDEPTH
   vp9_setup_scale_factors_for_frame(
@@ -5761,12 +5772,12 @@ void mc_flow_dispenser(VP9_COMP *cpi, GF_PICTURE *gf_picture, int frame_idx) {
   vp9_initialize_me_consts(cpi, &cpi->td.mb, ARNR_FILT_QINDEX);
 
   tpl_frame->is_valid = 1;
-  for (mi_row = 0; mi_row < cm->mi_rows; ++mi_row) {
+  for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) {
     // Motion estimation row boundary
     x->mv_limits.row_min = -((mi_row * MI_SIZE) + (17 - 2 * VP9_INTERP_EXTEND));
     x->mv_limits.row_max =
         (cm->mi_rows - 1 - mi_row) * MI_SIZE + (17 - 2 * VP9_INTERP_EXTEND);
-    for (mi_col = 0; mi_col < cm->mi_cols; ++mi_col) {
+    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) {
       int mb_y_offset =
           mi_row * MI_SIZE * this_frame->y_stride + mi_col * MI_SIZE;
       int best_rf_idx = -1;
@@ -5793,9 +5804,9 @@ void mc_flow_dispenser(VP9_COMP *cpi, GF_PICTURE *gf_picture, int frame_idx) {
         src_stride = this_frame->y_stride;
 
         dst = &predictor[0];
-        dst_stride = MI_SIZE;
+        dst_stride = bw;
 
-        xd->mi[0]->sb_type = BLOCK_8X8;
+        xd->mi[0]->sb_type = BLOCK_32X32;
         xd->mi[0]->ref_frame[0] = INTRA_FRAME;
         xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8);
         xd->mb_to_bottom_edge = ((cm->mi_rows - 1 - mi_row) * MI_SIZE) * 8;
@@ -5804,16 +5815,16 @@ void mc_flow_dispenser(VP9_COMP *cpi, GF_PICTURE *gf_picture, int frame_idx) {
         xd->above_mi = (mi_row > 0) ? &mi_above : NULL;
         xd->left_mi = (mi_col > 0) ? &mi_left : NULL;
 
-        vp9_predict_intra_block(xd, b_width_log2_lookup[BLOCK_8X8], TX_8X8,
+        vp9_predict_intra_block(xd, b_width_log2_lookup[BLOCK_32X32], TX_32X32,
                                 mode, src, src_stride, dst, dst_stride, 0, 0,
                                 0);
 
-        vpx_subtract_block(MI_SIZE, MI_SIZE, src_diff, MI_SIZE, src, src_stride,
-                           dst, dst_stride);
+        vpx_subtract_block(bh, bw, src_diff, bw, src, src_stride, dst,
+                           dst_stride);
 
-        vpx_hadamard_8x8(src_diff, MI_SIZE, coeff);
+        vpx_fdct32x32(src_diff, coeff, bw);
 
-        intra_cost = vpx_satd(coeff, MI_SIZE * MI_SIZE);
+        intra_cost = vpx_satd(coeff, pix_num);
 
         if (intra_cost < best_intra_cost) best_intra_cost = intra_cost;
       }
 
@@ -5844,35 +5855,33 @@ void mc_flow_dispenser(VP9_COMP *cpi, GF_PICTURE *gf_picture, int frame_idx) {
           vp9_highbd_build_inter_predictor(
              CONVERT_TO_SHORTPTR(ref_frame[rf_idx]->y_buffer + mb_y_offset),
              ref_frame[rf_idx]->y_stride, CONVERT_TO_SHORTPTR(&predictor[0]),
-              MI_SIZE, &mv.as_mv, &sf, MI_SIZE, MI_SIZE, 0, kernel,
-              MV_PRECISION_Q3, mi_col * MI_SIZE, mi_row * MI_SIZE, xd->bd);
-          vpx_highbd_subtract_block(MI_SIZE, MI_SIZE, src_diff, MI_SIZE,
-                                    this_frame->y_buffer + mb_y_offset,
-                                    this_frame->y_stride, &predictor[0],
-                                    MI_SIZE, xd->bd);
+              bw, &mv.as_mv, &sf, bw, bh, 0, kernel, MV_PRECISION_Q3,
+              mi_col * MI_SIZE, mi_row * MI_SIZE, xd->bd);
+          vpx_highbd_subtract_block(
+              bh, bw, src_diff, bw, this_frame->y_buffer + mb_y_offset,
+              this_frame->y_stride, &predictor[0], bw, xd->bd);
         } else {
          vp9_build_inter_predictor(ref_frame[rf_idx]->y_buffer + mb_y_offset,
                                    ref_frame[rf_idx]->y_stride, &predictor[0],
-                                    MI_SIZE, &mv.as_mv, &sf, MI_SIZE, MI_SIZE,
-                                    0, kernel, MV_PRECISION_Q3,
-                                    mi_col * MI_SIZE, mi_row * MI_SIZE);
-          vpx_subtract_block(MI_SIZE, MI_SIZE, src_diff, MI_SIZE,
+                                    bw, &mv.as_mv, &sf, bw, bh, 0, kernel,
+                                    MV_PRECISION_Q3, mi_col * MI_SIZE,
+                                    mi_row * MI_SIZE);
+          vpx_subtract_block(bh, bw, src_diff, bw,
                             this_frame->y_buffer + mb_y_offset,
-                             this_frame->y_stride, &predictor[0], MI_SIZE);
+                             this_frame->y_stride, &predictor[0], bw);
        }
 #else
-        vp9_build_inter_predictor(ref_frame[rf_idx]->y_buffer + mb_y_offset,
-                                  ref_frame[rf_idx]->y_stride, &predictor[0],
-                                  MI_SIZE, &mv.as_mv, &sf, MI_SIZE, MI_SIZE, 0,
-                                  kernel, MV_PRECISION_Q3, mi_col * MI_SIZE,
-                                  mi_row * MI_SIZE);
-        vpx_subtract_block(MI_SIZE, MI_SIZE, src_diff, MI_SIZE,
+        vp9_build_inter_predictor(
+            ref_frame[rf_idx]->y_buffer + mb_y_offset,
+            ref_frame[rf_idx]->y_stride, &predictor[0], bw, &mv.as_mv, &sf, bw,
+            bh, 0, kernel, MV_PRECISION_Q3, mi_col * MI_SIZE, mi_row * MI_SIZE);
+        vpx_subtract_block(bh, bw, src_diff, bw,
                           this_frame->y_buffer + mb_y_offset,
-                           this_frame->y_stride, &predictor[0], MI_SIZE);
+                           this_frame->y_stride, &predictor[0], bw);
 #endif
-        vpx_hadamard_8x8(src_diff, MI_SIZE, coeff);
+        vpx_fdct32x32(src_diff, coeff, bw);
 
-        inter_cost = vpx_satd(coeff, MI_SIZE * MI_SIZE);
+        inter_cost = vpx_satd(coeff, pix_num);
 
         if (inter_cost < best_inter_cost) {
           best_rf_idx = rf_idx;
@@ -5890,7 +5899,7 @@ void mc_flow_dispenser(VP9_COMP *cpi, GF_PICTURE *gf_picture, int frame_idx) {
       tpl_stats->ref_frame_index =
           gf_picture[frame_idx].ref_frame[best_rf_idx];
       tpl_stats->mv.as_int = best_mv.as_int;
-      tpl_model_update(cpi->tpl_stats, tpl_stats, mi_row, mi_col);
+      tpl_model_update(cpi->tpl_stats, tpl_stats, mi_row, mi_col, bsize);
       (void)best_mv;
       (void)best_rf_idx;
     }
diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h
index 2a4a98337..d80aa4ae9 100644
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h
@@ -279,11 +279,11 @@ static INLINE int is_lossless_requested(const VP9EncoderConfig *cfg) {
 }
 
 typedef struct TplDepStats {
-  uint64_t intra_cost;
-  uint64_t inter_cost;
-  uint64_t mc_flow;
-  uint64_t mc_dep_cost;
-  uint64_t mc_ref_cost;
+  int64_t intra_cost;
+  int64_t inter_cost;
+  int64_t mc_flow;
+  int64_t mc_dep_cost;
+  int64_t mc_ref_cost;
 
   int ref_frame_index;
   int_mv mv;
-- 
2.40.0
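
Note on the flow propagation in tpl_model_update(): a motion-compensated
32x32 block lands at an arbitrary pixel offset in the reference frame, so it
straddles at most four cells of the 32x32-aligned stats grid. round_floor()
snaps the displaced position to the top-left cell, get_overlap_area()
measures how much of the block covers each cell (the patch hardcodes the
cell size there as 4 * MI_SIZE, i.e. 32 pixels, while the callers derive
bw/bh from bsize), and each cell's stats receive a share proportional to
overlap_area / pix_num. The sketch below is standalone C, not libvpx code:
the helpers mirror the patched logic but take bw/bh as explicit parameters,
and the block position and motion vector are hypothetical. It checks that
the four overlaps always tile the whole block.

/*
 * Standalone sketch of the overlap-area decomposition used by
 * tpl_model_update(); plain C, not libvpx code. Block indices 0..3 map to
 * the top-left, top-right, bottom-left and bottom-right grid cells,
 * matching the (block >> 1) row / (block & 0x01) column indexing in the
 * patch.
 */
#include <assert.h>
#include <stdio.h>

/* Floor division that also handles negative pixel positions. */
static int round_floor(int ref_pos, int bsize_pix) {
  return (ref_pos < 0) ? -(1 + (-ref_pos - 1) / bsize_pix)
                       : ref_pos / bsize_pix;
}

/* Pixel overlap between the displaced block and grid cell `block`. */
static int get_overlap_area(int grid_pos_row, int grid_pos_col,
                            int ref_pos_row, int ref_pos_col, int block,
                            int bw, int bh) {
  int width = 0, height = 0;
  switch (block) {
    case 0: /* top-left cell */
      width = grid_pos_col + bw - ref_pos_col;
      height = grid_pos_row + bh - ref_pos_row;
      break;
    case 1: /* top-right cell */
      width = ref_pos_col + bw - grid_pos_col;
      height = grid_pos_row + bh - ref_pos_row;
      break;
    case 2: /* bottom-left cell */
      width = grid_pos_col + bw - ref_pos_col;
      height = ref_pos_row + bh - grid_pos_row;
      break;
    case 3: /* bottom-right cell */
      width = ref_pos_col + bw - grid_pos_col;
      height = ref_pos_row + bh - grid_pos_row;
      break;
    default: assert(0);
  }
  return width * height;
}

int main(void) {
  const int bw = 32, bh = 32;  /* BLOCK_32X32 operating size */
  const int pix_num = bw * bh; /* 1024 */
  /* Hypothetical example: block at mi (4, 6), motion vector (-13, +9),
   * so the displaced top-left pixel is (4 * 8 - 13, 6 * 8 + 9). */
  const int ref_pos_row = 4 * 8 - 13; /* 19 */
  const int ref_pos_col = 6 * 8 + 9;  /* 57 */
  const int base_row = round_floor(ref_pos_row, bh) * bh; /* 0 */
  const int base_col = round_floor(ref_pos_col, bw) * bw; /* 32 */
  int block, total = 0;

  for (block = 0; block < 4; ++block) {
    const int grid_pos_row = base_row + bh * (block >> 1);
    const int grid_pos_col = base_col + bw * (block & 0x01);
    const int area = get_overlap_area(grid_pos_row, grid_pos_col, ref_pos_row,
                                      ref_pos_col, block, bw, bh);
    /* The real code additionally skips cells outside the reference frame. */
    printf("cell %d at (%4d, %4d): overlap %4d px\n", block, grid_pos_row,
           grid_pos_col, area);
    total += area;
  }

  /* The four shares always tile the whole block, so dividing each cell's
   * contribution by pix_num distributes the flow exactly. */
  assert(total == pix_num);
  return 0;
}

Because the four shares sum exactly to pix_num, dividing by pix_num
distributes mc_flow without loss. The old right shift by MI_SIZE_LOG2 * 2
was the same normalization specialized to the fixed 8x8 area of 64 pixels;
once the area is derived from bsize, an explicit division is used instead.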
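A related subtlety: round_floor() is not plain integer division because a
motion vector may point above or left of the frame origin, making the
displaced position negative, and C division truncates toward zero rather
than flooring. A minimal standalone check (again not libvpx code):

/* Why round_floor() exists: grid snapping needs floor semantics even for
 * negative pixel positions, which plain C division does not provide. */
#include <assert.h>

static int round_floor(int ref_pos, int bsize_pix) { /* as in the patch */
  return (ref_pos < 0) ? -(1 + (-ref_pos - 1) / bsize_pix)
                       : ref_pos / bsize_pix;
}

int main(void) {
  /* A 32x32 block displaced to pixel row -5 belongs to grid row -1. */
  assert(round_floor(-5, 32) == -1);
  assert(-5 / 32 == 0); /* truncation would wrongly snap to row 0 */
  /* Positive positions behave like plain division. */
  assert(round_floor(57, 32) == 1 && 57 / 32 == 1);
  return 0;
}

The bounds test in tpl_model_update() then skips grid cells that fall
outside the reference frame. The companion change of the TplDepStats fields
from uint64_t to int64_t fits the same arithmetic: quantities such as
(intra_cost - inter_cost) * overlap_area and the mc_flow expression can go
negative, which is presumably why the fields become signed.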