From 359a6796dad01cd399f84e00ce6948640d6309e7 Mon Sep 17 00:00:00 2001 From: Ranjit Kumar Tulabandu Date: Wed, 1 Feb 2017 16:34:49 +0530 Subject: [PATCH] Changes to facilitate row based multi-threading of ARNR filtering Change-Id: I2fd72af00afbbeb903e4fe364611abcc148f2fbb --- vp9/encoder/vp9_encoder.h | 9 + vp9/encoder/vp9_mbgraph.c | 4 +- vp9/encoder/vp9_mcomp.c | 8 +- vp9/encoder/vp9_mcomp.h | 6 +- vp9/encoder/vp9_pickmode.c | 14 +- vp9/encoder/vp9_rdopt.c | 9 +- vp9/encoder/vp9_temporal_filter.c | 468 ++++++++++++++++-------------- 7 files changed, 283 insertions(+), 235 deletions(-) diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index e1046f14a..a295d907a 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -420,6 +420,14 @@ typedef struct { double max_cpb_size; // in bits } LevelConstraint; +typedef struct ARNRFilterData { + YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS]; + int strength; + int frame_count; + int alt_ref_index; + struct scale_factors sf; +} ARNRFilterData; + typedef struct VP9_COMP { QUANTS quants; ThreadData td; @@ -662,6 +670,7 @@ typedef struct VP9_COMP { MultiThreadHandle multi_thread_ctxt; void (*row_mt_sync_read_ptr)(VP9RowMTSync *const, int, int); void (*row_mt_sync_write_ptr)(VP9RowMTSync *const, int, int, const int); + ARNRFilterData arnr_filter_data; int new_mt; // Previous Partition Info diff --git a/vp9/encoder/vp9_mbgraph.c b/vp9/encoder/vp9_mbgraph.c index e000220b9..e9379f418 100644 --- a/vp9/encoder/vp9_mbgraph.c +++ b/vp9/encoder/vp9_mbgraph.c @@ -45,8 +45,8 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi, const MV *ref_mv, mv_sf->search_method = HEX; vp9_full_pixel_search(cpi, x, BLOCK_16X16, &ref_full, step_param, - x->errorperbit, cond_cost_list(cpi, cost_list), ref_mv, - dst_mv, 0, 0); + cpi->sf.mv.search_method, x->errorperbit, + cond_cost_list(cpi, cost_list), ref_mv, dst_mv, 0, 0); mv_sf->search_method = old_search_method; // Try sub-pixel MC diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c index 59b90be14..c8a78eb28 100644 --- a/vp9/encoder/vp9_mcomp.c +++ b/vp9/encoder/vp9_mcomp.c @@ -2364,11 +2364,11 @@ static int is_exhaustive_allowed(VP9_COMP *cpi, MACROBLOCK *x) { } int vp9_full_pixel_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, - MV *mvp_full, int step_param, int error_per_bit, - int *cost_list, const MV *ref_mv, MV *tmp_mv, - int var_max, int rd) { + MV *mvp_full, int step_param, int search_method, + int error_per_bit, int *cost_list, const MV *ref_mv, + MV *tmp_mv, int var_max, int rd) { const SPEED_FEATURES *const sf = &cpi->sf; - const SEARCH_METHODS method = sf->mv.search_method; + const SEARCH_METHODS method = (SEARCH_METHODS)search_method; vp9_variance_fn_ptr_t *fn_ptr = &cpi->fn_ptr[bsize]; int var = 0; if (cost_list) { diff --git a/vp9/encoder/vp9_mcomp.h b/vp9/encoder/vp9_mcomp.h index 2726b9e23..d17b8e9bb 100644 --- a/vp9/encoder/vp9_mcomp.h +++ b/vp9/encoder/vp9_mcomp.h @@ -105,9 +105,9 @@ int vp9_refining_search_8p_c(const MACROBLOCK *x, MV *ref_mv, int error_per_bit, struct VP9_COMP; int vp9_full_pixel_search(struct VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, - MV *mvp_full, int step_param, int error_per_bit, - int *cost_list, const MV *ref_mv, MV *tmp_mv, - int var_max, int rd); + MV *mvp_full, int step_param, int search_method, + int error_per_bit, int *cost_list, const MV *ref_mv, + MV *tmp_mv, int var_max, int rd); #ifdef __cplusplus } // extern "C" diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c index 
e7374263c..d3fc3dce7 100644 --- a/vp9/encoder/vp9_pickmode.c +++ b/vp9/encoder/vp9_pickmode.c @@ -184,9 +184,9 @@ static int combined_motion_search(VP9_COMP *cpi, MACROBLOCK *x, else center_mv = tmp_mv->as_mv; - vp9_full_pixel_search(cpi, x, bsize, &mvp_full, step_param, sadpb, - cond_cost_list(cpi, cost_list), ¢er_mv, - &tmp_mv->as_mv, INT_MAX, 0); + vp9_full_pixel_search( + cpi, x, bsize, &mvp_full, step_param, cpi->sf.mv.search_method, sadpb, + cond_cost_list(cpi, cost_list), ¢er_mv, &tmp_mv->as_mv, INT_MAX, 0); x->mv_limits = tmp_mv_limits; @@ -2260,10 +2260,10 @@ void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, int mi_row, vp9_set_mv_search_range(&x->mv_limits, &mbmi_ext->ref_mvs[0]->as_mv); - vp9_full_pixel_search(cpi, x, bsize, &mvp_full, step_param, - x->sadperbit4, cond_cost_list(cpi, cost_list), - &mbmi_ext->ref_mvs[ref_frame][0].as_mv, - &tmp_mv, INT_MAX, 0); + vp9_full_pixel_search( + cpi, x, bsize, &mvp_full, step_param, cpi->sf.mv.search_method, + x->sadperbit4, cond_cost_list(cpi, cost_list), + &mbmi_ext->ref_mvs[ref_frame][0].as_mv, &tmp_mv, INT_MAX, 0); x->mv_limits = tmp_mv_limits; diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index 27d4e9d6d..31fed26ad 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -1998,7 +1998,8 @@ static int64_t rd_pick_best_sub8x8_mode( vp9_set_mv_search_range(&x->mv_limits, &bsi->ref_mv[0]->as_mv); bestsme = vp9_full_pixel_search( - cpi, x, bsize, &mvp_full, step_param, sadpb, + cpi, x, bsize, &mvp_full, step_param, cpi->sf.mv.search_method, + sadpb, sf->mv.subpel_search_method != SUBPEL_TREE ? cost_list : NULL, &bsi->ref_mv[0]->as_mv, new_mv, INT_MAX, 1); @@ -2403,9 +2404,9 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, mvp_full.col >>= 3; mvp_full.row >>= 3; - bestsme = vp9_full_pixel_search(cpi, x, bsize, &mvp_full, step_param, sadpb, - cond_cost_list(cpi, cost_list), &ref_mv, - &tmp_mv->as_mv, INT_MAX, 1); + bestsme = vp9_full_pixel_search( + cpi, x, bsize, &mvp_full, step_param, cpi->sf.mv.search_method, sadpb, + cond_cost_list(cpi, cost_list), &ref_mv, &tmp_mv->as_mv, INT_MAX, 1); x->mv_limits = tmp_mv_limits; diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c index e2770281b..ae0fcc766 100644 --- a/vp9/encoder/vp9_temporal_filter.c +++ b/vp9/encoder/vp9_temporal_filter.c @@ -15,6 +15,7 @@ #include "vp9/common/vp9_onyxc_int.h" #include "vp9/common/vp9_quant_common.h" #include "vp9/common/vp9_reconinter.h" +#include "vp9/encoder/vp9_encodeframe.h" #include "vp9/encoder/vp9_extend.h" #include "vp9/encoder/vp9_firstpass.h" #include "vp9/encoder/vp9_mcomp.h" @@ -209,13 +210,14 @@ void vp9_highbd_temporal_filter_apply_c( #endif // CONFIG_VP9_HIGHBITDEPTH static uint32_t temporal_filter_find_matching_mb_c(VP9_COMP *cpi, + ThreadData *td, uint8_t *arf_frame_buf, uint8_t *frame_ptr_buf, - int stride) { - MACROBLOCK *const x = &cpi->td.mb; + int stride, MV *ref_mv) { + MACROBLOCK *const x = &td->mb; MACROBLOCKD *const xd = &x->e_mbd; MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv; - const SEARCH_METHODS old_search_method = mv_sf->search_method; + const SEARCH_METHODS search_method = HEX; int step_param; int sadpb = x->sadperbit16; uint32_t bestsme = UINT_MAX; @@ -225,7 +227,6 @@ static uint32_t temporal_filter_find_matching_mb_c(VP9_COMP *cpi, MV best_ref_mv1 = { 0, 0 }; MV best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */ - MV *ref_mv = &x->e_mbd.mi[0]->bmi[0].as_mv[0].as_mv; // Save input state struct buf_2d src = 
x->plane[0].src; @@ -243,11 +244,9 @@ static uint32_t temporal_filter_find_matching_mb_c(VP9_COMP *cpi, step_param = mv_sf->reduce_first_step_size; step_param = VPXMIN(step_param, MAX_MVSEARCH_STEPS - 2); - mv_sf->search_method = HEX; vp9_full_pixel_search(cpi, x, BLOCK_16X16, &best_ref_mv1_full, step_param, - sadpb, cond_cost_list(cpi, cost_list), &best_ref_mv1, - ref_mv, 0, 0); - mv_sf->search_method = old_search_method; + search_method, sadpb, cond_cost_list(cpi, cost_list), + &best_ref_mv1, ref_mv, 0, 0); // Ignore mv costing by sending NULL pointer instead of cost array bestsme = cpi->find_fractional_mv_step( @@ -263,22 +262,24 @@ static uint32_t temporal_filter_find_matching_mb_c(VP9_COMP *cpi, return bestsme; } -static void temporal_filter_iterate_c(VP9_COMP *cpi, - YV12_BUFFER_CONFIG **frames, - int frame_count, int alt_ref_index, - int strength, - struct scale_factors *scale) { +static void temporal_filter_iterate_row_c(VP9_COMP *cpi, ThreadData *td, + int mb_row, int mb_col_start, + int mb_col_end) { + ARNRFilterData *arnr_filter_data = &cpi->arnr_filter_data; + YV12_BUFFER_CONFIG **frames = arnr_filter_data->frames; + int frame_count = arnr_filter_data->frame_count; + int alt_ref_index = arnr_filter_data->alt_ref_index; + int strength = arnr_filter_data->strength; + struct scale_factors *scale = &arnr_filter_data->sf; int byte; int frame; - int mb_col, mb_row; + int mb_col; unsigned int filter_weight; int mb_cols = (frames[alt_ref_index]->y_crop_width + 15) >> 4; int mb_rows = (frames[alt_ref_index]->y_crop_height + 15) >> 4; - int mb_y_offset = 0; - int mb_uv_offset = 0; DECLARE_ALIGNED(16, unsigned int, accumulator[16 * 16 * 3]); DECLARE_ALIGNED(16, uint16_t, count[16 * 16 * 3]); - MACROBLOCKD *mbd = &cpi->td.mb.e_mbd; + MACROBLOCKD *mbd = &td->mb.e_mbd; YV12_BUFFER_CONFIG *f = frames[alt_ref_index]; uint8_t *dst1, *dst2; #if CONFIG_VP9_HIGHBITDEPTH @@ -290,10 +291,11 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi, #endif const int mb_uv_height = 16 >> mbd->plane[1].subsampling_y; const int mb_uv_width = 16 >> mbd->plane[1].subsampling_x; + // Addition of the tile col level offsets + int mb_y_offset = mb_row * 16 * (f->y_stride) + 16 * mb_col_start; + int mb_uv_offset = + mb_row * mb_uv_height * f->uv_stride + mb_uv_width * mb_col_start; - // Save input state - uint8_t *input_buffer[MAX_MB_PLANE]; - int i; #if CONFIG_VP9_HIGHBITDEPTH if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { predictor = CONVERT_TO_BYTEPTR(predictor16); @@ -302,102 +304,83 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi, } #endif - for (i = 0; i < MAX_MB_PLANE; i++) input_buffer[i] = mbd->plane[i].pre[0].buf; - - for (mb_row = 0; mb_row < mb_rows; mb_row++) { - // Source frames are extended to 16 pixels. This is different than - // L/A/G reference frames that have a border of 32 (VP9ENCBORDERINPIXELS) - // A 6/8 tap filter is used for motion search. This requires 2 pixels - // before and 3 pixels after. So the largest Y mv on a border would - // then be 16 - VP9_INTERP_EXTEND. The UV blocks are half the size of the - // Y and therefore only extended by 8. The largest mv that a UV block - // can support is 8 - VP9_INTERP_EXTEND. A UV mv is half of a Y mv. - // (16 - VP9_INTERP_EXTEND) >> 1 which is greater than - // 8 - VP9_INTERP_EXTEND. - // To keep the mv in play for both Y and UV planes the max that it - // can be on a border is therefore 16 - (2*VP9_INTERP_EXTEND+1). 
- cpi->td.mb.mv_limits.row_min = - -((mb_row * 16) + (17 - 2 * VP9_INTERP_EXTEND)); - cpi->td.mb.mv_limits.row_max = - ((mb_rows - 1 - mb_row) * 16) + (17 - 2 * VP9_INTERP_EXTEND); - - for (mb_col = 0; mb_col < mb_cols; mb_col++) { - int i, j, k; - int stride; - - memset(accumulator, 0, 16 * 16 * 3 * sizeof(accumulator[0])); - memset(count, 0, 16 * 16 * 3 * sizeof(count[0])); - - cpi->td.mb.mv_limits.col_min = - -((mb_col * 16) + (17 - 2 * VP9_INTERP_EXTEND)); - cpi->td.mb.mv_limits.col_max = - ((mb_cols - 1 - mb_col) * 16) + (17 - 2 * VP9_INTERP_EXTEND); - - for (frame = 0; frame < frame_count; frame++) { - const uint32_t thresh_low = 10000; - const uint32_t thresh_high = 20000; - - if (frames[frame] == NULL) continue; - - mbd->mi[0]->bmi[0].as_mv[0].as_mv.row = 0; - mbd->mi[0]->bmi[0].as_mv[0].as_mv.col = 0; - - if (frame == alt_ref_index) { - filter_weight = 2; - } else { - // Find best match in this frame by MC - uint32_t err = temporal_filter_find_matching_mb_c( - cpi, frames[alt_ref_index]->y_buffer + mb_y_offset, - frames[frame]->y_buffer + mb_y_offset, frames[frame]->y_stride); - - // Assign higher weight to matching MB if its error - // score is lower. If not applying MC default behavior - // is to weight all MBs equal. - filter_weight = err < thresh_low ? 2 : err < thresh_high ? 1 : 0; - } + // Source frames are extended to 16 pixels. This is different than + // L/A/G reference frames that have a border of 32 (VP9ENCBORDERINPIXELS) + // A 6/8 tap filter is used for motion search. This requires 2 pixels + // before and 3 pixels after. So the largest Y mv on a border would + // then be 16 - VP9_INTERP_EXTEND. The UV blocks are half the size of the + // Y and therefore only extended by 8. The largest mv that a UV block + // can support is 8 - VP9_INTERP_EXTEND. A UV mv is half of a Y mv. + // (16 - VP9_INTERP_EXTEND) >> 1 which is greater than + // 8 - VP9_INTERP_EXTEND. + // To keep the mv in play for both Y and UV planes the max that it + // can be on a border is therefore 16 - (2*VP9_INTERP_EXTEND+1). + td->mb.mv_limits.row_min = -((mb_row * 16) + (17 - 2 * VP9_INTERP_EXTEND)); + td->mb.mv_limits.row_max = + ((mb_rows - 1 - mb_row) * 16) + (17 - 2 * VP9_INTERP_EXTEND); + + for (mb_col = mb_col_start; mb_col < mb_col_end; mb_col++) { + int i, j, k; + int stride; + MV ref_mv; + + memset(accumulator, 0, 16 * 16 * 3 * sizeof(accumulator[0])); + memset(count, 0, 16 * 16 * 3 * sizeof(count[0])); + + td->mb.mv_limits.col_min = -((mb_col * 16) + (17 - 2 * VP9_INTERP_EXTEND)); + td->mb.mv_limits.col_max = + ((mb_cols - 1 - mb_col) * 16) + (17 - 2 * VP9_INTERP_EXTEND); + + for (frame = 0; frame < frame_count; frame++) { + const uint32_t thresh_low = 10000; + const uint32_t thresh_high = 20000; + + if (frames[frame] == NULL) continue; + + ref_mv.row = 0; + ref_mv.col = 0; + + if (frame == alt_ref_index) { + filter_weight = 2; + } else { + // Find best match in this frame by MC + uint32_t err = temporal_filter_find_matching_mb_c( + cpi, td, frames[alt_ref_index]->y_buffer + mb_y_offset, + frames[frame]->y_buffer + mb_y_offset, frames[frame]->y_stride, + &ref_mv); + + // Assign higher weight to matching MB if its error + // score is lower. If not applying MC default behavior + // is to weight all MBs equal. + filter_weight = err < thresh_low ? 2 : err < thresh_high ? 
1 : 0; + } - if (filter_weight != 0) { - // Construct the predictors - temporal_filter_predictors_mb_c( - mbd, frames[frame]->y_buffer + mb_y_offset, - frames[frame]->u_buffer + mb_uv_offset, - frames[frame]->v_buffer + mb_uv_offset, frames[frame]->y_stride, - mb_uv_width, mb_uv_height, mbd->mi[0]->bmi[0].as_mv[0].as_mv.row, - mbd->mi[0]->bmi[0].as_mv[0].as_mv.col, predictor, scale, - mb_col * 16, mb_row * 16); + if (filter_weight != 0) { + // Construct the predictors + temporal_filter_predictors_mb_c( + mbd, frames[frame]->y_buffer + mb_y_offset, + frames[frame]->u_buffer + mb_uv_offset, + frames[frame]->v_buffer + mb_uv_offset, frames[frame]->y_stride, + mb_uv_width, mb_uv_height, ref_mv.row, ref_mv.col, predictor, scale, + mb_col * 16, mb_row * 16); #if CONFIG_VP9_HIGHBITDEPTH - if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - int adj_strength = strength + 2 * (mbd->bd - 8); - // Apply the filter (YUV) - vp9_highbd_temporal_filter_apply_c( - f->y_buffer + mb_y_offset, f->y_stride, predictor, 16, 16, - adj_strength, filter_weight, accumulator, count); - vp9_highbd_temporal_filter_apply_c( - f->u_buffer + mb_uv_offset, f->uv_stride, predictor + 256, - mb_uv_width, mb_uv_height, adj_strength, filter_weight, - accumulator + 256, count + 256); - vp9_highbd_temporal_filter_apply_c( - f->v_buffer + mb_uv_offset, f->uv_stride, predictor + 512, - mb_uv_width, mb_uv_height, adj_strength, filter_weight, - accumulator + 512, count + 512); - } else { - // Apply the filter (YUV) - vp9_temporal_filter_apply_c(f->y_buffer + mb_y_offset, f->y_stride, - predictor, 16, 16, strength, - filter_weight, accumulator, count); - vp9_temporal_filter_apply_c( - f->u_buffer + mb_uv_offset, f->uv_stride, predictor + 256, - mb_uv_width, mb_uv_height, strength, filter_weight, - accumulator + 256, count + 256); - vp9_temporal_filter_apply_c( - f->v_buffer + mb_uv_offset, f->uv_stride, predictor + 512, - mb_uv_width, mb_uv_height, strength, filter_weight, - accumulator + 512, count + 512); - } -#else + if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + int adj_strength = strength + 2 * (mbd->bd - 8); + // Apply the filter (YUV) + vp9_highbd_temporal_filter_apply_c( + f->y_buffer + mb_y_offset, f->y_stride, predictor, 16, 16, + adj_strength, filter_weight, accumulator, count); + vp9_highbd_temporal_filter_apply_c( + f->u_buffer + mb_uv_offset, f->uv_stride, predictor + 256, + mb_uv_width, mb_uv_height, adj_strength, filter_weight, + accumulator + 256, count + 256); + vp9_highbd_temporal_filter_apply_c( + f->v_buffer + mb_uv_offset, f->uv_stride, predictor + 512, + mb_uv_width, mb_uv_height, adj_strength, filter_weight, + accumulator + 512, count + 512); + } else { // Apply the filter (YUV) - // TODO(jingning): Need SIMD optimization for this. vp9_temporal_filter_apply_c(f->y_buffer + mb_y_offset, f->y_stride, predictor, 16, 16, strength, filter_weight, accumulator, count); @@ -409,108 +392,78 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi, predictor + 512, mb_uv_width, mb_uv_height, strength, filter_weight, accumulator + 512, count + 512); -#endif // CONFIG_VP9_HIGHBITDEPTH } +#else + // Apply the filter (YUV) + // TODO(jingning): Need SIMD optimization for this. 
+ vp9_temporal_filter_apply_c(f->y_buffer + mb_y_offset, f->y_stride, + predictor, 16, 16, strength, filter_weight, + accumulator, count); + vp9_temporal_filter_apply_c(f->u_buffer + mb_uv_offset, f->uv_stride, + predictor + 256, mb_uv_width, mb_uv_height, + strength, filter_weight, accumulator + 256, + count + 256); + vp9_temporal_filter_apply_c(f->v_buffer + mb_uv_offset, f->uv_stride, + predictor + 512, mb_uv_width, mb_uv_height, + strength, filter_weight, accumulator + 512, + count + 512); +#endif // CONFIG_VP9_HIGHBITDEPTH } + } #if CONFIG_VP9_HIGHBITDEPTH - if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - uint16_t *dst1_16; - uint16_t *dst2_16; - // Normalize filter output to produce AltRef frame - dst1 = cpi->alt_ref_buffer.y_buffer; - dst1_16 = CONVERT_TO_SHORTPTR(dst1); - stride = cpi->alt_ref_buffer.y_stride; - byte = mb_y_offset; - for (i = 0, k = 0; i < 16; i++) { - for (j = 0; j < 16; j++, k++) { - unsigned int pval = accumulator[k] + (count[k] >> 1); - pval *= fixed_divide[count[k]]; - pval >>= 19; - - dst1_16[byte] = (uint16_t)pval; - - // move to next pixel - byte++; - } + if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + uint16_t *dst1_16; + uint16_t *dst2_16; + // Normalize filter output to produce AltRef frame + dst1 = cpi->alt_ref_buffer.y_buffer; + dst1_16 = CONVERT_TO_SHORTPTR(dst1); + stride = cpi->alt_ref_buffer.y_stride; + byte = mb_y_offset; + for (i = 0, k = 0; i < 16; i++) { + for (j = 0; j < 16; j++, k++) { + unsigned int pval = accumulator[k] + (count[k] >> 1); + pval *= fixed_divide[count[k]]; + pval >>= 19; - byte += stride - 16; + dst1_16[byte] = (uint16_t)pval; + + // move to next pixel + byte++; } - dst1 = cpi->alt_ref_buffer.u_buffer; - dst2 = cpi->alt_ref_buffer.v_buffer; - dst1_16 = CONVERT_TO_SHORTPTR(dst1); - dst2_16 = CONVERT_TO_SHORTPTR(dst2); - stride = cpi->alt_ref_buffer.uv_stride; - byte = mb_uv_offset; - for (i = 0, k = 256; i < mb_uv_height; i++) { - for (j = 0; j < mb_uv_width; j++, k++) { - int m = k + 256; - - // U - unsigned int pval = accumulator[k] + (count[k] >> 1); - pval *= fixed_divide[count[k]]; - pval >>= 19; - dst1_16[byte] = (uint16_t)pval; - - // V - pval = accumulator[m] + (count[m] >> 1); - pval *= fixed_divide[count[m]]; - pval >>= 19; - dst2_16[byte] = (uint16_t)pval; - - // move to next pixel - byte++; - } + byte += stride - 16; + } - byte += stride - mb_uv_width; - } - } else { - // Normalize filter output to produce AltRef frame - dst1 = cpi->alt_ref_buffer.y_buffer; - stride = cpi->alt_ref_buffer.y_stride; - byte = mb_y_offset; - for (i = 0, k = 0; i < 16; i++) { - for (j = 0; j < 16; j++, k++) { - unsigned int pval = accumulator[k] + (count[k] >> 1); - pval *= fixed_divide[count[k]]; - pval >>= 19; - - dst1[byte] = (uint8_t)pval; - - // move to next pixel - byte++; - } - byte += stride - 16; - } + dst1 = cpi->alt_ref_buffer.u_buffer; + dst2 = cpi->alt_ref_buffer.v_buffer; + dst1_16 = CONVERT_TO_SHORTPTR(dst1); + dst2_16 = CONVERT_TO_SHORTPTR(dst2); + stride = cpi->alt_ref_buffer.uv_stride; + byte = mb_uv_offset; + for (i = 0, k = 256; i < mb_uv_height; i++) { + for (j = 0; j < mb_uv_width; j++, k++) { + int m = k + 256; - dst1 = cpi->alt_ref_buffer.u_buffer; - dst2 = cpi->alt_ref_buffer.v_buffer; - stride = cpi->alt_ref_buffer.uv_stride; - byte = mb_uv_offset; - for (i = 0, k = 256; i < mb_uv_height; i++) { - for (j = 0; j < mb_uv_width; j++, k++) { - int m = k + 256; - - // U - unsigned int pval = accumulator[k] + (count[k] >> 1); - pval *= fixed_divide[count[k]]; - pval >>= 19; - dst1[byte] = (uint8_t)pval; 
- - // V - pval = accumulator[m] + (count[m] >> 1); - pval *= fixed_divide[count[m]]; - pval >>= 19; - dst2[byte] = (uint8_t)pval; - - // move to next pixel - byte++; - } - byte += stride - mb_uv_width; + // U + unsigned int pval = accumulator[k] + (count[k] >> 1); + pval *= fixed_divide[count[k]]; + pval >>= 19; + dst1_16[byte] = (uint16_t)pval; + + // V + pval = accumulator[m] + (count[m] >> 1); + pval *= fixed_divide[count[m]]; + pval >>= 19; + dst2_16[byte] = (uint16_t)pval; + + // move to next pixel + byte++; } + + byte += stride - mb_uv_width; } -#else + } else { // Normalize filter output to produce AltRef frame dst1 = cpi->alt_ref_buffer.y_buffer; stride = cpi->alt_ref_buffer.y_stride; @@ -554,12 +507,93 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi, } byte += stride - mb_uv_width; } + } +#else + // Normalize filter output to produce AltRef frame + dst1 = cpi->alt_ref_buffer.y_buffer; + stride = cpi->alt_ref_buffer.y_stride; + byte = mb_y_offset; + for (i = 0, k = 0; i < 16; i++) { + for (j = 0; j < 16; j++, k++) { + unsigned int pval = accumulator[k] + (count[k] >> 1); + pval *= fixed_divide[count[k]]; + pval >>= 19; + + dst1[byte] = (uint8_t)pval; + + // move to next pixel + byte++; + } + byte += stride - 16; + } + + dst1 = cpi->alt_ref_buffer.u_buffer; + dst2 = cpi->alt_ref_buffer.v_buffer; + stride = cpi->alt_ref_buffer.uv_stride; + byte = mb_uv_offset; + for (i = 0, k = 256; i < mb_uv_height; i++) { + for (j = 0; j < mb_uv_width; j++, k++) { + int m = k + 256; + + // U + unsigned int pval = accumulator[k] + (count[k] >> 1); + pval *= fixed_divide[count[k]]; + pval >>= 19; + dst1[byte] = (uint8_t)pval; + + // V + pval = accumulator[m] + (count[m] >> 1); + pval *= fixed_divide[count[m]]; + pval >>= 19; + dst2[byte] = (uint8_t)pval; + + // move to next pixel + byte++; + } + byte += stride - mb_uv_width; + } #endif // CONFIG_VP9_HIGHBITDEPTH - mb_y_offset += 16; - mb_uv_offset += mb_uv_width; + mb_y_offset += 16; + mb_uv_offset += mb_uv_width; + } +} + +static void temporal_filter_iterate_tile_c(VP9_COMP *cpi, int tile_row, + int tile_col) { + VP9_COMMON *const cm = &cpi->common; + const int tile_cols = 1 << cm->log2_tile_cols; + TileInfo *tile_info = + &cpi->tile_data[tile_row * tile_cols + tile_col].tile_info; + const int mb_row_start = (tile_info->mi_row_start) >> 1; + const int mb_row_end = (tile_info->mi_row_end + 1) >> 1; + const int mb_col_start = (tile_info->mi_col_start) >> 1; + const int mb_col_end = (tile_info->mi_col_end + 1) >> 1; + int mb_row; + + for (mb_row = mb_row_start; mb_row < mb_row_end; mb_row++) { + temporal_filter_iterate_row_c(cpi, &cpi->td, mb_row, mb_col_start, + mb_col_end); + } +} + +static void temporal_filter_iterate_c(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + const int tile_cols = 1 << cm->log2_tile_cols; + const int tile_rows = 1 << cm->log2_tile_rows; + int tile_row, tile_col; + MACROBLOCKD *mbd = &cpi->td.mb.e_mbd; + // Save input state + uint8_t *input_buffer[MAX_MB_PLANE]; + int i; + + for (i = 0; i < MAX_MB_PLANE; i++) input_buffer[i] = mbd->plane[i].pre[0].buf; + + vp9_init_tile_data(cpi); + + for (tile_row = 0; tile_row < tile_rows; ++tile_row) { + for (tile_col = 0; tile_col < tile_cols; ++tile_col) { + temporal_filter_iterate_tile_c(cpi, tile_row, tile_col); } - mb_y_offset += 16 * (f->y_stride - mb_cols); - mb_uv_offset += mb_uv_height * f->uv_stride - mb_uv_width * mb_cols; } // Restore input state @@ -638,14 +672,15 @@ void vp9_temporal_filter(VP9_COMP *cpi, int distance) { VP9_COMMON *const cm = &cpi->common; 
RATE_CONTROL *const rc = &cpi->rc; MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; + ARNRFilterData *arnr_filter_data = &cpi->arnr_filter_data; int frame; int frames_to_blur; int start_frame; int strength; int frames_to_blur_backward; int frames_to_blur_forward; - struct scale_factors sf; - YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS] = { NULL }; + struct scale_factors *sf = &arnr_filter_data->sf; + YV12_BUFFER_CONFIG **frames = arnr_filter_data->frames; int rdmult; // Apply context specific adjustments to the arnr filter parameters. @@ -654,6 +689,10 @@ void vp9_temporal_filter(VP9_COMP *cpi, int distance) { frames_to_blur_forward = ((frames_to_blur - 1) / 2); start_frame = distance + frames_to_blur_forward; + arnr_filter_data->strength = strength; + arnr_filter_data->frame_count = frames_to_blur; + arnr_filter_data->alt_ref_index = frames_to_blur_backward; + // Setup frame pointers, NULL indicates frame not included in filter. for (frame = 0; frame < frames_to_blur; ++frame) { const int which_buffer = start_frame - frame; @@ -671,13 +710,13 @@ void vp9_temporal_filter(VP9_COMP *cpi, int distance) { int frame_used = 0; #if CONFIG_VP9_HIGHBITDEPTH vp9_setup_scale_factors_for_frame( - &sf, get_frame_new_buffer(cm)->y_crop_width, + sf, get_frame_new_buffer(cm)->y_crop_width, get_frame_new_buffer(cm)->y_crop_height, get_frame_new_buffer(cm)->y_crop_width, get_frame_new_buffer(cm)->y_crop_height, cm->use_highbitdepth); #else vp9_setup_scale_factors_for_frame( - &sf, get_frame_new_buffer(cm)->y_crop_width, + sf, get_frame_new_buffer(cm)->y_crop_width, get_frame_new_buffer(cm)->y_crop_height, get_frame_new_buffer(cm)->y_crop_width, get_frame_new_buffer(cm)->y_crop_height); @@ -709,12 +748,12 @@ void vp9_temporal_filter(VP9_COMP *cpi, int distance) { // ARF is produced at the native frame size and resized when coded. #if CONFIG_VP9_HIGHBITDEPTH vp9_setup_scale_factors_for_frame( - &sf, frames[0]->y_crop_width, frames[0]->y_crop_height, + sf, frames[0]->y_crop_width, frames[0]->y_crop_height, frames[0]->y_crop_width, frames[0]->y_crop_height, cm->use_highbitdepth); #else vp9_setup_scale_factors_for_frame( - &sf, frames[0]->y_crop_width, frames[0]->y_crop_height, + sf, frames[0]->y_crop_width, frames[0]->y_crop_height, frames[0]->y_crop_width, frames[0]->y_crop_height); #endif // CONFIG_VP9_HIGHBITDEPTH } @@ -726,6 +765,5 @@ void vp9_temporal_filter(VP9_COMP *cpi, int distance) { set_error_per_bit(&cpi->td.mb, rdmult); vp9_initialize_me_consts(cpi, &cpi->td.mb, ARNR_FILT_QINDEX); - temporal_filter_iterate_c(cpi, frames, frames_to_blur, - frames_to_blur_backward, strength, &sf); + temporal_filter_iterate_c(cpi); } -- 2.40.0
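How the pieces above fit together: the explicit search_method argument removes the first piece of shared mutable state that row-parallel ARNR filtering would otherwise race on. Callers that wanted a specific full-pixel search used to select it by temporarily overwriting cpi->sf.mv.search_method around the call and restoring it afterwards; with several rows searching in parallel that write would race. vp9_full_pixel_search() therefore gains a search_method parameter, and the non-filter call sites pass cpi->sf.mv.search_method so their behaviour is unchanged. A side-by-side sketch of the temporal-filter call, using the identifiers from the patch (comments are editorial):

/* Before: select HEX by mutating the shared speed features. */
const SEARCH_METHODS old_search_method = mv_sf->search_method;
mv_sf->search_method = HEX;
vp9_full_pixel_search(cpi, x, BLOCK_16X16, &best_ref_mv1_full, step_param,
                      sadpb, cond_cost_list(cpi, cost_list), &best_ref_mv1,
                      ref_mv, 0, 0);
mv_sf->search_method = old_search_method;

/* After: the method is a per-call argument; no write to cpi->sf. */
const SEARCH_METHODS search_method = HEX;
vp9_full_pixel_search(cpi, x, BLOCK_16X16, &best_ref_mv1_full, step_param,
                      search_method, sadpb, cond_cost_list(cpi, cost_list),
                      &best_ref_mv1, ref_mv, 0, 0);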
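In the same spirit, temporal_filter_find_matching_mb_c() is made callable from any worker thread. Its new signature, as defined in the patch:

static uint32_t temporal_filter_find_matching_mb_c(VP9_COMP *cpi,
                                                   ThreadData *td,
                                                   uint8_t *arf_frame_buf,
                                                   uint8_t *frame_ptr_buf,
                                                   int stride, MV *ref_mv);

Three things change together: it operates on the caller's ThreadData (td->mb) instead of reaching into cpi->td.mb, it reports the best motion vector through the caller-provided ref_mv (a local MV in the per-macroblock filtering loop) rather than writing it into mbd->mi[0]->bmi[0].as_mv, and it uses the fixed HEX search method shown above instead of the save/overwrite/restore dance on cpi->sf.mv.search_method. What it still reads from cpi is effectively read-only during filtering.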
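The large vp9_temporal_filter.c restructuring is then a mechanical split of the old whole-frame loop into three levels: temporal_filter_iterate_row_c() filters one macroblock row over [mb_col_start, mb_col_end), temporal_filter_iterate_tile_c() walks the rows of one tile, and temporal_filter_iterate_c() walks the tiles after vp9_init_tile_data(). The detail that makes a row an independent unit of work is that the buffer offsets are now computed from the row/column position instead of being accumulated while scanning the frame in order:

int mb_y_offset = mb_row * 16 * (f->y_stride) + 16 * mb_col_start;
int mb_uv_offset =
    mb_row * mb_uv_height * f->uv_stride + mb_uv_width * mb_col_start;

Together with the per-row mv_limits derived from mb_row, this means rows can be filtered in any order, by any thread, with identical output. The filter parameters themselves (frames[], strength, frame_count, alt_ref_index, scale factors) travel through the new cpi->arnr_filter_data, filled once by vp9_temporal_filter(), so a row job needs nothing beyond (cpi, td, mb_row, column range).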
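With that in place, hooking the filter into the encoder's row-based multi-threading becomes a matter of handing out mb_row values to workers. This patch deliberately stops short of adding the dispatcher; the sketch below only illustrates the intended shape, and temporal_filter_worker_hook() / get_next_arnr_row() are hypothetical names, not functions added here:

/* Hypothetical worker hook (not part of this patch): each worker pulls rows
 * from a shared hand-out and filters them with its own ThreadData. */
static int temporal_filter_worker_hook(void *arg1, void *arg2) {
  VP9_COMP *const cpi = (VP9_COMP *)arg1;
  ThreadData *const td = (ThreadData *)arg2;
  const ARNRFilterData *const arnr = &cpi->arnr_filter_data;
  const int mb_cols =
      (arnr->frames[arnr->alt_ref_index]->y_crop_width + 15) >> 4;
  int mb_row;
  /* get_next_arnr_row() stands in for whatever synchronized job hand-out the
   * follow-up multi-threading change uses; assume it returns -1 when done. */
  while ((mb_row = get_next_arnr_row(cpi)) != -1)
    temporal_filter_iterate_row_c(cpi, td, mb_row, 0, mb_cols);
  return 1;
}

Because each row writes only its own slice of cpi->alt_ref_buffer and computes its own MV limits, such a scheme should need synchronization only for the row hand-out, not for the filter output itself.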