From 037d67f684683ffad22e38ab9a6381ccfedd813f Mon Sep 17 00:00:00 2001
From: Angie Chiang
Date: Fri, 12 Jul 2019 16:15:55 -0700
Subject: [PATCH] Use sdx8f in exhaustive_mesh_search_single_step

This speeds up non_greedy_mv by 4%.

Change-Id: I9288c88db56ea4201a7ec4493ca5c567d76af0f1
---
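Illustration (not part of the commit): the patch replaces the fixed
4-at-a-time SAD loop in exhaustive_mesh_search_single_step with a tiered
scan over each row of candidate motion vectors: batches of 8 through the
new sdx8f pointer when the block size provides one, then batches of 4
through sdx4df, then single sdf calls for the remainder. A self-contained
C sketch of that dispatch pattern follows; batch8_fn, batch4_fn, scalar_fn
and the fake_* helpers are hypothetical stand-ins for fn_ptr->sdx8f,
fn_ptr->sdx4df and fn_ptr->sdf, and passing NULL for the batch-of-8 kernel
models a block size with no sdx8f implementation.

#include <stdio.h>

typedef void (*batch8_fn)(int c, unsigned int sads[8]);
typedef void (*batch4_fn)(int c, unsigned int sads[4]);
typedef unsigned int (*scalar_fn)(int c);

/* Stand-in cost function so the sketch is runnable. */
static unsigned int fake_sad(int c) { return (unsigned int)((c * c) % 97); }

static void fake_sad_x8(int c, unsigned int sads[8]) {
  int i;
  for (i = 0; i < 8; ++i) sads[i] = fake_sad(c + i);
}

static void fake_sad_x4(int c, unsigned int sads[4]) {
  int i;
  for (i = 0; i < 4; ++i) sads[i] = fake_sad(c + i);
}

/* Scan columns start_col..end_col inclusive: 8 at a time while a
 * batch-of-8 kernel exists and fits, then 4 at a time, then one by one. */
static unsigned int scan_row(int start_col, int end_col, batch8_fn sdx8f,
                             batch4_fn sdx4df, scalar_fn sdf) {
  unsigned int best = ~0u;
  int c = start_col, i;
  if (sdx8f != NULL) { /* 8-wide tier: only when the kernel exists */
    while (c + 7 <= end_col) {
      unsigned int sads[8];
      sdx8f(c, sads);
      for (i = 0; i < 8; ++i)
        if (sads[i] < best) best = sads[i];
      c += 8;
    }
  }
  while (c + 3 <= end_col) { /* 4-wide tier */
    unsigned int sads[4];
    sdx4df(c, sads);
    for (i = 0; i < 4; ++i)
      if (sads[i] < best) best = sads[i];
    c += 4;
  }
  while (c <= end_col) { /* scalar tail */
    unsigned int sad = sdf(c);
    if (sad < best) best = sad;
    c += 1;
  }
  return best;
}

int main(void) {
  /* Same answer either way; only the batching differs. */
  printf("with sdx8f:    %u\n",
         scan_row(-13, 13, fake_sad_x8, fake_sad_x4, fake_sad));
  printf("without sdx8f: %u\n",
         scan_row(-13, 13, NULL, fake_sad_x4, fake_sad));
  return 0;
}

Both calls print the same best value; the tiers change only how many
candidates a single function call covers, which is where the speedup
comes from.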
 vp9/encoder/vp9_encoder.c | 51 ++++++++++++++++-------------
 vp9/encoder/vp9_mcomp.c   | 68 +++++++++++++++++++++++++--------------
 vpx_dsp/variance.h        |  1 +
 3 files changed, 74 insertions(+), 46 deletions(-)

diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index 395cc0aa3..5bbad5ca5 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -1513,13 +1513,15 @@ static void set_rc_buffer_sizes(RATE_CONTROL *rc,
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
+// TODO(angiebird): make sdx8f available for highbitdepth if needed
 #define HIGHBD_BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF) \
   cpi->fn_ptr[BT].sdf = SDF;                             \
   cpi->fn_ptr[BT].sdaf = SDAF;                           \
   cpi->fn_ptr[BT].vf = VF;                               \
   cpi->fn_ptr[BT].svf = SVF;                             \
   cpi->fn_ptr[BT].svaf = SVAF;                           \
-  cpi->fn_ptr[BT].sdx4df = SDX4DF;
+  cpi->fn_ptr[BT].sdx4df = SDX4DF;                       \
+  cpi->fn_ptr[BT].sdx8f = NULL;
 
 #define MAKE_BFP_SAD_WRAPPER(fnname)                         \
   static unsigned int fnname##_bits8(const uint8_t *src_ptr, \
@@ -2418,62 +2420,67 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf,
   CHECK_MEM_ERROR(cm, cpi->source_diff_var, vpx_calloc(cm->MBs, sizeof(diff)));
   cpi->source_var_thresh = 0;
   cpi->frames_till_next_var_check = 0;
+#define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF, SDX8F) \
+  cpi->fn_ptr[BT].sdf = SDF;                             \
+  cpi->fn_ptr[BT].sdaf = SDAF;                           \
+  cpi->fn_ptr[BT].vf = VF;                               \
+  cpi->fn_ptr[BT].svf = SVF;                             \
+  cpi->fn_ptr[BT].svaf = SVAF;                           \
+  cpi->fn_ptr[BT].sdx4df = SDX4DF;                       \
+  cpi->fn_ptr[BT].sdx8f = SDX8F;
 
-#define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF) \
-  cpi->fn_ptr[BT].sdf = SDF;                      \
-  cpi->fn_ptr[BT].sdaf = SDAF;                    \
-  cpi->fn_ptr[BT].vf = VF;                        \
-  cpi->fn_ptr[BT].svf = SVF;                      \
-  cpi->fn_ptr[BT].svaf = SVAF;                    \
-  cpi->fn_ptr[BT].sdx4df = SDX4DF;
-
+  // TODO(angiebird): make sdx8f available for every block size
   BFP(BLOCK_32X16, vpx_sad32x16, vpx_sad32x16_avg, vpx_variance32x16,
       vpx_sub_pixel_variance32x16, vpx_sub_pixel_avg_variance32x16,
-      vpx_sad32x16x4d)
+      vpx_sad32x16x4d, NULL)
 
   BFP(BLOCK_16X32, vpx_sad16x32, vpx_sad16x32_avg, vpx_variance16x32,
       vpx_sub_pixel_variance16x32, vpx_sub_pixel_avg_variance16x32,
-      vpx_sad16x32x4d)
+      vpx_sad16x32x4d, NULL)
 
   BFP(BLOCK_64X32, vpx_sad64x32, vpx_sad64x32_avg, vpx_variance64x32,
       vpx_sub_pixel_variance64x32, vpx_sub_pixel_avg_variance64x32,
-      vpx_sad64x32x4d)
+      vpx_sad64x32x4d, NULL)
 
   BFP(BLOCK_32X64, vpx_sad32x64, vpx_sad32x64_avg, vpx_variance32x64,
       vpx_sub_pixel_variance32x64, vpx_sub_pixel_avg_variance32x64,
-      vpx_sad32x64x4d)
+      vpx_sad32x64x4d, NULL)
 
   BFP(BLOCK_32X32, vpx_sad32x32, vpx_sad32x32_avg, vpx_variance32x32,
       vpx_sub_pixel_variance32x32, vpx_sub_pixel_avg_variance32x32,
-      vpx_sad32x32x4d)
+      vpx_sad32x32x4d, NULL)
 
   BFP(BLOCK_64X64, vpx_sad64x64, vpx_sad64x64_avg, vpx_variance64x64,
       vpx_sub_pixel_variance64x64, vpx_sub_pixel_avg_variance64x64,
-      vpx_sad64x64x4d)
+      vpx_sad64x64x4d, NULL)
 
   BFP(BLOCK_16X16, vpx_sad16x16, vpx_sad16x16_avg, vpx_variance16x16,
       vpx_sub_pixel_variance16x16, vpx_sub_pixel_avg_variance16x16,
-      vpx_sad16x16x4d)
+      vpx_sad16x16x4d, vpx_sad16x16x8)
 
   BFP(BLOCK_16X8, vpx_sad16x8, vpx_sad16x8_avg, vpx_variance16x8,
       vpx_sub_pixel_variance16x8, vpx_sub_pixel_avg_variance16x8,
-      vpx_sad16x8x4d)
+      vpx_sad16x8x4d, vpx_sad16x8x8)
 
   BFP(BLOCK_8X16, vpx_sad8x16, vpx_sad8x16_avg, vpx_variance8x16,
       vpx_sub_pixel_variance8x16, vpx_sub_pixel_avg_variance8x16,
-      vpx_sad8x16x4d)
+      vpx_sad8x16x4d, vpx_sad8x16x8)
 
   BFP(BLOCK_8X8, vpx_sad8x8, vpx_sad8x8_avg, vpx_variance8x8,
-      vpx_sub_pixel_variance8x8, vpx_sub_pixel_avg_variance8x8, vpx_sad8x8x4d)
+      vpx_sub_pixel_variance8x8, vpx_sub_pixel_avg_variance8x8, vpx_sad8x8x4d,
+      vpx_sad8x8x8)
 
   BFP(BLOCK_8X4, vpx_sad8x4, vpx_sad8x4_avg, vpx_variance8x4,
-      vpx_sub_pixel_variance8x4, vpx_sub_pixel_avg_variance8x4, vpx_sad8x4x4d)
+      vpx_sub_pixel_variance8x4, vpx_sub_pixel_avg_variance8x4, vpx_sad8x4x4d,
+      NULL)
 
   BFP(BLOCK_4X8, vpx_sad4x8, vpx_sad4x8_avg, vpx_variance4x8,
-      vpx_sub_pixel_variance4x8, vpx_sub_pixel_avg_variance4x8, vpx_sad4x8x4d)
+      vpx_sub_pixel_variance4x8, vpx_sub_pixel_avg_variance4x8, vpx_sad4x8x4d,
+      NULL)
 
   BFP(BLOCK_4X4, vpx_sad4x4, vpx_sad4x4_avg, vpx_variance4x4,
-      vpx_sub_pixel_variance4x4, vpx_sub_pixel_avg_variance4x4, vpx_sad4x4x4d)
+      vpx_sub_pixel_variance4x4, vpx_sub_pixel_avg_variance4x4, vpx_sad4x4x4d,
+      vpx_sad4x4x8)
 
 #if CONFIG_VP9_HIGHBITDEPTH
   highbd_set_var_fns(cpi);
diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c
index 70cbb5c25..d5707fb64 100644
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -1979,18 +1979,16 @@ static int64_t exhaustive_mesh_search_single_step(
   end_row = VPXMIN(center_mv->row + range, mv_limits->row_max);
   end_col = VPXMIN(center_mv->col + range, mv_limits->col_max);
   for (r = start_row; r <= end_row; r += 1) {
-    for (c = start_col; c <= end_col; c += 4) {
-      // 4 sads in a single call if we are checking every location
-      if (c + 3 <= end_col) {
-        unsigned int sads[4];
-        const uint8_t *addrs[4];
-        for (i = 0; i < 4; ++i) {
-          const MV mv = { r, c + i };
-          addrs[i] = get_buf_from_mv(pre, &mv);
-        }
-        fn_ptr->sdx4df(src->buf, src->stride, addrs, pre->stride, sads);
-
-        for (i = 0; i < 4; ++i) {
+    c = start_col;
+    // sdx8f may not be available for some block sizes
+    if (fn_ptr->sdx8f) {
+      while (c + 7 <= end_col) {
+        unsigned int sads[8];
+        const MV mv = { r, c };
+        const uint8_t *buf = get_buf_from_mv(pre, &mv);
+        fn_ptr->sdx8f(src->buf, src->stride, buf, pre->stride, sads);
+
+        for (i = 0; i < 8; ++i) {
           int64_t sad = (int64_t)sads[i] << LOG2_PRECISION;
           if (sad < best_sad) {
             const MV mv = { r, c + i };
@@ -2002,23 +2000,45 @@ static int64_t exhaustive_mesh_search_single_step(
             }
           }
         }
-      } else {
-        for (i = 0; i <= end_col - c; ++i) {
+        c += 8;
+      }
+    }
+    while (c + 3 <= end_col) {
+      unsigned int sads[4];
+      const uint8_t *addrs[4];
+      for (i = 0; i < 4; ++i) {
+        const MV mv = { r, c + i };
+        addrs[i] = get_buf_from_mv(pre, &mv);
+      }
+      fn_ptr->sdx4df(src->buf, src->stride, addrs, pre->stride, sads);
+
+      for (i = 0; i < 4; ++i) {
+        int64_t sad = (int64_t)sads[i] << LOG2_PRECISION;
+        if (sad < best_sad) {
           const MV mv = { r, c + i };
-          int64_t sad =
-              (int64_t)fn_ptr->sdf(src->buf, src->stride,
-                                   get_buf_from_mv(pre, &mv), pre->stride)
-              << LOG2_PRECISION;
+          sad +=
+              lambda * vp9_nb_mvs_inconsistency(&mv, nb_full_mvs, full_mv_num);
           if (sad < best_sad) {
-            sad += lambda *
-                   vp9_nb_mvs_inconsistency(&mv, nb_full_mvs, full_mv_num);
-            if (sad < best_sad) {
-              best_sad = sad;
-              *best_mv = mv;
-            }
+            best_sad = sad;
+            *best_mv = mv;
           }
         }
       }
+      c += 4;
+    }
+    while (c <= end_col) {
+      const MV mv = { r, c };
+      int64_t sad = (int64_t)fn_ptr->sdf(src->buf, src->stride,
+                                         get_buf_from_mv(pre, &mv), pre->stride)
+                    << LOG2_PRECISION;
+      if (sad < best_sad) {
+        sad += lambda * vp9_nb_mvs_inconsistency(&mv, nb_full_mvs, full_mv_num);
+        if (sad < best_sad) {
+          best_sad = sad;
+          *best_mv = mv;
+        }
+      }
+      c += 1;
     }
   }
   return best_sad;
diff --git
a/vpx_dsp/variance.h b/vpx_dsp/variance.h
index 6d0e1b8a6..bbf3e8f46 100644
--- a/vpx_dsp/variance.h
+++ b/vpx_dsp/variance.h
@@ -76,6 +76,7 @@ typedef struct vp9_variance_vtable {
   vpx_subpixvariance_fn_t svf;
   vpx_subp_avg_variance_fn_t svaf;
   vpx_sad_multi_d_fn_t sdx4df;
+  vpx_sad_multi_fn_t sdx8f;
 } vp9_variance_fn_ptr_t;
 #endif  // CONFIG_VP9
 
-- 
2.40.0
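
Illustration (not part of the commit): sdx8f reuses the existing
vpx_sad_multi_fn_t type, which takes a single reference pointer, unlike
sdx4df's array of four arbitrary pointers; that is exactly the shape the
mesh search needs, since its candidates sit at consecutive columns. A
plain-C model of what a kernel such as vpx_sad16x16x8 is assumed to
compute (a sketch, not libvpx's actual implementation):

#include <stdint.h>
#include <stdlib.h>

/* sads[i] = SAD between the 16x16 source block and the reference block
 * starting i columns to the right of ref; this matches how the mesh
 * search maps result i to the motion vector { r, c + i }. */
static void sad16x16x8_model(const uint8_t *src, int src_stride,
                             const uint8_t *ref, int ref_stride,
                             uint32_t *sads) {
  int i, row, col;
  for (i = 0; i < 8; ++i) {
    uint32_t sum = 0;
    for (row = 0; row < 16; ++row)
      for (col = 0; col < 16; ++col)
        sum += (uint32_t)abs(src[row * src_stride + col] -
                             ref[row * ref_stride + col + i]);
    sads[i] = sum;
  }
}

A vectorized kernel can reuse each loaded source row across all eight
offsets, which is presumably where the win from batching comes from.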