From 1470529f62729b99d623e3c2f36a588ceb85ddb7 Mon Sep 17 00:00:00 2001
From: Jingning Han
Date: Tue, 31 Mar 2015 17:46:41 -0700
Subject: [PATCH] Refactor block_yrd function for RTC coding mode

This commit separates the Hadamard transform/quantization operations
from the rate and distortion computation in block_yrd. This allows the
encoder to skip the SATD computation when all transform blocks are
quantized to zero. It also uses a new block error function that skips
the repeated computation of the sum of squared residuals. This reduces
the CPU cycles spent on block error calculation in block_yrd by 40%.

Change-Id: I726acb2454b44af1c3bd95385abecac209959b10
---
 vp9/common/vp9_rtcd_defs.pl        |  3 ++
 vp9/encoder/vp9_pickmode.c         | 59 ++++++++++++++++++++----------
 vp9/encoder/vp9_rdopt.c            | 12 ++++++
 vp9/encoder/x86/vp9_error_sse2.asm | 46 +++++++++++++++++++++++
 4 files changed, 101 insertions(+), 19 deletions(-)

diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index 887f407ba..b1a33f5a0 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -1171,6 +1171,9 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
   specialize qw/vp9_block_error avx2/, "$sse2_x86inc";
 
+  add_proto qw/int64_t vp9_block_error_fp/, "const int16_t *coeff, const int16_t *dqcoeff, int block_size";
+  specialize qw/vp9_block_error_fp sse2/;
+
   add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
   specialize qw/vp9_quantize_fp neon sse2/, "$ssse3_x86_64";
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index f8a5e6ae7..e26b4c09f 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -315,6 +315,20 @@ static void model_rd_for_sb_y(VP9_COMP *cpi, BLOCK_SIZE bsize,
   *out_dist_sum += dist << 4;
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *dist,
+                      int *skippable, int64_t *sse, int plane,
+                      BLOCK_SIZE bsize, TX_SIZE tx_size) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  unsigned int var_y, sse_y;
+  (void)plane;
+  (void)tx_size;
+  model_rd_for_sb_y(cpi, bsize, x, xd, rate, dist, &var_y, &sse_y);
+  *sse = INT_MAX;
+  *skippable = 0;
+  return;
+}
+#else
 static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *dist,
                       int *skippable, int64_t *sse, int plane,
                       BLOCK_SIZE bsize, TX_SIZE tx_size) {
@@ -332,23 +346,9 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *dist,
   const int max_blocks_high = num_4x4_h + (xd->mb_to_bottom_edge >= 0 ? 0 :
       xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
 
-#if CONFIG_VP9_HIGHBITDEPTH
-  unsigned int var_y, sse_y;
-  model_rd_for_sb_y(cpi, bsize, x, xd, rate, dist, &var_y, &sse_y);
-  *sse = INT_MAX;
-  *skippable = 0;
-  return;
-#else
   (void)cpi;
-#endif
-
   vp9_subtract_plane(x, bsize, plane);
-
   *skippable = 1;
-  *rate = 0;
-  *dist = 0;
-  *sse = 0;
-
   // Keep track of the row and column of the blocks we use so that we know
   // if we are in the unrestricted motion border.
   for (r = 0; r < max_blocks_high; r += block_step) {
@@ -362,7 +362,6 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *dist,
         const int diff_stride = 4 * num_4x4_blocks_wide_lookup[bsize];
         int i, j;
         const int16_t *src_diff;
-        int64_t this_sse;
 
         txfrm_block_to_raster_xy(bsize, tx_size, block, &i, &j);
         src_diff = &p->src_diff[4 * (j * diff_stride + i)];
@@ -399,16 +398,36 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *dist,
             assert(0);
             break;
         }
+        *skippable &= (*eob == 0);
+      }
+      block += step;
+    }
+  }
+
+  if (*skippable && *sse < INT64_MAX) {
+    *dist = (*sse << 6) >> shift;
+    *sse = *dist;
+    return;
+  }
 
-        *dist += vp9_block_error(coeff, dqcoeff, step << 4, &this_sse) >> shift;
+  block = 0;
+  *rate = 0;
+  *dist = 0;
+  *sse = (*sse << 6) >> shift;
+  for (r = 0; r < max_blocks_high; r += block_step) {
+    for (c = 0; c < num_4x4_w; c += block_step) {
+      if (c < max_blocks_wide) {
+        tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+        tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+        tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+        uint16_t *const eob = &p->eobs[block];
 
         if (*eob == 1)
           *rate += (int)abs(qcoeff[0]);
         else if (*eob > 1)
           *rate += (int)vp9_satd((const int16_t *)qcoeff, step << 4);
 
-        *sse += (this_sse >> shift);
-        *skippable &= (*eob == 0);
+        *dist += vp9_block_error_fp(coeff, dqcoeff, step << 4) >> shift;
       }
       block += step;
     }
@@ -417,6 +436,7 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *dist,
   *rate <<= 8;
   *rate *= 6;
 }
+#endif
 
 static void model_rd_for_sb_uv(VP9_COMP *cpi, BLOCK_SIZE bsize,
                                MACROBLOCK *x, MACROBLOCKD *xd,
@@ -624,7 +644,7 @@ static void estimate_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
   int i, j;
   int rate;
   int64_t dist;
-  int64_t this_sse;
+  int64_t this_sse = INT64_MAX;
   int is_skippable;
 
   txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
@@ -1074,6 +1094,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
     }
 
     if (bsize <= BLOCK_16X16) {
+      this_sse = (int64_t)sse_y;
       block_yrd(cpi, x, &this_rdc.rate, &this_rdc.dist, &is_skippable,
                 &this_sse, 0, bsize, mbmi->tx_size);
       x->skip_txfm[0] = is_skippable;
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index e8c3f767e..166535b8b 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -292,6 +292,18 @@ int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff,
   return error;
 }
 
+int64_t vp9_block_error_fp_c(const int16_t *coeff, const int16_t *dqcoeff,
+                             int block_size) {
+  int i;
+  int64_t error = 0;
+
+  for (i = 0; i < block_size; i++) {
+    const int diff = coeff[i] - dqcoeff[i];
+    error += diff * diff;
+  }
+
+  return error;
+}
 
 #if CONFIG_VP9_HIGHBITDEPTH
 int64_t vp9_highbd_block_error_c(const tran_low_t *coeff,
diff --git a/vp9/encoder/x86/vp9_error_sse2.asm b/vp9/encoder/x86/vp9_error_sse2.asm
index 1126fdb61..318379777 100644
--- a/vp9/encoder/x86/vp9_error_sse2.asm
+++ b/vp9/encoder/x86/vp9_error_sse2.asm
@@ -72,3 +72,49 @@ cglobal block_error, 3, 3, 8, uqc, dqc, size, ssz
   movd    edx, m5
 %endif
   RET
+
+; Compute the sum of squared differences between two int16_t vectors.
+; int64_t vp9_block_error_fp(const int16_t *coeff, const int16_t *dqcoeff,
+;                            int block_size)
+
+INIT_XMM sse2
+cglobal block_error_fp, 3, 3, 8, uqc, dqc, size
+  pxor      m4, m4                 ; sse accumulator
+  pxor      m5, m5                 ; dedicated zero register
+  lea     uqcq, [uqcq+sizeq*2]
+  lea     dqcq, [dqcq+sizeq*2]
+  neg    sizeq
+.loop:
+  mova      m2, [uqcq+sizeq*2]
+  mova      m0, [dqcq+sizeq*2]
+  mova      m3, [uqcq+sizeq*2+mmsize]
+  mova      m1, [dqcq+sizeq*2+mmsize]
+  psubw     m0, m2
+  psubw     m1, m3
+  ; individual errors are max. 15bit+sign, so squares are 30bit, and
+  ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
+  pmaddwd   m0, m0
+  pmaddwd   m1, m1
+  ; accumulate in 64bit
+  punpckldq m7, m0, m5
+  punpckhdq m0, m5
+  paddq     m4, m7
+  punpckldq m7, m1, m5
+  paddq     m4, m0
+  punpckhdq m1, m5
+  paddq     m4, m7
+  paddq     m4, m1
+  add    sizeq, mmsize
+  jl .loop
+
+  ; accumulate horizontally and store in return value
+  movhlps   m5, m4
+  paddq     m4, m5
+%if ARCH_X86_64
+  movq    rax, m4
+%else
+  pshufd    m5, m4, 0x1
+  movd    eax, m4
+  movd    edx, m5
+%endif
+  RET
-- 
2.40.0
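
Note (editorial, not part of the patch): the refactored low-bitdepth
block_yrd() above now makes two passes over the transform blocks, with an
early exit in between. Below is a minimal, self-contained C sketch of that
control flow. TxBlock, transform_and_quantize(), and block_yrd_sketch() are
hypothetical stand-ins for the encoder's real data structures and its
Hadamard/vp9_quantize_fp step; satd() and block_error_fp() follow the C
reference implementations of vp9_satd and vp9_block_error_fp.

#include <stdint.h>
#include <stdlib.h>

/* Hypothetical container for one transform block's coefficient buffers. */
typedef struct {
  int16_t coeff[256];   /* transform coefficients of the source difference */
  int16_t qcoeff[256];  /* quantized coefficients */
  int16_t dqcoeff[256]; /* dequantized coefficients */
  uint16_t eob;         /* end of block; 0 means the block is all zero */
} TxBlock;

/* Hypothetical stand-in for the Hadamard transform + vp9_quantize_fp step;
 * a real implementation fills qcoeff/dqcoeff and sets eob. */
static void transform_and_quantize(TxBlock *b) { b->eob = 0; }

/* Sum of absolute quantized coefficients, as in vp9_satd_c. */
static int satd(const int16_t *coeff, int length) {
  int i, total = 0;
  for (i = 0; i < length; ++i) total += abs(coeff[i]);
  return total;
}

/* Coefficient-domain squared error, as in vp9_block_error_fp_c. */
static int64_t block_error_fp(const int16_t *coeff, const int16_t *dqcoeff,
                              int block_size) {
  int i;
  int64_t error = 0;
  for (i = 0; i < block_size; i++) {
    const int diff = coeff[i] - dqcoeff[i];
    error += diff * diff;
  }
  return error;
}

/* Two-pass structure of the refactored block_yrd. On entry *sse holds the
 * pixel-domain SSE of the whole block, or INT64_MAX if it is unknown. */
static void block_yrd_sketch(TxBlock *blocks, int n, int block_size, int shift,
                             int *rate, int64_t *dist, int *skippable,
                             int64_t *sse) {
  int i;

  /* Pass 1: transform + quantize only; track whether everything is zero. */
  *skippable = 1;
  for (i = 0; i < n; ++i) {
    transform_and_quantize(&blocks[i]);
    *skippable &= (blocks[i].eob == 0);
  }

  /* Early exit: every block quantized to zero, so the distortion is just
   * the rescaled source SSE and the SATD/block-error pass is skipped. */
  if (*skippable && *sse < INT64_MAX) {
    *dist = (*sse << 6) >> shift;
    *sse = *dist;
    return;
  }

  /* Pass 2: rate from the quantized coefficients, distortion from the
   * coefficient-domain error; *sse keeps the rescaled source SSE. */
  *rate = 0;
  *dist = 0;
  *sse = (*sse << 6) >> shift;
  for (i = 0; i < n; ++i) {
    if (blocks[i].eob == 1)
      *rate += abs(blocks[i].qcoeff[0]);
    else if (blocks[i].eob > 1)
      *rate += satd(blocks[i].qcoeff, block_size);
    *dist += block_error_fp(blocks[i].coeff, blocks[i].dqcoeff, block_size)
             >> shift;
  }
}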
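
A worked check of the overflow comment inside the new SSE2 loop (also
editorial): assuming, as the comment does, that each coefficient difference
fits in 15 bits plus sign, |diff| <= 2^15 - 1, so diff^2 <= (2^15 - 1)^2 =
2^30 - 2^17 + 1. pmaddwd folds two adjacent squares into one 32-bit lane,
giving at most 2^31 - 2^18 + 2, which is below the signed 32-bit limit and
therefore cannot overflow the lane. The punpckldq/punpckhdq pairs against
the zero register then widen each non-negative 32-bit partial sum to 64 bits
before the paddq accumulation, so the running total in m4 stays exact for
any block size used here.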