From: Ronald S. Bultje
Date: Fri, 28 Jun 2013 00:41:54 +0000 (-0700)
Subject: Make coefficient skip condition an explicit RD choice.
X-Git-Tag: v1.3.0~991^2
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=af660715c01fc5403700c49686885b2f0ee6133b;p=libvpx

Make coefficient skip condition an explicit RD choice.

This commit replaces zrun_zbin_boost, a mechanism that biased non-zero
coefficients following runs of zero coefficients towards being rounded to
zero, with an explicit skip-block choice in the RD loop.

The logic is basically that if individual coefficients should be rounded
towards zero (from an RD point of view), the trellis/optimize loop should
take care of that. If whole blocks should be zero (from an RD point of
view), a single RD check is much more efficient than a complete
serialization of the quantization loop.

Quality change: derf +0.5% psnr, +1.6% ssim; yt +0.6% psnr, +1.1% ssim.
SIMD for the quantize function will follow in a separate patch. Results
for other test sets are pending.

Change-Id: Ife5fa641163ac5150ac428011e87188f1937c1f4
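To make the tradeoff concrete: the mode loop now weighs coding the residual against skipping it outright, using the sum of squared coefficients as the skip distortion. A minimal C sketch of that comparison follows; rd_cost() is an illustrative stand-in for the RDCOST macro in vp9_rdopt.c (the real macro also weights distortion via rddiv), not the shipped code.

#include <stdint.h>

/* Illustrative stand-in for RDCOST: rate is weighted by the Lagrangian
 * rdmult; the real macro additionally scales distortion via rddiv. */
static int64_t rd_cost(int rdmult, int rate, int64_t dist) {
  return (((int64_t)rate * rdmult + 128) >> 8) + dist;
}

/* The explicit skip choice this patch adds: one comparison instead of
 * per-coefficient zero-run biasing. total_sse is the distortion of an
 * all-zero block, i.e. the summed squared coefficients that
 * vp9_block_error() now returns through its new *ssz argument. The
 * skip-flag signalling cost is folded into whichever path is chosen. */
static int skip_is_cheaper(int rdmult, int rate_coeffs,
                           int64_t dist_coded, int64_t total_sse) {
  return rd_cost(rdmult, 0, total_sse) <
         rd_cost(rdmult, rate_coeffs, dist_coded);
}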
---

diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index c76e8f736..bddbd49ec 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -558,7 +558,7 @@ prototype unsigned int vp9_get_mb_ss "const int16_t *"
 specialize vp9_get_mb_ss mmx sse2
 
 # ENCODEMB INVOKE
-prototype int64_t vp9_block_error "int16_t *coeff, int16_t *dqcoeff, intptr_t block_size"
+prototype int64_t vp9_block_error "int16_t *coeff, int16_t *dqcoeff, intptr_t block_size, int64_t *ssz"
 specialize vp9_block_error sse2
 
 prototype void vp9_subtract_block "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride"
diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h
index 59cc3d95c..74f61a101 100644
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -68,7 +68,6 @@ struct macroblock_plane {
   int16_t *quant;
   uint8_t *quant_shift;
   int16_t *zbin;
-  int16_t *zrun_zbin_boost;
   int16_t *round;
 
   // Zbin Over Quant value
diff --git a/vp9/encoder/vp9_onyx_int.h b/vp9/encoder/vp9_onyx_int.h
index a1f567aed..0e6c97a5e 100644
--- a/vp9/encoder/vp9_onyx_int.h
+++ b/vp9/encoder/vp9_onyx_int.h
@@ -268,11 +268,7 @@ typedef struct VP9_COMP {
   DECLARE_ALIGNED(16, unsigned char, a_quant_shift[QINDEX_RANGE][16]);
   DECLARE_ALIGNED(16, short, a_zbin[QINDEX_RANGE][16]);
   DECLARE_ALIGNED(16, short, a_round[QINDEX_RANGE][16]);
-
-  DECLARE_ALIGNED(16, short, zrun_zbin_boost_a[QINDEX_RANGE][16]);
 #endif
-  DECLARE_ALIGNED(16, short, zrun_zbin_boost_y[QINDEX_RANGE][16]);
-  DECLARE_ALIGNED(16, short, zrun_zbin_boost_uv[QINDEX_RANGE][16]);
 
   MACROBLOCK mb;
   VP9_COMMON common;
diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c
index 8deeea13d..e68a48b12 100644
--- a/vp9/encoder/vp9_quantize.c
+++ b/vp9/encoder/vp9_quantize.c
@@ -21,8 +21,7 @@
 extern int enc_debug;
 #endif
 
-static void quantize(int16_t *zbin_boost_orig_ptr,
-                     int16_t *coeff_ptr, int n_coeffs, int skip_block,
+static void quantize(int16_t *coeff_ptr, int n_coeffs, int skip_block,
                      int16_t *zbin_ptr, int16_t *round_ptr, int16_t *quant_ptr,
                      uint8_t *quant_shift_ptr,
                      int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
@@ -31,8 +30,6 @@ static void quantize(int16_t *zbin_boost_orig_ptr,
   int i, rc, eob;
   int zbins[2], nzbins[2], zbin;
   int x, y, z, sz;
-  int zero_run = 0;
-  int16_t *zbin_boost_ptr = zbin_boost_orig_ptr;
   int zero_flag = n_coeffs;
 
   vpx_memset(qcoeff_ptr, 0, n_coeffs*sizeof(int16_t));
@@ -65,8 +62,7 @@ static void quantize(int16_t *zbin_boost_orig_ptr,
       rc = scan[i];
       z = coeff_ptr[rc];
 
-      zbin = (zbins[rc != 0] + zbin_boost_ptr[zero_run]);
-      zero_run += (zero_run < 15);
+      zbin = (zbins[rc != 0]);
 
       sz = (z >> 31);                               // sign of z
       x  = (z ^ sz) - sz;
@@ -81,7 +77,6 @@ static void quantize(int16_t *zbin_boost_orig_ptr,
 
         if (y) {
           eob = i;                                  // last nonzero coeffs
-          zero_run = 0;                             // set zero_run
         }
       }
     }
@@ -90,8 +85,7 @@
 }
 
 // This function works well for large transform size.
-static void quantize_sparse(int16_t *zbin_boost_orig_ptr,
-                            int16_t *coeff_ptr, int n_coeffs, int skip_block,
+static void quantize_sparse(int16_t *coeff_ptr, int n_coeffs, int skip_block,
                             int16_t *zbin_ptr, int16_t *round_ptr,
                             int16_t *quant_ptr, uint8_t *quant_shift_ptr,
                             int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
@@ -101,10 +95,7 @@ static void quantize_sparse(int16_t *zbin_boost_orig_ptr,
   int i, rc, eob;
   int zbins[2], nzbins[2], zbin;
   int x, y, z, sz;
-  int zero_run = 0;
-  int16_t *zbin_boost_ptr = zbin_boost_orig_ptr;
   int idx = 0;
-  int pre_idx = 0;
 
   vpx_memset(qcoeff_ptr, 0, n_coeffs*sizeof(int16_t));
   vpx_memset(dqcoeff_ptr, 0, n_coeffs*sizeof(int16_t));
@@ -135,11 +126,8 @@ static void quantize_sparse(int16_t *zbin_boost_orig_ptr,
       rc = scan[idx_arr[i]];
 
       // Calculate ZBIN
-      zero_run += idx_arr[i] - pre_idx;
-      if(zero_run > 15) zero_run = 15;
-      zbin = (zbins[rc != 0] + zbin_boost_ptr[zero_run]);
+      zbin = (zbins[rc != 0]);
 
-      pre_idx = idx_arr[i];
       z = coeff_ptr[rc] * 2;
       sz = (z >> 31);                               // sign of z
       x  = (z ^ sz) - sz;                           // x = abs(z)
@@ -155,7 +143,6 @@ static void quantize_sparse(int16_t *zbin_boost_orig_ptr,
 
         if (y) {
           eob = idx_arr[i];                         // last nonzero coeffs
-          zero_run = -1;                            // set zero_run
         }
       }
     }
@@ -189,8 +176,7 @@ void vp9_quantize(MACROBLOCK *mb, int plane, int block, int n_coeffs,
     // Save index of picked coefficient in pre-scan pass.
     int idx_arr[1024];
 
-    quantize_sparse(mb->plane[plane].zrun_zbin_boost,
-                    BLOCK_OFFSET(mb->plane[plane].coeff, block, 16),
+    quantize_sparse(BLOCK_OFFSET(mb->plane[plane].coeff, block, 16),
                     n_coeffs, mb->skip_block,
                     mb->plane[plane].zbin,
                     mb->plane[plane].round,
@@ -204,8 +190,7 @@ void vp9_quantize(MACROBLOCK *mb, int plane, int block, int n_coeffs,
                     scan, idx_arr);
   }
   else {
-    quantize(mb->plane[plane].zrun_zbin_boost,
-             BLOCK_OFFSET(mb->plane[plane].coeff, block, 16),
+    quantize(BLOCK_OFFSET(mb->plane[plane].coeff, block, 16),
              n_coeffs, mb->skip_block,
             mb->plane[plane].zbin,
             mb->plane[plane].round,
@@ -226,8 +211,7 @@ void vp9_regular_quantize_b_4x4(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type,
   const struct plane_block_idx pb_idx = plane_block_idx(y_blocks, b_idx);
   const int *pt_scan = get_scan_4x4(tx_type);
 
-  quantize(mb->plane[pb_idx.plane].zrun_zbin_boost,
-           BLOCK_OFFSET(mb->plane[pb_idx.plane].coeff, pb_idx.block, 16),
+  quantize(BLOCK_OFFSET(mb->plane[pb_idx.plane].coeff, pb_idx.block, 16),
            16, mb->skip_block,
            mb->plane[pb_idx.plane].zbin,
            mb->plane[pb_idx.plane].round,
@@ -261,9 +245,6 @@ void vp9_init_quantizer(VP9_COMP *cpi) {
 #endif
   int q;
 
-  static const int zbin_boost[16] = { 0, 0, 0, 8, 8, 8, 10, 12,
-                                      14, 16, 20, 24, 28, 32, 36, 40 };
-
   for (q = 0; q < QINDEX_RANGE; q++) {
     int qzbin_factor = (vp9_dc_quant(q, 0) < 148) ? 84 : 80;
     int qrounding_factor = 48;
@@ -277,14 +258,12 @@ void vp9_init_quantizer(VP9_COMP *cpi) {
     cpi->y_zbin[q][0] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7);
     cpi->y_round[q][0] = (qrounding_factor * quant_val) >> 7;
     cpi->common.y_dequant[q][0] = quant_val;
-    cpi->zrun_zbin_boost_y[q][0] = (quant_val * zbin_boost[0]) >> 7;
 
     quant_val = vp9_dc_quant(q, cpi->common.uv_dc_delta_q);
     invert_quant(cpi->uv_quant[q] + 0, cpi->uv_quant_shift[q] + 0, quant_val);
     cpi->uv_zbin[q][0] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7);
     cpi->uv_round[q][0] = (qrounding_factor * quant_val) >> 7;
     cpi->common.uv_dequant[q][0] = quant_val;
-    cpi->zrun_zbin_boost_uv[q][0] = (quant_val * zbin_boost[0]) >> 7;
 
 #if CONFIG_ALPHA
     quant_val = vp9_dc_quant(q, cpi->common.a_dc_delta_q);
@@ -292,7 +271,6 @@ void vp9_init_quantizer(VP9_COMP *cpi) {
     cpi->a_zbin[q][0] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7);
     cpi->a_round[q][0] = (qrounding_factor * quant_val) >> 7;
     cpi->common.a_dequant[q][0] = quant_val;
-    cpi->zrun_zbin_boost_a[q][0] = (quant_val * zbin_boost[0]) >> 7;
 #endif
 
     quant_val = vp9_ac_quant(q, 0);
@@ -310,15 +288,11 @@ void vp9_init_quantizer(VP9_COMP *cpi) {
       invert_quant(cpi->y_quant[q] + rc, cpi->y_quant_shift[q] + rc, quant_val);
       cpi->y_zbin[q][rc] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7);
       cpi->y_round[q][rc] = (qrounding_factor * quant_val) >> 7;
-      cpi->zrun_zbin_boost_y[q][i] =
-          ROUND_POWER_OF_TWO(quant_val * zbin_boost[i], 7);
 
       invert_quant(cpi->uv_quant[q] + rc, cpi->uv_quant_shift[q] + rc,
                    quant_uv_val);
       cpi->uv_zbin[q][rc] = ROUND_POWER_OF_TWO(qzbin_factor * quant_uv_val, 7);
       cpi->uv_round[q][rc] = (qrounding_factor * quant_uv_val) >> 7;
-      cpi->zrun_zbin_boost_uv[q][i] =
-          ROUND_POWER_OF_TWO(quant_uv_val * zbin_boost[i], 7);
 
 #if CONFIG_ALPHA
       invert_quant(cpi->a_quant[q] + rc, cpi->a_quant_shift[q] + rc,
@@ -326,8 +300,6 @@ void vp9_init_quantizer(VP9_COMP *cpi) {
       cpi->a_zbin[q][rc] = ROUND_POWER_OF_TWO(qzbin_factor * quant_alpha_val, 7);
       cpi->a_round[q][rc] = (qrounding_factor * quant_alpha_val) >> 7;
-      cpi->zrun_zbin_boost_a[q][i] =
-          ROUND_POWER_OF_TWO(quant_alpha_val * zbin_boost[i], 7);
 #endif
     }
   }
@@ -348,7 +320,6 @@ void vp9_mb_init_quantizer(VP9_COMP *cpi, MACROBLOCK *x) {
   x->plane[0].quant_shift = cpi->y_quant_shift[qindex];
   x->plane[0].zbin = cpi->y_zbin[qindex];
   x->plane[0].round = cpi->y_round[qindex];
-  x->plane[0].zrun_zbin_boost = cpi->zrun_zbin_boost_y[qindex];
   x->plane[0].zbin_extra = (int16_t)zbin_extra;
   x->e_mbd.plane[0].dequant = cpi->common.y_dequant[qindex];
 
@@ -361,7 +332,6 @@ void vp9_mb_init_quantizer(VP9_COMP *cpi, MACROBLOCK *x) {
     x->plane[i].quant_shift = cpi->uv_quant_shift[qindex];
     x->plane[i].zbin = cpi->uv_zbin[qindex];
     x->plane[i].round = cpi->uv_round[qindex];
-    x->plane[i].zrun_zbin_boost = cpi->zrun_zbin_boost_uv[qindex];
     x->plane[i].zbin_extra = (int16_t)zbin_extra;
     x->e_mbd.plane[i].dequant = cpi->common.uv_dequant[qindex];
   }
@@ -371,7 +341,6 @@ void vp9_mb_init_quantizer(VP9_COMP *cpi, MACROBLOCK *x) {
   x->plane[3].quant_shift = cpi->a_quant_shift[qindex];
   x->plane[3].zbin = cpi->a_zbin[qindex];
   x->plane[3].round = cpi->a_round[qindex];
-  x->plane[3].zrun_zbin_boost = cpi->zrun_zbin_boost_a[qindex];
   x->plane[3].zbin_extra = (int16_t)zbin_extra;
   x->e_mbd.plane[3].dequant = cpi->common.a_dequant[qindex];
 #endif
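With the boost table gone, the scalar quantizer reduces to a plain dead-zone quantizer against a flat zero-bin. A simplified sketch of the post-patch inner loop, assuming a single multiply/shift in place of the real function's two-stage quant/quant_shift arithmetic:

#include <stdint.h>
#include <string.h>

/* Sketch of the simplified quantizer: the dead zone is now just
 * zbins[0] (DC) or zbins[1] (AC); no per-run state survives. */
static int quantize_sketch(const int16_t *coeff, int n_coeffs,
                           const int16_t *scan, const int zbins[2],
                           const int16_t *round, const int16_t *quant,
                           const int16_t *dequant,
                           int16_t *qcoeff, int16_t *dqcoeff) {
  int i, eob = -1;
  memset(qcoeff, 0, n_coeffs * sizeof(*qcoeff));
  memset(dqcoeff, 0, n_coeffs * sizeof(*dqcoeff));
  for (i = 0; i < n_coeffs; i++) {
    const int rc = scan[i];
    const int z = coeff[rc];
    const int sz = z >> 31;        /* sign of z */
    const int x = (z ^ sz) - sz;   /* abs(z) */
    if (x >= zbins[rc != 0]) {     /* flat dead zone, no zero-run boost */
      const int y = ((x + round[rc != 0]) * quant[rc != 0]) >> 16;
      qcoeff[rc] = (y ^ sz) - sz;
      dqcoeff[rc] = qcoeff[rc] * dequant[rc != 0];
      if (y) eob = i;              /* remember the last nonzero coeff */
    }
  }
  return eob + 1;
}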
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 37fc9316d..7a2ec56bb 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -283,15 +283,17 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex) {
 }
 
 int64_t vp9_block_error_c(int16_t *coeff, int16_t *dqcoeff,
-                          intptr_t block_size) {
+                          intptr_t block_size, int64_t *ssz) {
   int i;
-  int64_t error = 0;
+  int64_t error = 0, sqcoeff = 0;
 
   for (i = 0; i < block_size; i++) {
     int this_diff = coeff[i] - dqcoeff[i];
     error += (unsigned)this_diff * this_diff;
+    sqcoeff += (unsigned) coeff[i] * coeff[i];
   }
 
+  *ssz = sqcoeff;
   return error;
 }
 
@@ -501,27 +503,31 @@ static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
 }
 
 static int64_t block_error_sby(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize,
-                               int shift) {
+                               int shift, int64_t *sse) {
   struct macroblockd_plane *p = &x->e_mbd.plane[0];
   const int bw = plane_block_width(bsize, p);
   const int bh = plane_block_height(bsize, p);
-  return vp9_block_error(x->plane[0].coeff, x->e_mbd.plane[0].dqcoeff,
-                         bw * bh) >> shift;
+  int64_t e = vp9_block_error(x->plane[0].coeff, x->e_mbd.plane[0].dqcoeff,
+                              bw * bh, sse) >> shift;
+  *sse >>= shift;
+  return e;
 }
 
 static int64_t block_error_sbuv(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize,
-                                int shift) {
-  int64_t sum = 0;
+                                int shift, int64_t *sse) {
+  int64_t sum = 0, this_sse;
   int plane;
 
+  *sse = 0;
   for (plane = 1; plane < MAX_MB_PLANE; plane++) {
     struct macroblockd_plane *p = &x->e_mbd.plane[plane];
     const int bw = plane_block_width(bsize, p);
     const int bh = plane_block_height(bsize, p);
     sum += vp9_block_error(x->plane[plane].coeff,
                            x->e_mbd.plane[plane].dqcoeff,
-                           bw * bh);
+                           bw * bh, &this_sse);
+    *sse += this_sse;
   }
-
+  *sse >>= shift;
   return sum >> shift;
 }
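The reason one extra output is all the RD check needs: if a block is skipped, its dequantized coefficients are all zero, so the transform-domain error sum((coeff - dqcoeff)^2) collapses to sum(coeff^2). vp9_block_error therefore yields the skip distortion as a by-product of the pass it already makes. A small illustration (hypothetical helper, not part of the patch):

#include <stdint.h>

/* Distortion of skipping a block: with dqcoeff identically zero, the
 * block error degenerates to the coefficient energy, which is exactly
 * what vp9_block_error() now reports through *ssz. */
static int64_t skip_distortion(const int16_t *coeff, intptr_t n) {
  int64_t ssz = 0;
  intptr_t i;
  for (i = 0; i < n; i++)
    ssz += (int64_t)coeff[i] * coeff[i];
  return ssz;
}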
@@ -581,7 +587,7 @@ static int rdcost_uv(VP9_COMMON *const cm, MACROBLOCK *x,
 
 static void super_block_yrd_for_txfm(VP9_COMMON *const cm, MACROBLOCK *x,
                                      int *rate, int64_t *distortion,
-                                     int *skippable,
+                                     int *skippable, int64_t *sse,
                                      BLOCK_SIZE_TYPE bsize, TX_SIZE tx_size) {
   MACROBLOCKD *const xd = &x->e_mbd;
   xd->mode_info_context->mbmi.txfm_size = tx_size;
@@ -591,18 +597,18 @@ static void super_block_yrd_for_txfm(VP9_COMMON *const cm, MACROBLOCK *x,
   else
     vp9_xform_quant_sby(cm, x, bsize);
 
-  *distortion = block_error_sby(x, bsize, tx_size == TX_32X32 ? 0 : 2);
+  *distortion = block_error_sby(x, bsize, tx_size == TX_32X32 ? 0 : 2, sse);
   *rate = rdcost_plane(cm, x, 0, bsize, tx_size);
   *skippable = vp9_sby_is_skippable(xd, bsize);
 }
 
 static void super_block_yrd(VP9_COMP *cpi,
                             MACROBLOCK *x, int *rate, int64_t *distortion,
-                            int *skip, BLOCK_SIZE_TYPE bs,
+                            int *skip, int64_t *psse, BLOCK_SIZE_TYPE bs,
                             int64_t txfm_cache[NB_TXFM_MODES]) {
   VP9_COMMON *const cm = &cpi->common;
   int r[TX_SIZE_MAX_SB][2], s[TX_SIZE_MAX_SB];
-  int64_t d[TX_SIZE_MAX_SB];
+  int64_t d[TX_SIZE_MAX_SB], sse[TX_SIZE_MAX_SB];
   MACROBLOCKD *xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
@@ -621,25 +627,27 @@ static void super_block_yrd(VP9_COMP *cpi,
       mbmi->txfm_size = TX_4X4;
     }
     vpx_memset(txfm_cache, 0, NB_TXFM_MODES * sizeof(int64_t));
-    super_block_yrd_for_txfm(cm, x, rate, distortion, skip, bs,
+    super_block_yrd_for_txfm(cm, x, rate, distortion, skip, &sse[0], bs,
                              mbmi->txfm_size);
     return;
   }
   if (bs >= BLOCK_SIZE_SB32X32)
     super_block_yrd_for_txfm(cm, x, &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32],
-                             bs, TX_32X32);
+                             &sse[TX_32X32], bs, TX_32X32);
   if (bs >= BLOCK_SIZE_MB16X16)
     super_block_yrd_for_txfm(cm, x, &r[TX_16X16][0], &d[TX_16X16], &s[TX_16X16],
-                             bs, TX_16X16);
-  super_block_yrd_for_txfm(cm, x, &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8], bs,
-                           TX_8X8);
-  super_block_yrd_for_txfm(cm, x, &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4], bs,
-                           TX_4X4);
+                             &sse[TX_16X16], bs, TX_16X16);
+  super_block_yrd_for_txfm(cm, x, &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8],
+                           &sse[TX_8X8], bs, TX_8X8);
+  super_block_yrd_for_txfm(cm, x, &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4],
+                           &sse[TX_4X4], bs, TX_4X4);
 
   choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s, skip, txfm_cache,
                            TX_32X32 - (bs < BLOCK_SIZE_SB32X32)
                            - (bs < BLOCK_SIZE_MB16X16));
+
+  if (psse)
+    *psse = sse[mbmi->txfm_size];
 }
 
@@ -688,6 +696,8 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
 
     for (idy = 0; idy < bh; ++idy) {
       for (idx = 0; idx < bw; ++idx) {
+        int64_t ssz;
+
         block = ib + idy * 2 + idx;
         xd->mode_info_context->bmi[block].as_mode.first = mode;
         src = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, block,
@@ -718,7 +728,8 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
         ratey += cost_coeffs(cm, x, 0, block, PLANE_TYPE_Y_WITH_DC,
                              tempa + idx, templ + idy, TX_4X4, 16);
         distortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff,
-                                                          block, 16), 16) >> 2;
+                                                          block, 16),
+                                      16, &ssz) >> 2;
 
         if (best_tx_type != DCT_DCT)
           vp9_short_iht4x4_add(BLOCK_OFFSET(pd->dqcoeff, block, 16),
@@ -881,7 +892,7 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x,
     }
 
     x->e_mbd.mode_info_context->mbmi.mode = mode;
-    super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion, &s,
+    super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion, &s, NULL,
                     bsize, local_txfm_cache);
 
     this_rate = this_rate_tokenonly + bmode_costs[mode];
@@ -914,22 +925,25 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x,
 
 static void super_block_uvrd_for_txfm(VP9_COMMON *const cm, MACROBLOCK *x,
                                       int *rate, int64_t *distortion,
-                                      int *skippable, BLOCK_SIZE_TYPE bsize,
+                                      int *skippable, int64_t *sse,
+                                      BLOCK_SIZE_TYPE bsize,
                                       TX_SIZE uv_tx_size) {
   MACROBLOCKD *const xd = &x->e_mbd;
+  int64_t dummy;
   if (xd->mode_info_context->mbmi.ref_frame[0] == INTRA_FRAME)
     vp9_encode_intra_block_uv(cm, x, bsize);
   else
     vp9_xform_quant_sbuv(cm, x, bsize);
 
-  *distortion = block_error_sbuv(x, bsize, uv_tx_size == TX_32X32 ? 0 : 2);
+  *distortion = block_error_sbuv(x, bsize, uv_tx_size == TX_32X32 ? 0 : 2,
+                                 sse ? sse : &dummy);
   *rate = rdcost_uv(cm, x, bsize, uv_tx_size);
   *skippable = vp9_sbuv_is_skippable(xd, bsize);
 }
 
 static void super_block_uvrd(VP9_COMMON *const cm, MACROBLOCK *x,
                              int *rate, int64_t *distortion, int *skippable,
-                             BLOCK_SIZE_TYPE bsize) {
+                             int64_t *sse, BLOCK_SIZE_TYPE bsize) {
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
   TX_SIZE uv_txfm_size = get_uv_tx_size(mbmi);
@@ -937,7 +951,7 @@ static void super_block_uvrd(VP9_COMMON *const cm, MACROBLOCK *x,
   if (mbmi->ref_frame[0] > INTRA_FRAME)
     vp9_subtract_sbuv(x, bsize);
 
-  super_block_uvrd_for_txfm(cm, x, rate, distortion, skippable, bsize,
+  super_block_uvrd_for_txfm(cm, x, rate, distortion, skippable, sse, bsize,
                             uv_txfm_size);
 }
 
@@ -954,7 +968,7 @@ static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x,
   for (mode = DC_PRED; mode <= TM_PRED; mode++) {
     x->e_mbd.mode_info_context->mbmi.uv_mode = mode;
     super_block_uvrd(&cpi->common, x, &this_rate_tokenonly,
-                     &this_distortion, &s, bsize);
+                     &this_distortion, &s, NULL, bsize);
     this_rate = this_rate_tokenonly +
                 x->intra_uv_mode_cost[x->e_mbd.frame_type][mode];
     this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
@@ -1151,6 +1165,8 @@ static int64_t encode_inter_mb_segment(VP9_COMMON *const cm,
   k = i;
   for (idy = 0; idy < bh / 4; ++idy) {
     for (idx = 0; idx < bw / 4; ++idx) {
+      int64_t ssz;
+
       k += (idy * 2 + idx);
       src_diff = raster_block_offset_int16(xd, BLOCK_SIZE_SB8X8, 0, k,
                                            x->plane[0].src_diff);
@@ -1159,7 +1175,7 @@ static int64_t encode_inter_mb_segment(VP9_COMMON *const cm,
       x->quantize_b_4x4(x, k, DCT_DCT, 16);
       thisdistortion += vp9_block_error(coeff,
                                         BLOCK_OFFSET(xd->plane[0].dqcoeff,
-                                                     k, 16), 16);
+                                                     k, 16), 16, &ssz);
       thisrate += cost_coeffs(cm, x, 0, k, PLANE_TYPE_Y_WITH_DC,
                               ta + (k & 1),
                               tl + (k >> 1), TX_4X4, 16);
@@ -2238,7 +2254,8 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
                                  INTERPOLATIONFILTERTYPE *best_filter,
                                  int_mv *frame_mv,
                                  int mi_row, int mi_col,
-                                 int_mv single_newmv[MAX_REF_FRAMES]) {
+                                 int_mv single_newmv[MAX_REF_FRAMES],
+                                 int64_t *psse) {
   VP9_COMMON *cm = &cpi->common;
   MACROBLOCKD *xd = &x->e_mbd;
   MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
@@ -2467,17 +2484,19 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
 
   if (!x->skip) {
     int skippable_y, skippable_uv;
+    int64_t sseuv = INT_MAX;
 
     // Y cost and distortion
-    super_block_yrd(cpi, x, rate_y, distortion_y, &skippable_y,
+    super_block_yrd(cpi, x, rate_y, distortion_y, &skippable_y, psse,
                     bsize, txfm_cache);
 
     *rate2 += *rate_y;
     *distortion += *distortion_y;
 
     super_block_uvrd(cm, x, rate_uv, distortion_uv,
-                     &skippable_uv, bsize);
+                     &skippable_uv, &sseuv, bsize);
 
+    *psse += sseuv;
     *rate2 += *rate_uv;
     *distortion += *distortion_uv;
     *skippable = skippable_y && skippable_uv;
@@ -2611,6 +2630,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
   int bws = (1 << bwsl) / 4;  // mode_info step for subsize
   int bhsl = b_height_log2(bsize);
   int bhs = (1 << bhsl) / 4;  // mode_info step for subsize
+  int best_skip2 = 0;
 
   for (i = 0; i < 4; i++) {
     int j;
@@ -2702,6 +2722,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
     int skippable;
     int64_t txfm_cache[NB_TXFM_MODES];
     int i;
+    int this_skip2 = 0;
+    int64_t total_sse = INT_MAX;
 
     for (i = 0; i < NB_TXFM_MODES; ++i)
       txfm_cache[i] = INT64_MAX;
@@ -2863,7 +2885,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
         txfm_cache[i] = txfm_cache[ONLY_4X4];
     } else if (ref_frame == INTRA_FRAME) {
       TX_SIZE uv_tx;
-      super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable,
+      super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable, NULL,
                       bsize, txfm_cache);
       uv_tx = mbmi->txfm_size;
@@ -2989,7 +3011,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
                                     BLOCK_SIZE_SB8X8);
         vp9_subtract_sbuv(x, BLOCK_SIZE_SB8X8);
         super_block_uvrd_for_txfm(cm, x, &rate_uv, &distortion_uv,
-                                  &uv_skippable, BLOCK_SIZE_SB8X8, TX_4X4);
+                                  &uv_skippable, NULL, BLOCK_SIZE_SB8X8, TX_4X4);
         rate2 += rate_uv;
         distortion2 += distortion_uv;
         skippable = skippable && uv_skippable;
@@ -3017,7 +3039,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
                                   &mode_excluded, &disable_skip,
                                   &tmp_best_filter, frame_mv[this_mode],
                                   mi_row, mi_col,
-                                  single_newmv);
+                                  single_newmv, &total_sse);
       if (this_rd == INT64_MAX)
         continue;
     }
@@ -3062,10 +3084,29 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
           rate2 += prob_skip_cost;
         }
       }
+    } else if (mb_skip_allowed && ref_frame != INTRA_FRAME &&
+               this_mode != SPLITMV) {
+      if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv, distortion2) <
+          RDCOST(x->rdmult, x->rddiv, 0, total_sse)) {
+        // Add in the cost of the no skip flag.
+        int prob_skip_cost = vp9_cost_bit(vp9_get_pred_prob(cm, xd,
+                                                            PRED_MBSKIP), 0);
+        rate2 += prob_skip_cost;
+      } else {
+        int prob_skip_cost = vp9_cost_bit(vp9_get_pred_prob(cm, xd,
+                                                            PRED_MBSKIP), 1);
+        rate2 += prob_skip_cost;
+        distortion2 = total_sse;
+        assert(total_sse >= 0);
+        rate2 -= (rate_y + rate_uv);
+        rate_y = 0;
+        rate_uv = 0;
+        this_skip2 = 1;
+      }
     } else if (mb_skip_allowed) {
       // Add in the cost of the no skip flag.
       int prob_skip_cost = vp9_cost_bit(vp9_get_pred_prob(cm, xd,
-                                                          PRED_MBSKIP), 0);
+                                                         PRED_MBSKIP), 0);
       rate2 += prob_skip_cost;
     }
@@ -3119,6 +3160,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
         *returndistortion = distortion2;
         best_rd = this_rd;
         best_mbmode = *mbmi;
+        best_skip2 = this_skip2;
         best_partition = *x->partition_info;
 
         if (this_mode == I4X4_PRED || this_mode == SPLITMV)
@@ -3301,6 +3343,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
 
   // macroblock modes
   *mbmi = best_mbmode;
+  x->skip |= best_skip2;
   if (best_mbmode.ref_frame[0] == INTRA_FRAME &&
       best_mbmode.sb_type < BLOCK_SIZE_SB8X8) {
     for (i = 0; i < 4; i++)
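The SSE2 kernel below folds the ssz accumulation into the existing error loop so the coefficients are only read once. As a rough intrinsics-level rendering of the same technique (a sketch assuming x86-64, 16-byte-aligned buffers and block sizes that are multiples of eight; the shipped asm processes two registers per iteration and schedules differently):

#include <emmintrin.h>
#include <stdint.h>

static int64_t block_error_sketch(const int16_t *coeff, const int16_t *dqcoeff,
                                  intptr_t block_size, int64_t *ssz) {
  const __m128i zero = _mm_setzero_si128();
  __m128i vsse = _mm_setzero_si128();  /* 2x64-bit error accumulator */
  __m128i vssz = _mm_setzero_si128();  /* 2x64-bit coeff-energy accumulator */
  intptr_t i;
  for (i = 0; i < block_size; i += 8) {
    const __m128i c = _mm_load_si128((const __m128i *)(coeff + i));
    const __m128i d = _mm_load_si128((const __m128i *)(dqcoeff + i));
    const __m128i e = _mm_sub_epi16(d, c);
    /* pmaddwd: squares are at most 30 bits, so each pairwise sum fits
     * comfortably in a signed 32-bit lane */
    const __m128i se = _mm_madd_epi16(e, e);
    const __m128i sc = _mm_madd_epi16(c, c);
    /* zero-extend the 32-bit partial sums and accumulate in 64 bits */
    vsse = _mm_add_epi64(vsse, _mm_unpacklo_epi32(se, zero));
    vsse = _mm_add_epi64(vsse, _mm_unpackhi_epi32(se, zero));
    vssz = _mm_add_epi64(vssz, _mm_unpacklo_epi32(sc, zero));
    vssz = _mm_add_epi64(vssz, _mm_unpackhi_epi32(sc, zero));
  }
  /* horizontal add of the two 64-bit lanes */
  vsse = _mm_add_epi64(vsse, _mm_srli_si128(vsse, 8));
  vssz = _mm_add_epi64(vssz, _mm_srli_si128(vssz, 8));
  *ssz = _mm_cvtsi128_si64(vssz);
  return _mm_cvtsi128_si64(vsse);
}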
diff --git a/vp9/encoder/x86/vp9_error_sse2.asm b/vp9/encoder/x86/vp9_error_sse2.asm
index bb1ea71b9..1126fdb61 100644
--- a/vp9/encoder/x86/vp9_error_sse2.asm
+++ b/vp9/encoder/x86/vp9_error_sse2.asm
@@ -12,45 +12,62 @@
 
 SECTION .text
 
-; void vp9_block_error(int16_t *coeff, int16_t *dqcoeff, intptr_t block_size)
+; int64_t vp9_block_error(int16_t *coeff, int16_t *dqcoeff, intptr_t block_size,
+;                         int64_t *ssz)
 
 INIT_XMM sse2
-cglobal block_error, 3, 3, 6, uqc, dqc, size
-  pxor      m4, m4                 ; accumulator
+cglobal block_error, 3, 3, 8, uqc, dqc, size, ssz
+  pxor      m4, m4                 ; sse accumulator
+  pxor      m6, m6                 ; ssz accumulator
   pxor      m5, m5                 ; dedicated zero register
   lea     uqcq, [uqcq+sizeq*2]
   lea     dqcq, [dqcq+sizeq*2]
   neg    sizeq
 .loop:
-  mova      m0, [uqcq+sizeq*2]
-  mova      m2, [dqcq+sizeq*2]
-  mova      m1, [uqcq+sizeq*2+mmsize]
-  mova      m3, [dqcq+sizeq*2+mmsize]
+  mova      m2, [uqcq+sizeq*2]
+  mova      m0, [dqcq+sizeq*2]
+  mova      m3, [uqcq+sizeq*2+mmsize]
+  mova      m1, [dqcq+sizeq*2+mmsize]
   psubw     m0, m2
   psubw     m1, m3
   ; individual errors are max. 15bit+sign, so squares are 30bit, and
   ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
   pmaddwd   m0, m0
   pmaddwd   m1, m1
+  pmaddwd   m2, m2
+  pmaddwd   m3, m3
   ; accumulate in 64bit
-  punpckldq m2, m0, m5
+  punpckldq m7, m0, m5
   punpckhdq m0, m5
-  punpckldq m3, m1, m5
-  punpckhdq m1, m5
-  paddq     m4, m2
+  paddq     m4, m7
+  punpckldq m7, m1, m5
   paddq     m4, m0
-  paddq     m4, m3
+  punpckhdq m1, m5
+  paddq     m4, m7
+  punpckldq m7, m2, m5
   paddq     m4, m1
+  punpckhdq m2, m5
+  paddq     m6, m7
+  punpckldq m7, m3, m5
+  paddq     m6, m2
+  punpckhdq m3, m5
+  paddq     m6, m7
+  paddq     m6, m3
   add    sizeq, mmsize
   jl .loop
 
   ; accumulate horizontally and store in return value
   movhlps   m5, m4
+  movhlps   m7, m6
   paddq     m4, m5
+  paddq     m6, m7
 %if ARCH_X86_64
   movq    rax, m4
+  movq [sszq], m6
 %else
+  mov     eax, sszm
   pshufd   m5, m4, 0x1
+  movq  [eax], m6
   movd    eax, m4
   movd    edx, m5
 %endif
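A minimal usage sketch of the revised interface (test scaffolding only; the arrays are hypothetical and padded with zeroes to the full block size):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int64_t vp9_block_error_c(int16_t *coeff, int16_t *dqcoeff,
                          intptr_t block_size, int64_t *ssz);

int main(void) {
  int16_t coeff[16]   = { 120, -34, 7, 3, 1 };  /* remaining entries zero */
  int16_t dqcoeff[16] = { 118, -32, 0, 0, 0 };
  int64_t ssz;
  const int64_t err = vp9_block_error_c(coeff, dqcoeff, 16, &ssz);
  /* err: distortion if the block is coded as dqcoeff;
   * ssz: distortion if the block is skipped (all coefficients dropped). */
  printf("coded: %" PRId64 ", skipped: %" PRId64 "\n", err, ssz);
  return 0;
}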