From: Ronald S. Bultje Date: Thu, 11 Jul 2013 20:01:44 +0000 (-0700) Subject: Inline vp9_quantize() in xform_quant(). X-Git-Tag: v1.3.0~863 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=1ff94fea5639c1c7c4bc99a080aaa985d60d25b7;p=libvpx Inline vp9_quantize() in xform_quant(). Cycle times: 4x4: 151 to 131 cycles (15% faster) 8x8: 334 to 306 cycles (9% faster) 16x16: 1401 to 1368 cycles (2.5% faster) 32x32: 7403 to 7367 cycles (0.5% faster) Total encode time of first 50 frames of bus @ 1500kbps (speed 0) goes from 1min39.2 to 1min38.6, i.e. a 0.67% overall speedup. Change-Id: I799a49460e5e3fcab01725564dd49c629bfe935f --- diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c index 713a3335f..641fc4cc8 100644 --- a/vp9/encoder/vp9_encodemb.c +++ b/vp9/encoder/vp9_encodemb.c @@ -432,48 +432,86 @@ void xform_quant(int plane, int block, BLOCK_SIZE_TYPE bsize, struct encode_b_args* const args = arg; MACROBLOCK* const x = args->x; MACROBLOCKD* const xd = &x->e_mbd; - const int bw = plane_block_width(bsize, &xd->plane[plane]); - const int raster_block = txfrm_block_to_raster_block(xd, bsize, plane, - block, ss_txfrm_size); - int16_t *const coeff = BLOCK_OFFSET(x->plane[plane].coeff, block, 16); - int16_t *const src_diff = raster_block_offset_int16(xd, bsize, plane, - raster_block, - x->plane[plane].src_diff); - TX_TYPE tx_type = DCT_DCT; + struct macroblock_plane *const p = &x->plane[plane]; + struct macroblockd_plane *const pd = &xd->plane[plane]; + int16_t *coeff = BLOCK_OFFSET(p->coeff, block, 16); + int16_t *qcoeff = BLOCK_OFFSET(pd->qcoeff, block, 16); + int16_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block, 16); + const TX_SIZE tx_size = (TX_SIZE)(ss_txfrm_size / 2); + TX_TYPE tx_type; + const int16_t *scan, *iscan; + uint16_t *eob = &pd->eobs[block]; + const int bwl = b_width_log2(bsize) - pd->subsampling_x, bw = 1 << bwl; + const int twl = bwl - tx_size, twmask = (1 << twl) - 1; + int xoff, yoff; + int16_t *src_diff; - switch (ss_txfrm_size / 2) { + switch (tx_size) { case TX_32X32: + scan = vp9_default_scan_32x32; + iscan = vp9_default_iscan_32x32; + block >>= 6; + xoff = 32 * (block & twmask); + yoff = 32 * (block >> twl); + src_diff = p->src_diff + 4 * bw * yoff + xoff; if (x->rd_search) - vp9_short_fdct32x32_rd(src_diff, coeff, bw * 2); + vp9_short_fdct32x32_rd(src_diff, coeff, bw * 8); else - vp9_short_fdct32x32(src_diff, coeff, bw * 2); + vp9_short_fdct32x32(src_diff, coeff, bw * 8); + vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round, + p->quant, p->quant_shift, qcoeff, dqcoeff, + pd->dequant, p->zbin_extra, eob, scan, iscan); break; case TX_16X16: tx_type = plane == 0 ? get_tx_type_16x16(xd) : DCT_DCT; + scan = get_scan_16x16(tx_type); + iscan = get_iscan_16x16(tx_type); + block >>= 4; + xoff = 16 * (block & twmask); + yoff = 16 * (block >> twl); + src_diff = p->src_diff + 4 * bw * yoff + xoff; if (tx_type != DCT_DCT) - vp9_short_fht16x16(src_diff, coeff, bw, tx_type); + vp9_short_fht16x16(src_diff, coeff, bw * 4, tx_type); else - x->fwd_txm16x16(src_diff, coeff, bw * 2); + x->fwd_txm16x16(src_diff, coeff, bw * 8); + vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round, + p->quant, p->quant_shift, qcoeff, dqcoeff, + pd->dequant, p->zbin_extra, eob, scan, iscan); break; case TX_8X8: tx_type = plane == 0 ? get_tx_type_8x8(xd) : DCT_DCT; + scan = get_scan_8x8(tx_type); + iscan = get_iscan_8x8(tx_type); + block >>= 2; + xoff = 8 * (block & twmask); + yoff = 8 * (block >> twl); + src_diff = p->src_diff + 4 * bw * yoff + xoff; if (tx_type != DCT_DCT) - vp9_short_fht8x8(src_diff, coeff, bw, tx_type); + vp9_short_fht8x8(src_diff, coeff, bw * 4, tx_type); else - x->fwd_txm8x8(src_diff, coeff, bw * 2); + x->fwd_txm8x8(src_diff, coeff, bw * 8); + vp9_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, + p->quant, p->quant_shift, qcoeff, dqcoeff, + pd->dequant, p->zbin_extra, eob, scan, iscan); break; case TX_4X4: - tx_type = plane == 0 ? get_tx_type_4x4(xd, raster_block) : DCT_DCT; + tx_type = plane == 0 ? get_tx_type_4x4(xd, block) : DCT_DCT; + scan = get_scan_4x4(tx_type); + iscan = get_iscan_4x4(tx_type); + xoff = 4 * (block & twmask); + yoff = 4 * (block >> twl); + src_diff = p->src_diff + 4 * bw * yoff + xoff; if (tx_type != DCT_DCT) - vp9_short_fht4x4(src_diff, coeff, bw, tx_type); + vp9_short_fht4x4(src_diff, coeff, bw * 4, tx_type); else - x->fwd_txm4x4(src_diff, coeff, bw * 2); + x->fwd_txm4x4(src_diff, coeff, bw * 8); + vp9_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, + p->quant, p->quant_shift, qcoeff, dqcoeff, + pd->dequant, p->zbin_extra, eob, scan, iscan); break; default: assert(0); } - - vp9_quantize(x, plane, block, 16 << ss_txfrm_size, tx_type); } static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize, diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c index 2d3d6bf9d..525f4da79 100644 --- a/vp9/encoder/vp9_quantize.c +++ b/vp9/encoder/vp9_quantize.c @@ -152,63 +152,6 @@ void vp9_quantize_b_32x32_c(int16_t *coeff_ptr, intptr_t n_coeffs, *eob_ptr = eob + 1; } -void vp9_quantize(MACROBLOCK *mb, int plane, int block, int n_coeffs, - TX_TYPE tx_type) { - MACROBLOCKD *const xd = &mb->e_mbd; - const int16_t *scan, *iscan; - - // These contexts may be available in the caller - switch (n_coeffs) { - case 4 * 4: - scan = get_scan_4x4(tx_type); - iscan = get_iscan_4x4(tx_type); - break; - case 8 * 8: - scan = get_scan_8x8(tx_type); - iscan = get_iscan_8x8(tx_type); - break; - case 16 * 16: - scan = get_scan_16x16(tx_type); - iscan = get_iscan_16x16(tx_type); - break; - default: - scan = vp9_default_scan_32x32; - iscan = vp9_default_iscan_32x32; - break; - } - - // Call different quantization for different transform size. - if (n_coeffs >= 1024) { - // Save index of picked coefficient in pre-scan pass. - vp9_quantize_b_32x32(BLOCK_OFFSET(mb->plane[plane].coeff, block, 16), - n_coeffs, mb->skip_block, - mb->plane[plane].zbin, - mb->plane[plane].round, - mb->plane[plane].quant, - mb->plane[plane].quant_shift, - BLOCK_OFFSET(xd->plane[plane].qcoeff, block, 16), - BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16), - xd->plane[plane].dequant, - mb->plane[plane].zbin_extra, - &xd->plane[plane].eobs[block], - scan, iscan); - } - else { - vp9_quantize_b(BLOCK_OFFSET(mb->plane[plane].coeff, block, 16), - n_coeffs, mb->skip_block, - mb->plane[plane].zbin, - mb->plane[plane].round, - mb->plane[plane].quant, - mb->plane[plane].quant_shift, - BLOCK_OFFSET(xd->plane[plane].qcoeff, block, 16), - BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16), - xd->plane[plane].dequant, - mb->plane[plane].zbin_extra, - &xd->plane[plane].eobs[block], - scan, iscan); - } -} - void vp9_regular_quantize_b_4x4(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type, int y_blocks) { MACROBLOCKD *const xd = &mb->e_mbd; diff --git a/vp9/encoder/vp9_quantize.h b/vp9/encoder/vp9_quantize.h index 2b1eeabbe..3229eaad2 100644 --- a/vp9/encoder/vp9_quantize.h +++ b/vp9/encoder/vp9_quantize.h @@ -22,9 +22,6 @@ #define prototype_quantize_mb(sym) \ void (sym)(MACROBLOCK *x) -void vp9_quantize(MACROBLOCK *mb, int plane, int block, int n_coefs, - TX_TYPE tx_type); - void vp9_regular_quantize_b_4x4_pair(MACROBLOCK *mb, int b_idx1, int b_idx2, int y_blocks); void vp9_regular_quantize_b_4x4(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type,