From: Ronald S. Bultje <rbultje@google.com>
Date: Thu, 11 Jul 2013 20:01:44 +0000 (-0700)
Subject: Inline vp9_quantize() in xform_quant().
X-Git-Tag: v1.3.0~863
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=1ff94fea5639c1c7c4bc99a080aaa985d60d25b7;p=libvpx

Inline vp9_quantize() in xform_quant().

Cycle times:
4x4:    151 to  131 cycles (15% faster)
8x8:    334 to  306 cycles (9% faster)
16x16: 1401 to 1368 cycles (2.5% faster)
32x32: 7403 to 7367 cycles (0.5% faster)

Total encode time of the first 50 frames of bus @ 1500kbps (speed 0)
goes from 1min39.2s to 1min38.6s, i.e. a 0.67% overall speedup.

Change-Id: I799a49460e5e3fcab01725564dd49c629bfe935f
---

diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index 713a3335f..641fc4cc8 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -432,48 +432,86 @@ void xform_quant(int plane, int block, BLOCK_SIZE_TYPE bsize,
   struct encode_b_args* const args = arg;
   MACROBLOCK* const x = args->x;
   MACROBLOCKD* const xd = &x->e_mbd;
-  const int bw = plane_block_width(bsize, &xd->plane[plane]);
-  const int raster_block = txfrm_block_to_raster_block(xd, bsize, plane,
-                                                       block, ss_txfrm_size);
-  int16_t *const coeff = BLOCK_OFFSET(x->plane[plane].coeff, block, 16);
-  int16_t *const src_diff = raster_block_offset_int16(xd, bsize, plane,
-                                                      raster_block,
-                                                      x->plane[plane].src_diff);
-  TX_TYPE tx_type = DCT_DCT;
+  struct macroblock_plane *const p = &x->plane[plane];
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  int16_t *coeff = BLOCK_OFFSET(p->coeff, block, 16);
+  int16_t *qcoeff = BLOCK_OFFSET(pd->qcoeff, block, 16);
+  int16_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block, 16);
+  const TX_SIZE tx_size = (TX_SIZE)(ss_txfrm_size / 2);
+  TX_TYPE tx_type;
+  const int16_t *scan, *iscan;
+  uint16_t *eob = &pd->eobs[block];
+  const int bwl = b_width_log2(bsize) - pd->subsampling_x, bw = 1 << bwl;
+  const int twl = bwl - tx_size, twmask = (1 << twl) - 1;
+  int xoff, yoff;
+  int16_t *src_diff;
 
-  switch (ss_txfrm_size / 2) {
+  switch (tx_size) {
     case TX_32X32:
+      scan = vp9_default_scan_32x32;
+      iscan = vp9_default_iscan_32x32;
+      block >>= 6;
+      xoff = 32 * (block & twmask);
+      yoff = 32 * (block >> twl);
+      src_diff = p->src_diff + 4 * bw * yoff + xoff;
       if (x->rd_search)
-        vp9_short_fdct32x32_rd(src_diff, coeff, bw * 2);
+        vp9_short_fdct32x32_rd(src_diff, coeff, bw * 8);
       else
-        vp9_short_fdct32x32(src_diff, coeff, bw * 2);
+        vp9_short_fdct32x32(src_diff, coeff, bw * 8);
+      vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round,
+                           p->quant, p->quant_shift, qcoeff, dqcoeff,
+                           pd->dequant, p->zbin_extra, eob, scan, iscan);
       break;
     case TX_16X16:
       tx_type = plane == 0 ? get_tx_type_16x16(xd) : DCT_DCT;
+      scan = get_scan_16x16(tx_type);
+      iscan = get_iscan_16x16(tx_type);
+      block >>= 4;
+      xoff = 16 * (block & twmask);
+      yoff = 16 * (block >> twl);
+      src_diff = p->src_diff + 4 * bw * yoff + xoff;
       if (tx_type != DCT_DCT)
-        vp9_short_fht16x16(src_diff, coeff, bw, tx_type);
+        vp9_short_fht16x16(src_diff, coeff, bw * 4, tx_type);
       else
-        x->fwd_txm16x16(src_diff, coeff, bw * 2);
+        x->fwd_txm16x16(src_diff, coeff, bw * 8);
+      vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
+                     p->quant, p->quant_shift, qcoeff, dqcoeff,
+                     pd->dequant, p->zbin_extra, eob, scan, iscan);
       break;
     case TX_8X8:
       tx_type = plane == 0 ? get_tx_type_8x8(xd) : DCT_DCT;
+      scan = get_scan_8x8(tx_type);
+      iscan = get_iscan_8x8(tx_type);
+      block >>= 2;
+      xoff = 8 * (block & twmask);
+      yoff = 8 * (block >> twl);
+      src_diff = p->src_diff + 4 * bw * yoff + xoff;
       if (tx_type != DCT_DCT)
-        vp9_short_fht8x8(src_diff, coeff, bw, tx_type);
+        vp9_short_fht8x8(src_diff, coeff, bw * 4, tx_type);
       else
-        x->fwd_txm8x8(src_diff, coeff, bw * 2);
+        x->fwd_txm8x8(src_diff, coeff, bw * 8);
+      vp9_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round,
+                     p->quant, p->quant_shift, qcoeff, dqcoeff,
+                     pd->dequant, p->zbin_extra, eob, scan, iscan);
       break;
     case TX_4X4:
-      tx_type = plane == 0 ? get_tx_type_4x4(xd, raster_block) : DCT_DCT;
+      tx_type = plane == 0 ? get_tx_type_4x4(xd, block) : DCT_DCT;
+      scan = get_scan_4x4(tx_type);
+      iscan = get_iscan_4x4(tx_type);
+      xoff = 4 * (block & twmask);
+      yoff = 4 * (block >> twl);
+      src_diff = p->src_diff + 4 * bw * yoff + xoff;
       if (tx_type != DCT_DCT)
-        vp9_short_fht4x4(src_diff, coeff, bw, tx_type);
+        vp9_short_fht4x4(src_diff, coeff, bw * 4, tx_type);
       else
-        x->fwd_txm4x4(src_diff, coeff, bw * 2);
+        x->fwd_txm4x4(src_diff, coeff, bw * 8);
+      vp9_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round,
+                     p->quant, p->quant_shift, qcoeff, dqcoeff,
+                     pd->dequant, p->zbin_extra, eob, scan, iscan);
       break;
     default:
       assert(0);
   }
-
-  vp9_quantize(x, plane, block, 16 << ss_txfrm_size, tx_type);
 }
 
 static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c
index 2d3d6bf9d..525f4da79 100644
--- a/vp9/encoder/vp9_quantize.c
+++ b/vp9/encoder/vp9_quantize.c
@@ -152,63 +152,6 @@ void vp9_quantize_b_32x32_c(int16_t *coeff_ptr, intptr_t n_coeffs,
   *eob_ptr = eob + 1;
 }
 
-void vp9_quantize(MACROBLOCK *mb, int plane, int block, int n_coeffs,
-                  TX_TYPE tx_type) {
-  MACROBLOCKD *const xd = &mb->e_mbd;
-  const int16_t *scan, *iscan;
-
-  // These contexts may be available in the caller
-  switch (n_coeffs) {
-    case 4 * 4:
-      scan = get_scan_4x4(tx_type);
-      iscan = get_iscan_4x4(tx_type);
-      break;
-    case 8 * 8:
-      scan = get_scan_8x8(tx_type);
-      iscan = get_iscan_8x8(tx_type);
-      break;
-    case 16 * 16:
-      scan = get_scan_16x16(tx_type);
-      iscan = get_iscan_16x16(tx_type);
-      break;
-    default:
-      scan = vp9_default_scan_32x32;
-      iscan = vp9_default_iscan_32x32;
-      break;
-  }
-
-  // Call different quantization for different transform size.
-  if (n_coeffs >= 1024) {
-    // Save index of picked coefficient in pre-scan pass.
-    vp9_quantize_b_32x32(BLOCK_OFFSET(mb->plane[plane].coeff, block, 16),
-                         n_coeffs, mb->skip_block,
-                         mb->plane[plane].zbin,
-                         mb->plane[plane].round,
-                         mb->plane[plane].quant,
-                         mb->plane[plane].quant_shift,
-                         BLOCK_OFFSET(xd->plane[plane].qcoeff, block, 16),
-                         BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
-                         xd->plane[plane].dequant,
-                         mb->plane[plane].zbin_extra,
-                         &xd->plane[plane].eobs[block],
-                         scan, iscan);
-  }
-  else {
-    vp9_quantize_b(BLOCK_OFFSET(mb->plane[plane].coeff, block, 16),
-                   n_coeffs, mb->skip_block,
-                   mb->plane[plane].zbin,
-                   mb->plane[plane].round,
-                   mb->plane[plane].quant,
-                   mb->plane[plane].quant_shift,
-                   BLOCK_OFFSET(xd->plane[plane].qcoeff, block, 16),
-                   BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
-                   xd->plane[plane].dequant,
-                   mb->plane[plane].zbin_extra,
-                   &xd->plane[plane].eobs[block],
-                   scan, iscan);
-  }
-}
-
 void vp9_regular_quantize_b_4x4(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type,
                                 int y_blocks) {
   MACROBLOCKD *const xd = &mb->e_mbd;
diff --git a/vp9/encoder/vp9_quantize.h b/vp9/encoder/vp9_quantize.h
index 2b1eeabbe..3229eaad2 100644
--- a/vp9/encoder/vp9_quantize.h
+++ b/vp9/encoder/vp9_quantize.h
@@ -22,9 +22,6 @@
 #define prototype_quantize_mb(sym) \
   void (sym)(MACROBLOCK *x)
 
-void vp9_quantize(MACROBLOCK *mb, int plane, int block, int n_coefs,
-                  TX_TYPE tx_type);
-
 void vp9_regular_quantize_b_4x4_pair(MACROBLOCK *mb, int b_idx1, int b_idx2,
                                      int y_blocks);
 void vp9_regular_quantize_b_4x4(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type,