From 1470529f62729b99d623e3c2f36a588ceb85ddb7 Mon Sep 17 00:00:00 2001
From: Jingning Han
Date: Tue, 31 Mar 2015 17:46:41 -0700
Subject: [PATCH] Refactor block_yrd function for RTC coding mode

This commit separates the Hadamard transform/quantization operations
from the rate and distortion computation in block_yrd. This allows the
encoder to skip the SATD computation when all transform blocks are
quantized to zero. It also uses a new block error function that skips
the repeated computation of the sum of squared residuals. This reduces
the CPU cycles spent on block error calculation in block_yrd by 40%.

Change-Id: I726acb2454b44af1c3bd95385abecac209959b10
---
 vp9/common/vp9_rtcd_defs.pl        |  3 ++
 vp9/encoder/vp9_pickmode.c         | 59 ++++++++++++++++++++----------
 vp9/encoder/vp9_rdopt.c            | 12 ++++++
 vp9/encoder/x86/vp9_error_sse2.asm | 46 +++++++++++++++++++++++
 4 files changed, 101 insertions(+), 19 deletions(-)

diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index 887f407ba..b1a33f5a0 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -1171,6 +1171,9 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
   specialize qw/vp9_block_error avx2/, "$sse2_x86inc";
 
+  add_proto qw/int64_t vp9_block_error_fp/, "const int16_t *coeff, const int16_t *dqcoeff, int block_size";
+  specialize qw/vp9_block_error_fp sse2/;
+
   add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
   specialize qw/vp9_quantize_fp neon sse2/, "$ssse3_x86_64";
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index f8a5e6ae7..e26b4c09f 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -315,6 +315,20 @@ static void model_rd_for_sb_y(VP9_COMP *cpi, BLOCK_SIZE bsize,
   *out_dist_sum += dist << 4;
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *dist,
+                      int *skippable, int64_t *sse, int plane,
+                      BLOCK_SIZE bsize, TX_SIZE tx_size) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  unsigned int var_y, sse_y;
+  (void)plane;
+  (void)tx_size;
+  model_rd_for_sb_y(cpi, bsize, x, xd, rate, dist, &var_y, &sse_y);
+  *sse = INT_MAX;
+  *skippable = 0;
+  return;
+}
+#else
 static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *dist,
                       int *skippable, int64_t *sse, int plane,
                       BLOCK_SIZE bsize, TX_SIZE tx_size) {
@@ -332,23 +346,9 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *dist,
   const int max_blocks_high = num_4x4_h + (xd->mb_to_bottom_edge >= 0 ? 0 :
       xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
 
-#if CONFIG_VP9_HIGHBITDEPTH
-  unsigned int var_y, sse_y;
-  model_rd_for_sb_y(cpi, bsize, x, xd, rate, dist, &var_y, &sse_y);
-  *sse = INT_MAX;
-  *skippable = 0;
-  return;
-#else
   (void)cpi;
-#endif
-
   vp9_subtract_plane(x, bsize, plane);
-
   *skippable = 1;
-  *rate = 0;
-  *dist = 0;
-  *sse = 0;
-
   // Keep track of the row and column of the blocks we use so that we know
   // if we are in the unrestricted motion border.
   for (r = 0; r < max_blocks_high; r += block_step) {
@@ -362,7 +362,6 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *dist,
         const int diff_stride = 4 * num_4x4_blocks_wide_lookup[bsize];
         int i, j;
         const int16_t *src_diff;
-        int64_t this_sse;
 
         txfrm_block_to_raster_xy(bsize, tx_size, block, &i, &j);
         src_diff = &p->src_diff[4 * (j * diff_stride + i)];
@@ -399,16 +398,36 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *dist,
             assert(0);
             break;
         }
+        *skippable &= (*eob == 0);
+      }
+      block += step;
+    }
+  }
+
+  if (*skippable && *sse < INT64_MAX) {
+    *dist = (*sse << 6) >> shift;
+    *sse = *dist;
+    return;
+  }
 
-        *dist += vp9_block_error(coeff, dqcoeff, step << 4, &this_sse) >> shift;
+  block = 0;
+  *rate = 0;
+  *dist = 0;
+  *sse = (*sse << 6) >> shift;
+  for (r = 0; r < max_blocks_high; r += block_step) {
+    for (c = 0; c < num_4x4_w; c += block_step) {
+      if (c < max_blocks_wide) {
+        tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+        tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+        tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+        uint16_t *const eob = &p->eobs[block];
 
         if (*eob == 1)
           *rate += (int)abs(qcoeff[0]);
         else if (*eob > 1)
           *rate += (int)vp9_satd((const int16_t *)qcoeff, step << 4);
 
-        *sse += (this_sse >> shift);
-        *skippable &= (*eob == 0);
+        *dist += vp9_block_error_fp(coeff, dqcoeff, step << 4) >> shift;
       }
       block += step;
     }
@@ -417,6 +436,7 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *dist,
   *rate <<= 8;
   *rate *= 6;
 }
+#endif
 
 static void model_rd_for_sb_uv(VP9_COMP *cpi, BLOCK_SIZE bsize,
                                MACROBLOCK *x, MACROBLOCKD *xd,
@@ -624,7 +644,7 @@ static void estimate_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
   int i, j;
   int rate;
   int64_t dist;
-  int64_t this_sse;
+  int64_t this_sse = INT64_MAX;
   int is_skippable;
 
   txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
@@ -1074,6 +1094,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
     }
 
     if (bsize <= BLOCK_16X16) {
+      this_sse = (int64_t)sse_y;
       block_yrd(cpi, x, &this_rdc.rate, &this_rdc.dist, &is_skippable,
                 &this_sse, 0, bsize, mbmi->tx_size);
       x->skip_txfm[0] = is_skippable;
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index e8c3f767e..166535b8b 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -292,6 +292,18 @@ int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff,
   return error;
 }
 
+int64_t vp9_block_error_fp_c(const int16_t *coeff, const int16_t *dqcoeff,
+                             int block_size) {
+  int i;
+  int64_t error = 0;
+
+  for (i = 0; i < block_size; i++) {
+    const int diff = coeff[i] - dqcoeff[i];
+    error += diff * diff;
+  }
+
+  return error;
+}
 
 #if CONFIG_VP9_HIGHBITDEPTH
 int64_t vp9_highbd_block_error_c(const tran_low_t *coeff,
diff --git a/vp9/encoder/x86/vp9_error_sse2.asm b/vp9/encoder/x86/vp9_error_sse2.asm
index 1126fdb61..318379777 100644
--- a/vp9/encoder/x86/vp9_error_sse2.asm
+++ b/vp9/encoder/x86/vp9_error_sse2.asm
@@ -72,3 +72,49 @@ cglobal block_error, 3, 3, 8, uqc, dqc, size, ssz
   movd    edx, m5
 %endif
   RET
+
+; Compute the sum of squared differences between two int16_t vectors.
+; int64_t vp9_block_error_fp(const int16_t *coeff, const int16_t *dqcoeff,
+;                            int block_size)
+
+INIT_XMM sse2
+cglobal block_error_fp, 3, 3, 8, uqc, dqc, size
+  pxor      m4, m4                 ; sse accumulator
+  pxor      m5, m5                 ; dedicated zero register
+  lea     uqcq, [uqcq+sizeq*2]
+  lea     dqcq, [dqcq+sizeq*2]
+  neg    sizeq
+.loop:
+  mova      m2, [uqcq+sizeq*2]
+  mova      m0, [dqcq+sizeq*2]
+  mova      m3, [uqcq+sizeq*2+mmsize]
+  mova      m1, [dqcq+sizeq*2+mmsize]
+  psubw     m0, m2
+  psubw     m1, m3
+  ; individual errors are max. 15bit+sign, so squares are 30bit, and
+  ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
+  pmaddwd   m0, m0
+  pmaddwd   m1, m1
+  ; accumulate in 64bit
+  punpckldq m7, m0, m5
+  punpckhdq m0, m5
+  paddq     m4, m7
+  punpckldq m7, m1, m5
+  paddq     m4, m0
+  punpckhdq m1, m5
+  paddq     m4, m7
+  paddq     m4, m1
+  add    sizeq, mmsize
+  jl .loop
+
+  ; accumulate horizontally and store in return value
+  movhlps   m5, m4
+  paddq     m4, m5
+%if ARCH_X86_64
+  movq    rax, m4
+%else
+  pshufd    m5, m4, 0x1
+  movd    eax, m4
+  movd    edx, m5
+%endif
+  RET
-- 
2.40.0
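
Note (editorial, not part of the patch): the refactored low-bitdepth
block_yrd() above now makes two passes over the transform blocks, with an
early exit in between. Below is a minimal, self-contained C sketch of that
control flow. TxBlock, transform_and_quantize(), and block_yrd_sketch() are
hypothetical stand-ins for the encoder's real data structures and its
Hadamard/vp9_quantize_fp step; satd() and block_error_fp() follow the C
reference implementations of vp9_satd and vp9_block_error_fp.

#include <stdint.h>
#include <stdlib.h>

/* Hypothetical container for one transform block's coefficient buffers. */
typedef struct {
  int16_t coeff[256];   /* transform coefficients of the source difference */
  int16_t qcoeff[256];  /* quantized coefficients */
  int16_t dqcoeff[256]; /* dequantized coefficients */
  uint16_t eob;         /* end of block; 0 means the block is all zero */
} TxBlock;

/* Hypothetical stand-in for the Hadamard transform + vp9_quantize_fp step;
 * a real implementation fills qcoeff/dqcoeff and sets eob. */
static void transform_and_quantize(TxBlock *b) { b->eob = 0; }

/* Sum of absolute quantized coefficients, as in vp9_satd_c. */
static int satd(const int16_t *coeff, int length) {
  int i, total = 0;
  for (i = 0; i < length; ++i) total += abs(coeff[i]);
  return total;
}

/* Coefficient-domain squared error, as in vp9_block_error_fp_c. */
static int64_t block_error_fp(const int16_t *coeff, const int16_t *dqcoeff,
                              int block_size) {
  int i;
  int64_t error = 0;
  for (i = 0; i < block_size; i++) {
    const int diff = coeff[i] - dqcoeff[i];
    error += diff * diff;
  }
  return error;
}

/* Two-pass structure of the refactored block_yrd. On entry *sse holds the
 * pixel-domain SSE of the whole block, or INT64_MAX if it is unknown. */
static void block_yrd_sketch(TxBlock *blocks, int n, int block_size, int shift,
                             int *rate, int64_t *dist, int *skippable,
                             int64_t *sse) {
  int i;

  /* Pass 1: transform + quantize only; track whether everything is zero. */
  *skippable = 1;
  for (i = 0; i < n; ++i) {
    transform_and_quantize(&blocks[i]);
    *skippable &= (blocks[i].eob == 0);
  }

  /* Early exit: every block quantized to zero, so the distortion is just
   * the rescaled source SSE and the SATD/block-error pass is skipped. */
  if (*skippable && *sse < INT64_MAX) {
    *dist = (*sse << 6) >> shift;
    *sse = *dist;
    return;
  }

  /* Pass 2: rate from the quantized coefficients, distortion from the
   * coefficient-domain error; *sse keeps the rescaled source SSE. */
  *rate = 0;
  *dist = 0;
  *sse = (*sse << 6) >> shift;
  for (i = 0; i < n; ++i) {
    if (blocks[i].eob == 1)
      *rate += abs(blocks[i].qcoeff[0]);
    else if (blocks[i].eob > 1)
      *rate += satd(blocks[i].qcoeff, block_size);
    *dist += block_error_fp(blocks[i].coeff, blocks[i].dqcoeff, block_size)
             >> shift;
  }
}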
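
A worked check of the overflow comment inside the new SSE2 loop (also
editorial): assuming, as the comment does, that each coefficient difference
fits in 15 bits plus sign, |diff| <= 2^15 - 1, so diff^2 <= (2^15 - 1)^2 =
2^30 - 2^17 + 1. pmaddwd folds two adjacent squares into one 32-bit lane,
giving at most 2^31 - 2^18 + 2, which is below the signed 32-bit limit and
therefore cannot overflow the lane. The punpckldq/punpckhdq pairs against
the zero register then widen each non-negative 32-bit partial sum to 64 bits
before the paddq accumulation, so the running total in m4 stays exact for
any block size used here.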