From a4ea7e131b139c5cb68e50f9d2518da53695a810 Mon Sep 17 00:00:00 2001
From: Urvang Joshi <urvang@google.com>
Date: Thu, 8 Jun 2017 14:51:01 -0700
Subject: [PATCH] VP9: Add greedy version of av1_optimize_b().

This was ported from the greedy version in AV1, written by Dake He
(dkhe@google.com).
See:
https://aomedia.googlesource.com/aom/+/master/av1/encoder/encodemb.c#137

The greedy version is disabled by default, but can be selected by setting
USE_GREEDY_OPTIMIZE_B to 1.
It is intended to be enabled by default later.

This is both faster and better in terms of compression.

Compression Improvement:
------------------------
lowres: -0.119
midres: -0.064
hdres:  -0.405

Speed Improvement:
------------------
(Based on encode time of 3 videos of different difficulties at
3 different target bitrates)
With --cpu-used=0: 0.38% to 5.55% faster
With --cpu-used=1: 0.24% to 2.79% faster
With --cpu-used=2: 0.29% to 1.46% faster

Change-Id: Ia7a23b3b244ad8eb253ac9e43cd03c5e021d2635
---
 vp9/encoder/vp9_encodemb.c | 285 +++++++++++++++++++++++++++++++++++--
 1 file changed, 277 insertions(+), 8 deletions(-)

diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index 7e30499c5..d8ea92af0 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -49,19 +49,275 @@ void vp9_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
                      pd->dst.buf, pd->dst.stride);
 }
 
-typedef struct vp9_token_state {
-  int64_t error;
-  int rate;
-  int16_t next;
+static const int plane_rd_mult[REF_TYPES][PLANE_TYPES] = {
+  { 10, 6 }, { 8, 5 },
+};
+// Set to 1 to compile the greedy vp9_optimize_b() below; 0 keeps the default.
+#define USE_GREEDY_OPTIMIZE_B 0
+
+#if USE_GREEDY_OPTIMIZE_B
+// A coefficient candidate: its token plus quantized and dequantized values.
+typedef struct {
   int16_t token;
   tran_low_t qc;
   tran_low_t dqc;
-  uint8_t best_index;
 } vp9_token_state;
 
-static const int plane_rd_mult[REF_TYPES][PLANE_TYPES] = {
-  { 10, 6 }, { 8, 5 },
-};
+// Sign-symmetric right shift: 'num' may be negative, 'shift' must be >= 0.
+#define RIGHT_SHIFT_POSSIBLY_NEGATIVE(num, shift) \
+  (((num) >= 0) ? (num) >> (shift) : -((-(num)) >> (shift)))
+// Greedy optimizer: each coeff keeps qc or shrinks |qc| by 1; returns new eob.
+int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size,
+                   int ctx) {
+  MACROBLOCKD *const xd = &mb->e_mbd;
+  struct macroblock_plane *const p = &mb->plane[plane];
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  const int ref = is_inter_block(xd->mi[0]);
+  vp9_token_state tokens[1025][2];
+  uint8_t token_cache[1024];
+  const tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+  tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+  tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+  const int eob = p->eobs[block];
+  const PLANE_TYPE plane_type = get_plane_type(plane);
+  const int default_eob = 16 << (tx_size << 1);
+  const int shift = (tx_size == TX_32X32);
+  const int16_t *const dequant_ptr = pd->dequant;
+  const uint8_t *const band_translate = get_band_translate(tx_size);
+  const scan_order *const so = get_scan(xd, tx_size, plane_type, block);
+  const int16_t *const scan = so->scan;
+  const int16_t *const nb = so->neighbors;
+  const int64_t rdmult =
+      ((int64_t)mb->rdmult * plane_rd_mult[ref][plane_type]) >> 1;
+  const int64_t rddiv = mb->rddiv;
+  int64_t rd_cost0, rd_cost1;
+  int64_t rate0, rate1;
+  int16_t t0, t1;
+  int i, final_eob;
+#if CONFIG_VP9_HIGHBITDEPTH
+  const uint16_t *cat6_high_cost = vp9_get_high_cost_table(xd->bd);
+#else
+  const uint16_t *cat6_high_cost = vp9_get_high_cost_table(8);
+#endif
+  unsigned int(*const token_costs)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] =
+      mb->token_costs[tx_size][plane_type][ref];
+  unsigned int(*token_costs_cur)[2][COEFF_CONTEXTS][ENTROPY_TOKENS];
+  int64_t eob_cost0, eob_cost1;
+  const int ctx0 = ctx;
+  int64_t accu_rate = 0;
+  // Initialized to the worst possible error for the largest transform size.
+  // This ensures that it never goes negative.
+  int64_t accu_error = ((int64_t)1) << 50;
+  int64_t best_block_rd_cost = INT64_MAX;
+  int x_prev = 1;
+  assert((!plane_type && !plane) || (plane_type && plane));
+  assert(eob <= default_eob);
+
+  for (i = 0; i < eob; i++) {
+    const int rc = scan[i];
+    int x = qcoeff[rc];
+    t0 = vp9_get_token(x);
+    tokens[i][0].qc = x;
+    tokens[i][0].token = t0;
+    tokens[i][0].dqc = dqcoeff[rc];
+    token_cache[rc] = vp9_pt_energy_class[t0];
+  }
+  tokens[eob][0].token = EOB_TOKEN;
+  tokens[eob][0].qc = 0;
+  tokens[eob][0].dqc = 0;
+  tokens[eob][1] = tokens[eob][0];
+  final_eob = 0;
+
+  // Initial RD cost: EOB coded at position 0, matching final_eob == 0 above.
+  token_costs_cur = token_costs + band_translate[0];
+  rate0 = (*token_costs_cur)[0][ctx0][EOB_TOKEN];
+  best_block_rd_cost = RDCOST(rdmult, rddiv, rate0, accu_error);
+
+  // For each token, pick one of two choices greedily:
+  // (i) First candidate: Keep current quantized value, OR
+  // (ii) Second candidate: Reduce quantized value by 1.
+  for (i = 0; i < eob; i++) {
+    const int rc = scan[i];
+    const int x = qcoeff[rc];
+    const int band_cur = band_translate[i];
+    const int ctx_cur = (i == 0) ? ctx : get_coef_context(nb, token_cache, i);
+    const int token_tree_sel_cur = (x_prev == 0);
+    token_costs_cur = token_costs + band_cur;
+    if (x == 0) {  // No need to search
+      rate0 =
+          (*token_costs_cur)[token_tree_sel_cur][ctx_cur][tokens[i][0].token];
+      accu_rate += rate0;
+      x_prev = 0;
+      // Note: accu_error does not change.
+    } else {
+      const int dqv = dequant_ptr[rc != 0];
+      // Compute the distortion for quantizing to 0.
+      const int diff_for_zero_raw = (0 - coeff[rc]) * (1 << shift);
+      const int diff_for_zero =
+#if CONFIG_VP9_HIGHBITDEPTH
+          (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+              ? RIGHT_SHIFT_POSSIBLY_NEGATIVE(diff_for_zero_raw, xd->bd - 8)
+              :
+#endif
+              diff_for_zero_raw;
+      const int64_t distortion_for_zero =
+          (int64_t)diff_for_zero * diff_for_zero;
+
+      // Compute the distortion for the first candidate
+      const int diff0_raw = (dqcoeff[rc] - coeff[rc]) * (1 << shift);
+      const int diff0 =
+#if CONFIG_VP9_HIGHBITDEPTH
+          (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+              ? RIGHT_SHIFT_POSSIBLY_NEGATIVE(diff0_raw, xd->bd - 8)
+              :
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+              diff0_raw;
+      const int64_t distortion0 = (int64_t)diff0 * diff0;
+
+      // Compute the distortion for the second candidate
+      const int sign = -(x < 0);        // -1 if x is negative and 0 otherwise.
+      const int x1 = x - 2 * sign - 1;  // abs(x1) = abs(x) - 1.
+      int64_t distortion1;
+      if (x1 != 0) {
+        const int dqv_step =
+#if CONFIG_VP9_HIGHBITDEPTH
+            (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? dqv >> (xd->bd - 8)
+                                                          :
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+                                                          dqv;
+        const int diff_step = (dqv_step + sign) ^ sign;
+        const int diff1 = diff0 - diff_step;
+        assert(dqv > 0);  // We aren't right shifting a negative number above.
+        distortion1 = (int64_t)diff1 * diff1;
+      } else {
+        distortion1 = distortion_for_zero;
+      }
+      {
+        // Calculate RDCost for current coeff for the two candidates.
+        const int64_t base_bits0 = vp9_get_token_cost(x, &t0, cat6_high_cost);
+        const int64_t base_bits1 = vp9_get_token_cost(x1, &t1, cat6_high_cost);
+        rate0 =
+            base_bits0 + (*token_costs_cur)[token_tree_sel_cur][ctx_cur][t0];
+        rate1 =
+            base_bits1 + (*token_costs_cur)[token_tree_sel_cur][ctx_cur][t1];
+      }
+      {
+        int rdcost_better_for_x1, eob_rdcost_better_for_x1;
+        int dqc0, dqc1;
+        int64_t best_eob_cost_cur;
+
+        // Calculate RD Cost effect on the next coeff for the two candidates.
+        int64_t next_bits0 = 0;
+        int64_t next_bits1 = 0;
+        int64_t next_eob_bits0 = 0;
+        int64_t next_eob_bits1 = 0;
+        if (i < default_eob - 1) {
+          int ctx_next, token_tree_sel_next;
+          const int band_next = band_translate[i + 1];
+          unsigned int(
+              *const token_costs_next)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] =
+              token_costs + band_next;
+          token_cache[rc] = vp9_pt_energy_class[t0];
+          ctx_next = get_coef_context(nb, token_cache, i + 1);
+          token_tree_sel_next = (x == 0);
+          next_bits0 = (*token_costs_next)[token_tree_sel_next][ctx_next]
+                                          [tokens[i + 1][0].token];
+          next_eob_bits0 =
+              (*token_costs_next)[token_tree_sel_next][ctx_next][EOB_TOKEN];
+          token_cache[rc] = vp9_pt_energy_class[t1];
+          ctx_next = get_coef_context(nb, token_cache, i + 1);
+          token_tree_sel_next = (x1 == 0);
+          next_bits1 = (*token_costs_next)[token_tree_sel_next][ctx_next]
+                                          [tokens[i + 1][0].token];
+          if (x1 != 0) {
+            next_eob_bits1 =
+                (*token_costs_next)[token_tree_sel_next][ctx_next][EOB_TOKEN];
+          }
+        }
+
+        // Compare the total RD costs for two candidates.
+        rd_cost0 = RDCOST(rdmult, rddiv, (rate0 + next_bits0), distortion0);
+        rd_cost1 = RDCOST(rdmult, rddiv, (rate1 + next_bits1), distortion1);
+        rdcost_better_for_x1 = (rd_cost1 < rd_cost0);
+        eob_cost0 = RDCOST(rdmult, rddiv, (accu_rate + rate0 + next_eob_bits0),
+                           (accu_error + distortion0 - distortion_for_zero));
+        eob_cost1 = eob_cost0;
+        if (x1 != 0) {
+          eob_cost1 =
+              RDCOST(rdmult, rddiv, (accu_rate + rate1 + next_eob_bits1),
+                     (accu_error + distortion1 - distortion_for_zero));
+          eob_rdcost_better_for_x1 = (eob_cost1 < eob_cost0);
+        } else {
+          eob_rdcost_better_for_x1 = 0;
+        }
+
+        // Calculate the two candidate de-quantized values.
+        dqc0 = dqcoeff[rc];
+        dqc1 = 0;
+        if (rdcost_better_for_x1 + eob_rdcost_better_for_x1) {
+          if (x1 != 0) {
+            dqc1 = RIGHT_SHIFT_POSSIBLY_NEGATIVE(x1 * dqv, shift);
+          } else {
+            dqc1 = 0;
+          }
+        }
+
+        // Pick and record the better quantized and de-quantized values.
+        if (rdcost_better_for_x1) {
+          qcoeff[rc] = x1;
+          dqcoeff[rc] = dqc1;
+          accu_rate += rate1;
+          accu_error += distortion1 - distortion_for_zero;
+          assert(distortion1 <= distortion_for_zero);
+          token_cache[rc] = vp9_pt_energy_class[t1];
+        } else {
+          accu_rate += rate0;
+          accu_error += distortion0 - distortion_for_zero;
+          assert(distortion0 <= distortion_for_zero);
+          token_cache[rc] = vp9_pt_energy_class[t0];
+        }
+        assert(accu_error >= 0);
+        x_prev = qcoeff[rc];  // Update based on selected quantized value.
+
+        best_eob_cost_cur = eob_cost0;
+        tokens[i][1].token = t0;
+        tokens[i][1].qc = x;
+        tokens[i][1].dqc = dqc0;
+        if ((x1 != 0) && eob_rdcost_better_for_x1) {
+          best_eob_cost_cur = eob_cost1;
+          tokens[i][1].token = t1;
+          tokens[i][1].qc = x1;
+          tokens[i][1].dqc = dqc1;
+        }
+
+        // Determine whether to move the eob position to i+1
+        if (best_eob_cost_cur < best_block_rd_cost) {
+          best_block_rd_cost = best_eob_cost_cur;
+          final_eob = i + 1;
+        }
+      }
+    }
+  }
+  assert(final_eob <= eob);
+  if (final_eob > 0) {
+    int rc;
+    assert(tokens[final_eob - 1][1].qc != 0);
+    i = final_eob - 1;
+    rc = scan[i];
+    qcoeff[rc] = tokens[i][1].qc;
+    dqcoeff[rc] = tokens[i][1].dqc;
+  }
+  for (i = final_eob; i < eob; i++) {
+    int rc = scan[i];
+    qcoeff[rc] = 0;
+    dqcoeff[rc] = 0;
+  }
+  mb->plane[plane].eobs[block] = final_eob;
+  return final_eob;
+}
+#undef RIGHT_SHIFT_POSSIBLY_NEGATIVE
+
+#else
 
 #define UPDATE_RD_COST()                             \
   {                                                  \
@@ -92,6 +348,17 @@ static const int16_t band_cum_count_table[TX_SIZES][8] = {
   { 0, 1, 3, 6, 10, 21, 256, 0 },
   { 0, 1, 3, 6, 10, 21, 1024, 0 },
 };
+// Token state for the non-greedy path (USE_GREEDY_OPTIMIZE_B == 0).
+typedef struct vp9_token_state {
+  int64_t error;
+  int rate;
+  int16_t next;
+  int16_t token;
+  tran_low_t qc;
+  tran_low_t dqc;
+  uint8_t best_index;
+} vp9_token_state;
+
 int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size,
                    int ctx) {
   MACROBLOCKD *const xd = &mb->e_mbd;
@@ -327,6 +594,8 @@ int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size,
   return final_eob;
 }
 
+#endif  // USE_GREEDY_OPTIMIZE_B
+
 static INLINE void fdct32x32(int rd_transform, const int16_t *src,
                              tran_low_t *dst, int src_stride) {
   if (rd_transform)
-- 
2.40.0