From 7105e66d19811add930055959598beb68f21df29 Mon Sep 17 00:00:00 2001
From: Urvang Joshi <urvang@google.com>
Date: Fri, 28 Jul 2017 15:57:22 -0700
Subject: [PATCH] Remove the DP version of vp9_optimize_b().

The greedy version was already enabled by default here:
https://chromium-review.googlesource.com/c/546848/

And the speed+compression gains from greedy version were already
mentioned here:
https://chromium-review.googlesource.com/c/531675/

Change-Id: Iad9f7d03490c845ad1e230af028c9d39edddca97
---
 vp9/encoder/vp9_encodemb.c | 283 -------------------------------------
 1 file changed, 283 deletions(-)

diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index 2a50c9912..c8b7977aa 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -53,10 +53,6 @@ static const int plane_rd_mult[REF_TYPES][PLANE_TYPES] = {
   { 10, 6 }, { 8, 5 },
 };
 
-#define USE_GREEDY_OPTIMIZE_B 1
-
-#if USE_GREEDY_OPTIMIZE_B
-
 // 'num' can be negative, but 'shift' must be non-negative.
 #define RIGHT_SHIFT_POSSIBLY_NEGATIVE(num, shift) \
   ((num) >= 0) ? (num) >> (shift) : -((-(num)) >> (shift))
@@ -306,285 +302,6 @@ int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size,
 }
 #undef RIGHT_SHIFT_POSSIBLY_NEGATIVE
 
-#else
-
-#define UPDATE_RD_COST()                             \
-  {                                                  \
-    rd_cost0 = RDCOST(rdmult, rddiv, rate0, error0); \
-    rd_cost1 = RDCOST(rdmult, rddiv, rate1, error1); \
-  }
-
-// This function is a place holder for now but may ultimately need
-// to scan previous tokens to work out the correct context.
-static int trellis_get_coeff_context(const int16_t *scan, const int16_t *nb,
-                                     int idx, int token, uint8_t *token_cache) {
-  int bak = token_cache[scan[idx]], pt;
-  token_cache[scan[idx]] = vp9_pt_energy_class[token];
-  pt = get_coef_context(nb, token_cache, idx + 1);
-  token_cache[scan[idx]] = bak;
-  return pt;
-}
-
-static const int16_t band_count_table[TX_SIZES][8] = {
-  { 1, 2, 3, 4, 3, 16 - 13, 0 },
-  { 1, 2, 3, 4, 11, 64 - 21, 0 },
-  { 1, 2, 3, 4, 11, 256 - 21, 0 },
-  { 1, 2, 3, 4, 11, 1024 - 21, 0 },
-};
-static const int16_t band_cum_count_table[TX_SIZES][8] = {
-  { 0, 1, 3, 6, 10, 13, 16, 0 },
-  { 0, 1, 3, 6, 10, 21, 64, 0 },
-  { 0, 1, 3, 6, 10, 21, 256, 0 },
-  { 0, 1, 3, 6, 10, 21, 1024, 0 },
-};
-
-typedef struct vp9_token_state {
-  int64_t error;
-  int rate;
-  int16_t next;
-  int16_t token;
-  tran_low_t qc;
-  tran_low_t dqc;
-  uint8_t best_index;
-} vp9_token_state;
-
-int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size,
-                   int ctx) {
-  MACROBLOCKD *const xd = &mb->e_mbd;
-  struct macroblock_plane *const p = &mb->plane[plane];
-  struct macroblockd_plane *const pd = &xd->plane[plane];
-  const int ref = is_inter_block(xd->mi[0]);
-  vp9_token_state tokens[1025][2];
-  uint8_t token_cache[1024];
-  const tran_low_t *const coeff = BLOCK_OFFSET(mb->plane[plane].coeff, block);
-  tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
-  tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
-  const int eob = p->eobs[block];
-  const PLANE_TYPE type = get_plane_type(plane);
-  const int default_eob = 16 << (tx_size << 1);
-  const int shift = (tx_size == TX_32X32);
-  const int16_t *const dequant_ptr = pd->dequant;
-  const uint8_t *const band_translate = get_band_translate(tx_size);
-  const scan_order *const so = get_scan(xd, tx_size, type, block);
-  const int16_t *const scan = so->scan;
-  const int16_t *const nb = so->neighbors;
-  const int dq_step[2] = { dequant_ptr[0] >> shift, dequant_ptr[1] >> shift };
-  int next = eob, sz = 0;
-  const int64_t rdmult = ((int64_t)mb->rdmult * plane_rd_mult[ref][type]) >> 1;
-  const int64_t rddiv = mb->rddiv;
-  int64_t rd_cost0, rd_cost1;
-  int rate0, rate1;
-  int64_t error0, error1;
-  int16_t t0, t1;
-  int best, band = (eob < default_eob) ? band_translate[eob]
-                                       : band_translate[eob - 1];
-  int pt, i, final_eob;
-#if CONFIG_VP9_HIGHBITDEPTH
-  const uint16_t *cat6_high_cost = vp9_get_high_cost_table(xd->bd);
-#else
-  const uint16_t *cat6_high_cost = vp9_get_high_cost_table(8);
-#endif
-  unsigned int(*token_costs)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] =
-      mb->token_costs[tx_size][type][ref];
-  const int16_t *band_counts = &band_count_table[tx_size][band];
-  int16_t band_left = eob - band_cum_count_table[tx_size][band] + 1;
-
-  token_costs += band;
-
-  assert((!type && !plane) || (type && plane));
-  assert(eob <= default_eob);
-
-  /* Now set up a Viterbi trellis to evaluate alternative roundings. */
-  /* Initialize the sentinel node of the trellis. */
-  tokens[eob][0].rate = 0;
-  tokens[eob][0].error = 0;
-  tokens[eob][0].next = default_eob;
-  tokens[eob][0].token = EOB_TOKEN;
-  tokens[eob][0].qc = 0;
-  tokens[eob][1] = tokens[eob][0];
-
-  for (i = 0; i < eob; i++) {
-    const int rc = scan[i];
-    token_cache[rc] = vp9_pt_energy_class[vp9_get_token(qcoeff[rc])];
-  }
-
-  for (i = eob; i-- > 0;) {
-    int base_bits, d2, dx;
-    const int rc = scan[i];
-    int x = qcoeff[rc];
-    /* Only add a trellis state for non-zero coefficients. */
-    if (x) {
-      error0 = tokens[next][0].error;
-      error1 = tokens[next][1].error;
-      /* Evaluate the first possibility for this state. */
-      rate0 = tokens[next][0].rate;
-      rate1 = tokens[next][1].rate;
-      base_bits = vp9_get_token_cost(x, &t0, cat6_high_cost);
-      /* Consider both possible successor states. */
-      if (next < default_eob) {
-        pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache);
-        rate0 += (*token_costs)[0][pt][tokens[next][0].token];
-        rate1 += (*token_costs)[0][pt][tokens[next][1].token];
-      }
-      UPDATE_RD_COST();
-      /* And pick the best. */
-      best = rd_cost1 < rd_cost0;
-      dx = (dqcoeff[rc] - coeff[rc]) * (1 << shift);
-#if CONFIG_VP9_HIGHBITDEPTH
-      if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-        dx >>= xd->bd - 8;
-      }
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-      d2 = dx * dx;
-      tokens[i][0].rate = base_bits + (best ? rate1 : rate0);
-      tokens[i][0].error = d2 + (best ? error1 : error0);
-      tokens[i][0].next = next;
-      tokens[i][0].token = t0;
-      tokens[i][0].qc = x;
-      tokens[i][0].dqc = dqcoeff[rc];
-      tokens[i][0].best_index = best;
-
-      /* Evaluate the second possibility for this state. */
-      rate0 = tokens[next][0].rate;
-      rate1 = tokens[next][1].rate;
-
-      if ((abs(x) * dequant_ptr[rc != 0] > (abs(coeff[rc]) << shift)) &&
-          (abs(x) * dequant_ptr[rc != 0] <
-           (abs(coeff[rc]) << shift) + dequant_ptr[rc != 0])) {
-        sz = -(x < 0);
-        x -= 2 * sz + 1;
-      } else {
-        tokens[i][1] = tokens[i][0];
-        next = i;
-
-        if (!(--band_left)) {
-          --band_counts;
-          band_left = *band_counts;
-          --token_costs;
-        }
-        continue;
-      }
-
-      /* Consider both possible successor states. */
-      if (!x) {
-        /* If we reduced this coefficient to zero, check to see if
-         *  we need to move the EOB back here.
-         */
-        t0 = tokens[next][0].token == EOB_TOKEN ? EOB_TOKEN : ZERO_TOKEN;
-        t1 = tokens[next][1].token == EOB_TOKEN ? EOB_TOKEN : ZERO_TOKEN;
-        base_bits = 0;
-      } else {
-        base_bits = vp9_get_token_cost(x, &t0, cat6_high_cost);
-        t1 = t0;
-      }
-      if (next < default_eob) {
-        if (t0 != EOB_TOKEN) {
-          pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache);
-          rate0 += (*token_costs)[!x][pt][tokens[next][0].token];
-        }
-        if (t1 != EOB_TOKEN) {
-          pt = trellis_get_coeff_context(scan, nb, i, t1, token_cache);
-          rate1 += (*token_costs)[!x][pt][tokens[next][1].token];
-        }
-      }
-
-      UPDATE_RD_COST();
-      /* And pick the best. */
-      best = rd_cost1 < rd_cost0;
-
-#if CONFIG_VP9_HIGHBITDEPTH
-      if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-        dx -= ((dequant_ptr[rc != 0] >> (xd->bd - 8)) + sz) ^ sz;
-      } else {
-        dx -= (dequant_ptr[rc != 0] + sz) ^ sz;
-      }
-#else
-      dx -= (dequant_ptr[rc != 0] + sz) ^ sz;
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-      d2 = dx * dx;
-
-      tokens[i][1].rate = base_bits + (best ? rate1 : rate0);
-      tokens[i][1].error = d2 + (best ? error1 : error0);
-      tokens[i][1].next = next;
-      tokens[i][1].token = best ? t1 : t0;
-      tokens[i][1].qc = x;
-
-      if (x) {
-        tran_low_t offset = dq_step[rc != 0];
-        // The 32x32 transform coefficient uses half quantization step size.
-        // Account for the rounding difference in the dequantized coefficeint
-        // value when the quantization index is dropped from an even number
-        // to an odd number.
-        if (shift & x) offset += (dequant_ptr[rc != 0] & 0x01);
-
-        if (sz == 0)
-          tokens[i][1].dqc = dqcoeff[rc] - offset;
-        else
-          tokens[i][1].dqc = dqcoeff[rc] + offset;
-      } else {
-        tokens[i][1].dqc = 0;
-      }
-
-      tokens[i][1].best_index = best;
-      /* Finally, make this the new head of the trellis. */
-      next = i;
-    } else {
-      /* There's no choice to make for a zero coefficient, so we don't
-       *  add a new trellis node, but we do need to update the costs.
-       */
-      pt = get_coef_context(nb, token_cache, i + 1);
-      t0 = tokens[next][0].token;
-      t1 = tokens[next][1].token;
-      /* Update the cost of each path if we're past the EOB token. */
-      if (t0 != EOB_TOKEN) {
-        tokens[next][0].rate += (*token_costs)[1][pt][t0];
-        tokens[next][0].token = ZERO_TOKEN;
-      }
-      if (t1 != EOB_TOKEN) {
-        tokens[next][1].rate += (*token_costs)[1][pt][t1];
-        tokens[next][1].token = ZERO_TOKEN;
-      }
-      tokens[i][0].best_index = tokens[i][1].best_index = 0;
-      /* Don't update next, because we didn't add a new node. */
-    }
-
-    if (!(--band_left)) {
-      --band_counts;
-      band_left = *band_counts;
-      --token_costs;
-    }
-  }
-
-  /* Now pick the best path through the whole trellis. */
-  rate0 = tokens[next][0].rate;
-  rate1 = tokens[next][1].rate;
-  error0 = tokens[next][0].error;
-  error1 = tokens[next][1].error;
-  t0 = tokens[next][0].token;
-  t1 = tokens[next][1].token;
-  rate0 += (*token_costs)[0][ctx][t0];
-  rate1 += (*token_costs)[0][ctx][t1];
-  UPDATE_RD_COST();
-  best = rd_cost1 < rd_cost0;
-  final_eob = -1;
-
-  for (i = next; i < eob; i = next) {
-    const int x = tokens[i][best].qc;
-    const int rc = scan[i];
-    if (x) final_eob = i;
-    qcoeff[rc] = x;
-    dqcoeff[rc] = tokens[i][best].dqc;
-    next = tokens[i][best].next;
-    best = tokens[i][best].best_index;
-  }
-  final_eob++;
-
-  mb->plane[plane].eobs[block] = final_eob;
-  return final_eob;
-}
-
-#endif  // USE_GREEDY_OPTIMIZE_B
-
 static INLINE void fdct32x32(int rd_transform, const int16_t *src,
                              tran_low_t *dst, int src_stride) {
   if (rd_transform)
-- 
2.40.0