From 5322a31b189178ad714aa03e42760e4cacd45a9e Mon Sep 17 00:00:00 2001
From: Urvang Joshi <urvang@google.com>
Date: Tue, 11 Jul 2017 13:05:29 -0700
Subject: [PATCH] Remove the token state array from greedy optimize_b.

Reduces memory usage, and speeds up encoding for some difficult clips.
No impact on output or metrics.

Ported from aomedia patch:
https://aomedia-review.googlesource.com/c/14501

Change-Id: I26ec69af8336f9e80da486a1cfbfc89a3596954d
---
 vp9/encoder/vp9_encodemb.c | 61 ++++++++++++++++----------------------
 1 file changed, 25 insertions(+), 36 deletions(-)

diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index aa6bb8be9..2a50c9912 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -57,12 +57,6 @@ static const int plane_rd_mult[REF_TYPES][PLANE_TYPES] = {
 
 #if USE_GREEDY_OPTIMIZE_B
 
-typedef struct {
-  int16_t token;
-  tran_low_t qc;
-  tran_low_t dqc;
-} vp9_token_state;
-
 // 'num' can be negative, but 'shift' must be non-negative.
 #define RIGHT_SHIFT_POSSIBLY_NEGATIVE(num, shift) \
   ((num) >= 0) ? (num) >> (shift) : -((-(num)) >> (shift))
@@ -73,7 +67,6 @@ int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size,
   struct macroblock_plane *const p = &mb->plane[plane];
   struct macroblockd_plane *const pd = &xd->plane[plane];
   const int ref = is_inter_block(xd->mi[0]);
-  vp9_token_state tokens[1025][2];
   uint8_t token_cache[1024];
   const tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
   tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
@@ -110,22 +103,16 @@ int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size,
   int64_t accu_error = ((int64_t)1) << 50;
   int64_t best_block_rd_cost = INT64_MAX;
   int x_prev = 1;
+  tran_low_t before_best_eob_qc = 0;
+  tran_low_t before_best_eob_dqc = 0;
+
   assert((!plane_type && !plane) || (plane_type && plane));
   assert(eob <= default_eob);
 
   for (i = 0; i < eob; i++) {
     const int rc = scan[i];
-    int x = qcoeff[rc];
-    t0 = vp9_get_token(x);
-    tokens[i][0].qc = x;
-    tokens[i][0].token = t0;
-    tokens[i][0].dqc = dqcoeff[rc];
-    token_cache[rc] = vp9_pt_energy_class[t0];
+    token_cache[rc] = vp9_pt_energy_class[vp9_get_token(qcoeff[rc])];
   }
-  tokens[eob][0].token = EOB_TOKEN;
-  tokens[eob][0].qc = 0;
-  tokens[eob][0].dqc = 0;
-  tokens[eob][1] = tokens[eob][0];
   final_eob = 0;
 
   // Initial RD cost.
@@ -144,8 +131,8 @@ int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size,
     const int token_tree_sel_cur = (x_prev == 0);
     token_costs_cur = token_costs + band_cur;
     if (x == 0) {  // No need to search
-      rate0 =
-          (*token_costs_cur)[token_tree_sel_cur][ctx_cur][tokens[i][0].token];
+      const int token = vp9_get_token(x);
+      rate0 = (*token_costs_cur)[token_tree_sel_cur][ctx_cur][token];
       accu_rate += rate0;
       x_prev = 0;
       // Note: accu_error does not change.
@@ -205,6 +192,7 @@ int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size,
         int rdcost_better_for_x1, eob_rdcost_better_for_x1;
         int dqc0, dqc1;
         int64_t best_eob_cost_cur;
+        int use_x1;
 
         // Calculate RD Cost effect on the next coeff for the two candidates.
         int64_t next_bits0 = 0;
@@ -214,21 +202,23 @@ int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size,
         if (i < default_eob - 1) {
           int ctx_next, token_tree_sel_next;
           const int band_next = band_translate[i + 1];
+          const int token_next =
+              (i + 1 != eob) ? vp9_get_token(qcoeff[scan[i + 1]]) : EOB_TOKEN;
           unsigned int(
               *const token_costs_next)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] =
               token_costs + band_next;
           token_cache[rc] = vp9_pt_energy_class[t0];
           ctx_next = get_coef_context(nb, token_cache, i + 1);
           token_tree_sel_next = (x == 0);
-          next_bits0 = (*token_costs_next)[token_tree_sel_next][ctx_next]
-                                          [tokens[i + 1][0].token];
+          next_bits0 =
+              (*token_costs_next)[token_tree_sel_next][ctx_next][token_next];
           next_eob_bits0 =
               (*token_costs_next)[token_tree_sel_next][ctx_next][EOB_TOKEN];
           token_cache[rc] = vp9_pt_energy_class[t1];
           ctx_next = get_coef_context(nb, token_cache, i + 1);
           token_tree_sel_next = (x1 == 0);
-          next_bits1 = (*token_costs_next)[token_tree_sel_next][ctx_next]
-                                          [tokens[i + 1][0].token];
+          next_bits1 =
+              (*token_costs_next)[token_tree_sel_next][ctx_next][token_next];
           if (x1 != 0) {
             next_eob_bits1 =
                 (*token_costs_next)[token_tree_sel_next][ctx_next][EOB_TOKEN];
@@ -279,21 +269,20 @@ int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size,
         assert(accu_error >= 0);
         x_prev = qcoeff[rc];  // Update based on selected quantized value.
 
-        best_eob_cost_cur = eob_cost0;
-        tokens[i][1].token = t0;
-        tokens[i][1].qc = x;
-        tokens[i][1].dqc = dqc0;
-        if ((x1 != 0) && eob_rdcost_better_for_x1) {
-          best_eob_cost_cur = eob_cost1;
-          tokens[i][1].token = t1;
-          tokens[i][1].qc = x1;
-          tokens[i][1].dqc = dqc1;
-        }
+        use_x1 = (x1 != 0) && eob_rdcost_better_for_x1;
+        best_eob_cost_cur = use_x1 ? eob_cost1 : eob_cost0;
 
         // Determine whether to move the eob position to i+1
         if (best_eob_cost_cur < best_block_rd_cost) {
           best_block_rd_cost = best_eob_cost_cur;
           final_eob = i + 1;
+          if (use_x1) {
+            before_best_eob_qc = x1;
+            before_best_eob_dqc = dqc1;
+          } else {
+            before_best_eob_qc = x;
+            before_best_eob_dqc = dqc0;
+          }
         }
       }
     }
@@ -301,11 +290,11 @@ int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size,
   assert(final_eob <= eob);
   if (final_eob > 0) {
     int rc;
-    assert(tokens[final_eob - 1][1].qc != 0);
+    assert(before_best_eob_qc != 0);
     i = final_eob - 1;
     rc = scan[i];
-    qcoeff[rc] = tokens[i][1].qc;
-    dqcoeff[rc] = tokens[i][1].dqc;
+    qcoeff[rc] = before_best_eob_qc;
+    dqcoeff[rc] = before_best_eob_dqc;
   }
   for (i = final_eob; i < eob; i++) {
     int rc = scan[i];
-- 
2.40.0