From: Debargha Mukherjee Date: Tue, 11 Oct 2016 12:26:50 +0000 (-0700) Subject: Refactor expand dry_run types to return coef rate X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=ceebb701979a95f856dce223b118d910e4781154;p=libvpx Refactor expand dry_run types to return coef rate Adds the functionality to return the rate cost due to coefficients without doing full search of all modes. This will be subsequently used in various experiments, including in new_quant experiment to search quantization profiles at the superblock level without repeating the full mode/partition search. Change-Id: I4aad3f3f0c8b8dfdea38f8f4f094a98283f47f08 --- diff --git a/av1/encoder/encodeframe.c b/av1/encoder/encodeframe.c index 87e7d51b0..ea865b693 100644 --- a/av1/encoder/encodeframe.c +++ b/av1/encoder/encodeframe.c @@ -61,8 +61,9 @@ #endif // CONFIG_AOM_HIGHBITDEPTH static void encode_superblock(AV1_COMP *cpi, ThreadData *td, TOKENEXTRA **t, - int output_enabled, int mi_row, int mi_col, - BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx); + RUN_TYPE dry_run, int mi_row, int mi_col, + BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, + int *rate); #if CONFIG_SUPERTX static int check_intra_b(PICK_MODE_CONTEXT *ctx); @@ -80,13 +81,13 @@ static int check_supertx_sb(BLOCK_SIZE bsize, TX_SIZE supertx_size, static void predict_sb_complex(AV1_COMP *cpi, ThreadData *td, const TileInfo *const tile, int mi_row, int mi_col, int mi_row_ori, int mi_col_ori, - int output_enabled, BLOCK_SIZE bsize, + RUN_TYPE dry_run, BLOCK_SIZE bsize, BLOCK_SIZE top_bsize, uint8_t *dst_buf[3], int dst_stride[3], PC_TREE *pc_tree); static void update_state_sb_supertx(AV1_COMP *cpi, ThreadData *td, const TileInfo *const tile, int mi_row, int mi_col, BLOCK_SIZE bsize, - int output_enabled, PC_TREE *pc_tree); + RUN_TYPE dry_run, PC_TREE *pc_tree); static void rd_supertx_sb(AV1_COMP *cpi, ThreadData *td, const TileInfo *const tile, int mi_row, int mi_col, BLOCK_SIZE bsize, int *tmp_rate, int64_t *tmp_dist, @@ -1025,7 
+1026,7 @@ static void update_global_motion_used(PREDICTION_MODE mode, static void update_state(AV1_COMP *cpi, ThreadData *td, PICK_MODE_CONTEXT *ctx, int mi_row, int mi_col, BLOCK_SIZE bsize, - int output_enabled) { + RUN_TYPE dry_run) { int i, x_idx, y; AV1_COMMON *const cm = &cpi->common; RD_COUNTS *const rdc = &td->rd_counts; @@ -1139,7 +1140,7 @@ static void update_state(AV1_COMP *cpi, ThreadData *td, PICK_MODE_CONTEXT *ctx, sizeof(uint8_t) * ctx->num_4x4_blk); #endif - if (!output_enabled) return; + if (dry_run) return; #if CONFIG_INTERNAL_STATS if (frame_is_intra_only(cm)) { @@ -1208,7 +1209,7 @@ static void update_state(AV1_COMP *cpi, ThreadData *td, PICK_MODE_CONTEXT *ctx, #if CONFIG_SUPERTX static void update_state_supertx(AV1_COMP *cpi, ThreadData *td, PICK_MODE_CONTEXT *ctx, int mi_row, int mi_col, - BLOCK_SIZE bsize, int output_enabled) { + BLOCK_SIZE bsize, RUN_TYPE dry_run) { int y, x_idx; #if CONFIG_VAR_TX || CONFIG_REF_MV int i; @@ -1316,7 +1317,7 @@ static void update_state_supertx(AV1_COMP *cpi, ThreadData *td, // Turn motion variation off for supertx mbmi->motion_variation = SIMPLE_TRANSLATION; - if (!output_enabled) return; + if (dry_run) return; if (!frame_is_intra_only(cm)) { av1_update_mv_count(td); @@ -1354,7 +1355,7 @@ static void update_state_supertx(AV1_COMP *cpi, ThreadData *td, static void update_state_sb_supertx(AV1_COMP *cpi, ThreadData *td, const TileInfo *const tile, int mi_row, int mi_col, BLOCK_SIZE bsize, - int output_enabled, PC_TREE *pc_tree) { + RUN_TYPE dry_run, PC_TREE *pc_tree) { AV1_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &td->mb; MACROBLOCKD *const xd = &x->e_mbd; @@ -1378,27 +1379,27 @@ static void update_state_sb_supertx(AV1_COMP *cpi, ThreadData *td, case PARTITION_NONE: set_offsets_supertx(cpi, td, tile, mi_row, mi_col, subsize); update_state_supertx(cpi, td, &pc_tree->none, mi_row, mi_col, subsize, - output_enabled); + dry_run); break; case PARTITION_VERT: set_offsets_supertx(cpi, td, tile, mi_row, 
mi_col, subsize); update_state_supertx(cpi, td, &pc_tree->vertical[0], mi_row, mi_col, - subsize, output_enabled); + subsize, dry_run); if (mi_col + hbs < cm->mi_cols && bsize > BLOCK_8X8) { set_offsets_supertx(cpi, td, tile, mi_row, mi_col + hbs, subsize); update_state_supertx(cpi, td, &pc_tree->vertical[1], mi_row, - mi_col + hbs, subsize, output_enabled); + mi_col + hbs, subsize, dry_run); } pmc = &pc_tree->vertical_supertx; break; case PARTITION_HORZ: set_offsets_supertx(cpi, td, tile, mi_row, mi_col, subsize); update_state_supertx(cpi, td, &pc_tree->horizontal[0], mi_row, mi_col, - subsize, output_enabled); + subsize, dry_run); if (mi_row + hbs < cm->mi_rows && bsize > BLOCK_8X8) { set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col, subsize); update_state_supertx(cpi, td, &pc_tree->horizontal[1], mi_row + hbs, - mi_col, subsize, output_enabled); + mi_col, subsize, dry_run); } pmc = &pc_tree->horizontal_supertx; break; @@ -1406,20 +1407,20 @@ static void update_state_sb_supertx(AV1_COMP *cpi, ThreadData *td, if (bsize == BLOCK_8X8) { set_offsets_supertx(cpi, td, tile, mi_row, mi_col, subsize); update_state_supertx(cpi, td, pc_tree->leaf_split[0], mi_row, mi_col, - subsize, output_enabled); + subsize, dry_run); } else { set_offsets_supertx(cpi, td, tile, mi_row, mi_col, subsize); - update_state_sb_supertx(cpi, td, tile, mi_row, mi_col, subsize, - output_enabled, pc_tree->split[0]); + update_state_sb_supertx(cpi, td, tile, mi_row, mi_col, subsize, dry_run, + pc_tree->split[0]); set_offsets_supertx(cpi, td, tile, mi_row, mi_col + hbs, subsize); update_state_sb_supertx(cpi, td, tile, mi_row, mi_col + hbs, subsize, - output_enabled, pc_tree->split[1]); + dry_run, pc_tree->split[1]); set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col, subsize); update_state_sb_supertx(cpi, td, tile, mi_row + hbs, mi_col, subsize, - output_enabled, pc_tree->split[2]); + dry_run, pc_tree->split[2]); set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col + hbs, subsize); 
update_state_sb_supertx(cpi, td, tile, mi_row + hbs, mi_col + hbs, - subsize, output_enabled, pc_tree->split[3]); + subsize, dry_run, pc_tree->split[3]); } pmc = &pc_tree->split_supertx; break; @@ -1427,49 +1428,49 @@ static void update_state_sb_supertx(AV1_COMP *cpi, ThreadData *td, case PARTITION_HORZ_A: set_offsets_supertx(cpi, td, tile, mi_row, mi_col, bsize2); update_state_supertx(cpi, td, &pc_tree->horizontala[0], mi_row, mi_col, - bsize2, output_enabled); + bsize2, dry_run); set_offsets_supertx(cpi, td, tile, mi_row, mi_col + hbs, bsize2); update_state_supertx(cpi, td, &pc_tree->horizontala[1], mi_row, - mi_col + hbs, bsize2, output_enabled); + mi_col + hbs, bsize2, dry_run); set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col, subsize); update_state_supertx(cpi, td, &pc_tree->horizontala[2], mi_row + hbs, - mi_col, subsize, output_enabled); + mi_col, subsize, dry_run); pmc = &pc_tree->horizontala_supertx; break; case PARTITION_HORZ_B: set_offsets_supertx(cpi, td, tile, mi_row, mi_col, subsize); update_state_supertx(cpi, td, &pc_tree->horizontalb[0], mi_row, mi_col, - subsize, output_enabled); + subsize, dry_run); set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col, bsize2); update_state_supertx(cpi, td, &pc_tree->horizontalb[1], mi_row + hbs, - mi_col, bsize2, output_enabled); + mi_col, bsize2, dry_run); set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col + hbs, bsize2); update_state_supertx(cpi, td, &pc_tree->horizontalb[2], mi_row + hbs, - mi_col + hbs, bsize2, output_enabled); + mi_col + hbs, bsize2, dry_run); pmc = &pc_tree->horizontalb_supertx; break; case PARTITION_VERT_A: set_offsets_supertx(cpi, td, tile, mi_row, mi_col, bsize2); update_state_supertx(cpi, td, &pc_tree->verticala[0], mi_row, mi_col, - bsize2, output_enabled); + bsize2, dry_run); set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col, bsize2); update_state_supertx(cpi, td, &pc_tree->verticala[1], mi_row + hbs, - mi_col, bsize2, output_enabled); + mi_col, bsize2, dry_run); 
set_offsets_supertx(cpi, td, tile, mi_row, mi_col + hbs, subsize); update_state_supertx(cpi, td, &pc_tree->verticala[2], mi_row, - mi_col + hbs, subsize, output_enabled); + mi_col + hbs, subsize, dry_run); pmc = &pc_tree->verticala_supertx; break; case PARTITION_VERT_B: set_offsets_supertx(cpi, td, tile, mi_row, mi_col, subsize); update_state_supertx(cpi, td, &pc_tree->verticalb[0], mi_row, mi_col, - subsize, output_enabled); + subsize, dry_run); set_offsets_supertx(cpi, td, tile, mi_row, mi_col + hbs, bsize2); update_state_supertx(cpi, td, &pc_tree->verticalb[1], mi_row, - mi_col + hbs, bsize2, output_enabled); + mi_col + hbs, bsize2, dry_run); set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col + hbs, bsize2); update_state_supertx(cpi, td, &pc_tree->verticalb[2], mi_row + hbs, - mi_col + hbs, bsize2, output_enabled); + mi_col + hbs, bsize2, dry_run); pmc = &pc_tree->verticalb_supertx; break; #endif // CONFIG_EXT_PARTITION_TYPES @@ -2109,21 +2110,21 @@ static void save_context(const MACROBLOCK *x, RD_SEARCH_MACROBLOCK_CONTEXT *ctx, } static void encode_b(AV1_COMP *cpi, const TileInfo *const tile, ThreadData *td, - TOKENEXTRA **tp, int mi_row, int mi_col, - int output_enabled, BLOCK_SIZE bsize, + TOKENEXTRA **tp, int mi_row, int mi_col, RUN_TYPE dry_run, + BLOCK_SIZE bsize, #if CONFIG_EXT_PARTITION_TYPES PARTITION_TYPE partition, #endif - PICK_MODE_CONTEXT *ctx) { + PICK_MODE_CONTEXT *ctx, int *rate) { MACROBLOCK *const x = &td->mb; set_offsets(cpi, tile, x, mi_row, mi_col, bsize); #if CONFIG_EXT_PARTITION_TYPES x->e_mbd.mi[0]->mbmi.partition = partition; #endif - update_state(cpi, td, ctx, mi_row, mi_col, bsize, output_enabled); - encode_superblock(cpi, td, tp, output_enabled, mi_row, mi_col, bsize, ctx); + update_state(cpi, td, ctx, mi_row, mi_col, bsize, dry_run); + encode_superblock(cpi, td, tp, dry_run, mi_row, mi_col, bsize, ctx, rate); - if (output_enabled) { + if (!dry_run) { #if CONFIG_SUPERTX update_stats(&cpi->common, td, 0); #else @@ -2133,8 +2134,8 
@@ static void encode_b(AV1_COMP *cpi, const TileInfo *const tile, ThreadData *td, } static void encode_sb(AV1_COMP *cpi, ThreadData *td, const TileInfo *const tile, - TOKENEXTRA **tp, int mi_row, int mi_col, - int output_enabled, BLOCK_SIZE bsize, PC_TREE *pc_tree) { + TOKENEXTRA **tp, int mi_row, int mi_col, RUN_TYPE dry_run, + BLOCK_SIZE bsize, PC_TREE *pc_tree, int *rate) { const AV1_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &td->mb; MACROBLOCKD *const xd = &x->e_mbd; @@ -2151,7 +2152,7 @@ static void encode_sb(AV1_COMP *cpi, ThreadData *td, const TileInfo *const tile, if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; - if (output_enabled) td->counts->partition[ctx][partition]++; + if (!dry_run) td->counts->partition[ctx][partition]++; #if CONFIG_SUPERTX if (!frame_is_intra_only(cm) && bsize <= MAX_SUPERTX_BLOCK_SIZE && @@ -2167,33 +2168,34 @@ static void encode_sb(AV1_COMP *cpi, ThreadData *td, const TileInfo *const tile, int dst_stride[3]; set_skip_context(xd, mi_row, mi_col); set_mode_info_offsets(cpi, x, xd, mi_row, mi_col); - update_state_sb_supertx(cpi, td, tile, mi_row, mi_col, bsize, - output_enabled, pc_tree); + update_state_sb_supertx(cpi, td, tile, mi_row, mi_col, bsize, dry_run, + pc_tree); av1_setup_dst_planes(xd->plane, get_frame_new_buffer(cm), mi_row, mi_col); for (i = 0; i < MAX_MB_PLANE; i++) { dst_buf[i] = xd->plane[i].dst.buf; dst_stride[i] = xd->plane[i].dst.stride; } - predict_sb_complex(cpi, td, tile, mi_row, mi_col, mi_row, mi_col, - output_enabled, bsize, bsize, dst_buf, dst_stride, - pc_tree); + predict_sb_complex(cpi, td, tile, mi_row, mi_col, mi_row, mi_col, dry_run, + bsize, bsize, dst_buf, dst_stride, pc_tree); set_offsets_without_segment_id(cpi, tile, x, mi_row, mi_col, bsize); set_segment_id_supertx(cpi, x, mi_row, mi_col, bsize); if (!x->skip) { + int this_rate = 0; x->skip_optimize = 0; x->use_lp32x32fdct = cpi->sf.use_lp32x32fdct; av1_encode_sb_supertx(x, bsize); - av1_tokenize_sb_supertx(cpi, td, tp, 
!output_enabled, bsize); + av1_tokenize_sb_supertx(cpi, td, tp, dry_run, bsize, rate); + if (rate) *rate += this_rate; } else { xd->mi[0]->mbmi.skip = 1; - if (output_enabled) td->counts->skip[av1_get_skip_context(xd)][1]++; + if (!dry_run) td->counts->skip[av1_get_skip_context(xd)][1]++; reset_skip_context(xd, bsize); } - if (output_enabled) { + if (!dry_run) { for (y_idx = 0; y_idx < mi_height; y_idx++) for (x_idx = 0; x_idx < mi_width; x_idx++) { if ((xd->mb_to_right_edge >> (3 + MI_SIZE_LOG2)) + mi_width > @@ -2234,7 +2236,7 @@ static void encode_sb(AV1_COMP *cpi, ThreadData *td, const TileInfo *const tile, #endif // CONFIG_VAR_TX return; } else { - if (output_enabled) { + if (!dry_run) { td->counts->supertx[partition_supertx_context_lookup[partition]] [supertx_size][0]++; } @@ -2244,93 +2246,91 @@ static void encode_sb(AV1_COMP *cpi, ThreadData *td, const TileInfo *const tile, switch (partition) { case PARTITION_NONE: - encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, subsize, + encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, subsize, #if CONFIG_EXT_PARTITION_TYPES partition, #endif - &pc_tree->none); + &pc_tree->none, rate); break; case PARTITION_VERT: - encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, subsize, + encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, subsize, #if CONFIG_EXT_PARTITION_TYPES partition, #endif - &pc_tree->vertical[0]); + &pc_tree->vertical[0], rate); if (mi_col + hbs < cm->mi_cols && bsize > BLOCK_8X8) { - encode_b(cpi, tile, td, tp, mi_row, mi_col + hbs, output_enabled, - subsize, + encode_b(cpi, tile, td, tp, mi_row, mi_col + hbs, dry_run, subsize, #if CONFIG_EXT_PARTITION_TYPES partition, #endif - &pc_tree->vertical[1]); + &pc_tree->vertical[1], rate); } break; case PARTITION_HORZ: - encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, subsize, + encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, subsize, #if CONFIG_EXT_PARTITION_TYPES partition, #endif - &pc_tree->horizontal[0]); + 
&pc_tree->horizontal[0], rate); if (mi_row + hbs < cm->mi_rows && bsize > BLOCK_8X8) { - encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col, output_enabled, - subsize, + encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col, dry_run, subsize, #if CONFIG_EXT_PARTITION_TYPES partition, #endif - &pc_tree->horizontal[1]); + &pc_tree->horizontal[1], rate); } break; case PARTITION_SPLIT: if (bsize == BLOCK_8X8) { - encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, subsize, + encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, subsize, #if CONFIG_EXT_PARTITION_TYPES partition, #endif - pc_tree->leaf_split[0]); + pc_tree->leaf_split[0], rate); } else { - encode_sb(cpi, td, tile, tp, mi_row, mi_col, output_enabled, subsize, - pc_tree->split[0]); - encode_sb(cpi, td, tile, tp, mi_row, mi_col + hbs, output_enabled, - subsize, pc_tree->split[1]); - encode_sb(cpi, td, tile, tp, mi_row + hbs, mi_col, output_enabled, - subsize, pc_tree->split[2]); - encode_sb(cpi, td, tile, tp, mi_row + hbs, mi_col + hbs, output_enabled, - subsize, pc_tree->split[3]); + encode_sb(cpi, td, tile, tp, mi_row, mi_col, dry_run, subsize, + pc_tree->split[0], rate); + encode_sb(cpi, td, tile, tp, mi_row, mi_col + hbs, dry_run, subsize, + pc_tree->split[1], rate); + encode_sb(cpi, td, tile, tp, mi_row + hbs, mi_col, dry_run, subsize, + pc_tree->split[2], rate); + encode_sb(cpi, td, tile, tp, mi_row + hbs, mi_col + hbs, dry_run, + subsize, pc_tree->split[3], rate); } break; #if CONFIG_EXT_PARTITION_TYPES case PARTITION_HORZ_A: - encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, bsize2, - partition, &pc_tree->horizontala[0]); - encode_b(cpi, tile, td, tp, mi_row, mi_col + hbs, output_enabled, bsize2, - partition, &pc_tree->horizontala[1]); - encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col, output_enabled, subsize, - partition, &pc_tree->horizontala[2]); + encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, bsize2, partition, + &pc_tree->horizontala[0], rate); + encode_b(cpi, tile, td, tp, 
mi_row, mi_col + hbs, dry_run, bsize2, + partition, &pc_tree->horizontala[1], rate); + encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col, dry_run, subsize, + partition, &pc_tree->horizontala[2], rate); break; case PARTITION_HORZ_B: - encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, subsize, - partition, &pc_tree->horizontalb[0]); - encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col, output_enabled, bsize2, - partition, &pc_tree->horizontalb[1]); - encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col + hbs, output_enabled, - bsize2, partition, &pc_tree->horizontalb[2]); + encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, subsize, partition, + &pc_tree->horizontalb[0], rate); + encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col, dry_run, bsize2, + partition, &pc_tree->horizontalb[1], rate); + encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col + hbs, dry_run, bsize2, + partition, &pc_tree->horizontalb[2], rate); break; case PARTITION_VERT_A: - encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, bsize2, - partition, &pc_tree->verticala[0]); - encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col, output_enabled, bsize2, - partition, &pc_tree->verticala[1]); - encode_b(cpi, tile, td, tp, mi_row, mi_col + hbs, output_enabled, subsize, - partition, &pc_tree->verticala[2]); + encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, bsize2, partition, + &pc_tree->verticala[0], rate); + encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col, dry_run, bsize2, + partition, &pc_tree->verticala[1], rate); + encode_b(cpi, tile, td, tp, mi_row, mi_col + hbs, dry_run, subsize, + partition, &pc_tree->verticala[2], rate); break; case PARTITION_VERT_B: - encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, subsize, - partition, &pc_tree->verticalb[0]); - encode_b(cpi, tile, td, tp, mi_row, mi_col + hbs, output_enabled, bsize2, - partition, &pc_tree->verticalb[1]); - encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col + hbs, output_enabled, - bsize2, partition, &pc_tree->verticalb[2]); 
+ encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, subsize, partition, + &pc_tree->verticalb[0], rate); + encode_b(cpi, tile, td, tp, mi_row, mi_col + hbs, dry_run, bsize2, + partition, &pc_tree->verticalb[1], rate); + encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col + hbs, dry_run, bsize2, + partition, &pc_tree->verticalb[2], rate); break; #endif // CONFIG_EXT_PARTITION_TYPES default: assert(0 && "Invalid partition type."); break; @@ -2545,8 +2545,9 @@ static void rd_use_partition(AV1_COMP *cpi, ThreadData *td, #endif PICK_MODE_CONTEXT *ctx = &pc_tree->horizontal[0]; av1_rd_cost_init(&tmp_rdc); - update_state(cpi, td, ctx, mi_row, mi_col, subsize, 0); - encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, ctx); + update_state(cpi, td, ctx, mi_row, mi_col, subsize, 1); + encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row, mi_col, subsize, + ctx, NULL); rd_pick_sb_modes(cpi, tile_data, x, mi_row + hbs, mi_col, &tmp_rdc, #if CONFIG_SUPERTX &rt_nocoef, @@ -2587,8 +2588,9 @@ static void rd_use_partition(AV1_COMP *cpi, ThreadData *td, #endif PICK_MODE_CONTEXT *ctx = &pc_tree->vertical[0]; av1_rd_cost_init(&tmp_rdc); - update_state(cpi, td, ctx, mi_row, mi_col, subsize, 0); - encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, ctx); + update_state(cpi, td, ctx, mi_row, mi_col, subsize, 1); + encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row, mi_col, subsize, + ctx, NULL); rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + hbs, &tmp_rdc, #if CONFIG_SUPERTX &rt_nocoef, @@ -2741,8 +2743,8 @@ static void rd_use_partition(AV1_COMP *cpi, ThreadData *td, #endif if (i != 3) - encode_sb(cpi, td, tile_info, tp, mi_row + y_idx, mi_col + x_idx, 0, - split_subsize, pc_tree->split[i]); + encode_sb(cpi, td, tile_info, tp, mi_row + y_idx, mi_col + x_idx, + OUTPUT_ENABLED, split_subsize, pc_tree->split[i], NULL); chosen_rdc.rate += cpi->partition_cost[pl][PARTITION_NONE]; #if CONFIG_SUPERTX @@ -2785,9 +2787,17 @@ static void rd_use_partition(AV1_COMP *cpi, 
ThreadData *td, assert(chosen_rdc.rate < INT_MAX && chosen_rdc.dist < INT64_MAX); if (do_recon) { - int output_enabled = (bsize == cm->sb_size); - encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, output_enabled, bsize, - pc_tree); + if (bsize == cm->sb_size) { + // NOTE: To get estimate for rate due to the tokens, use: + // int rate_coeffs = 0; + // encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, DRY_RUN_COSTCOEFFS, + // bsize, pc_tree, &rate_coeffs); + encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, OUTPUT_ENABLED, bsize, + pc_tree, NULL); + } else { + encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize, + pc_tree, NULL); + } } *rate = chosen_rdc.rate; @@ -3136,8 +3146,9 @@ static void rd_test_partition3( if (sum_rdc.rdcost < best_rdc->rdcost) { #endif PICK_MODE_CONTEXT *ctx = &ctxs[0]; - update_state(cpi, td, ctx, mi_row0, mi_col0, subsize0, 0); - encode_superblock(cpi, td, tp, 0, mi_row0, mi_col0, subsize0, ctx); + update_state(cpi, td, ctx, mi_row0, mi_col0, subsize0, 1); + encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row0, mi_col0, subsize0, + ctx, NULL); if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx); @@ -3176,8 +3187,9 @@ static void rd_test_partition3( if (sum_rdc.rdcost < best_rdc->rdcost) { #endif PICK_MODE_CONTEXT *ctx = &ctxs[1]; - update_state(cpi, td, ctx, mi_row1, mi_col1, subsize1, 0); - encode_superblock(cpi, td, tp, 0, mi_row1, mi_col1, subsize1, ctx); + update_state(cpi, td, ctx, mi_row1, mi_col1, subsize1, 1); + encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row1, mi_col1, subsize1, + ctx, NULL); if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx); @@ -3788,8 +3800,9 @@ static void rd_pick_partition(AV1_COMP *cpi, ThreadData *td, #endif // CONFIG_SUPERTX mi_row + mi_step < cm->mi_rows && bsize > BLOCK_8X8) { PICK_MODE_CONTEXT *ctx = &pc_tree->horizontal[0]; - update_state(cpi, td, ctx, mi_row, mi_col, subsize, 0); - encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, ctx); + 
update_state(cpi, td, ctx, mi_row, mi_col, subsize, 1); + encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row, mi_col, subsize, + ctx, NULL); if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx); @@ -3924,9 +3937,9 @@ static void rd_pick_partition(AV1_COMP *cpi, ThreadData *td, if (sum_rdc.rdcost < best_rdc.rdcost && #endif // CONFIG_SUPERTX mi_col + mi_step < cm->mi_cols && bsize > BLOCK_8X8) { - update_state(cpi, td, &pc_tree->vertical[0], mi_row, mi_col, subsize, 0); - encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, - &pc_tree->vertical[0]); + update_state(cpi, td, &pc_tree->vertical[0], mi_row, mi_col, subsize, 1); + encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row, mi_col, subsize, + &pc_tree->vertical[0], NULL); if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx); @@ -4099,9 +4112,13 @@ static void rd_pick_partition(AV1_COMP *cpi, ThreadData *td, if (best_rdc.rate < INT_MAX && best_rdc.dist < INT64_MAX && pc_tree->index != 3) { - int output_enabled = (bsize == cm->sb_size); - encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, output_enabled, bsize, - pc_tree); + if (bsize == cm->sb_size) { + encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, OUTPUT_ENABLED, bsize, + pc_tree, NULL); + } else { + encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize, + pc_tree, NULL); + } } if (bsize == cm->sb_size) { @@ -5000,8 +5017,9 @@ static void tx_partition_set_contexts(AV1_COMMON *cm, MACROBLOCKD *xd, #endif static void encode_superblock(AV1_COMP *cpi, ThreadData *td, TOKENEXTRA **t, - int output_enabled, int mi_row, int mi_col, - BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) { + RUN_TYPE dry_run, int mi_row, int mi_col, + BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, + int *rate) { AV1_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &td->mb; MACROBLOCKD *const xd = &x->e_mbd; @@ -5023,12 +5041,12 @@ static void encode_superblock(AV1_COMP *cpi, ThreadData *td, TOKENEXTRA **t, mbmi->skip = 1; for (plane = 0; plane < MAX_MB_PLANE; 
++plane) av1_encode_intra_block_plane(x, AOMMAX(bsize, BLOCK_8X8), plane, 1); - if (output_enabled) + if (!dry_run) sum_intra_stats(td->counts, mi, xd->above_mi, xd->left_mi, frame_is_intra_only(cm)); #if CONFIG_EXT_INTRA - if (output_enabled && bsize >= BLOCK_8X8) { + if (!dry_run && bsize >= BLOCK_8X8) { FRAME_COUNTS *counts = td->counts; if (mbmi->mode == DC_PRED && mbmi->palette_mode_info.palette_size[0] == 0) ++counts->ext_intra[0][mbmi->ext_intra_mode_info.use_ext_intra_mode[0]]; @@ -5046,18 +5064,18 @@ static void encode_superblock(AV1_COMP *cpi, ThreadData *td, TOKENEXTRA **t, } #endif // CONFIG_EXT_INTRA - if (bsize >= BLOCK_8X8 && output_enabled) { + if (bsize >= BLOCK_8X8 && !dry_run) { for (plane = 0; plane <= 1; ++plane) { if (mbmi->palette_mode_info.palette_size[plane] > 0) { mbmi->palette_mode_info.palette_first_color_idx[plane] = xd->plane[plane].color_index_map[0]; // TODO(huisu): this increases the use of token buffer. Needs stretch // test to verify. - av1_tokenize_palette_sb(td, bsize, plane, t); + av1_tokenize_palette_sb(cpi, td, plane, t, dry_run, bsize, rate); } } } - av1_tokenize_sb(cpi, td, t, !output_enabled, AOMMAX(bsize, BLOCK_8X8)); + av1_tokenize_sb(cpi, td, t, dry_run, AOMMAX(bsize, BLOCK_8X8), rate); } else { int ref; const int is_compound = has_second_ref(mbmi); @@ -5129,17 +5147,17 @@ static void encode_superblock(AV1_COMP *cpi, ThreadData *td, TOKENEXTRA **t, #if CONFIG_VAR_TX #if CONFIG_EXT_TX && CONFIG_RECT_TX if (is_rect_tx(mbmi->tx_size)) - av1_tokenize_sb(cpi, td, t, !output_enabled, AOMMAX(bsize, BLOCK_8X8)); + av1_tokenize_sb(cpi, td, t, dry_run, AOMMAX(bsize, BLOCK_8X8), rate); else #endif - av1_tokenize_sb_vartx(cpi, td, t, !output_enabled, mi_row, mi_col, - AOMMAX(bsize, BLOCK_8X8)); + av1_tokenize_sb_vartx(cpi, td, t, dry_run, mi_row, mi_col, + AOMMAX(bsize, BLOCK_8X8), rate); #else - av1_tokenize_sb(cpi, td, t, !output_enabled, AOMMAX(bsize, BLOCK_8X8)); + av1_tokenize_sb(cpi, td, t, dry_run, AOMMAX(bsize, BLOCK_8X8), 
rate); #endif } - if (output_enabled) { + if (!dry_run) { if (cm->tx_mode == TX_MODE_SELECT && mbmi->sb_type >= BLOCK_8X8 && !(is_inter_block(mbmi) && (mbmi->skip || seg_skip))) { const int is_inter = is_inter_block(mbmi); @@ -5222,8 +5240,7 @@ static void encode_superblock(AV1_COMP *cpi, ThreadData *td, TOKENEXTRA **t, #if CONFIG_VAR_TX if (cm->tx_mode == TX_MODE_SELECT && mbmi->sb_type >= BLOCK_8X8 && is_inter_block(mbmi) && !(mbmi->skip || seg_skip)) { - if (!output_enabled) - tx_partition_set_contexts(cm, xd, bsize, mi_row, mi_col); + if (dry_run) tx_partition_set_contexts(cm, xd, bsize, mi_row, mi_col); #if CONFIG_EXT_TX && CONFIG_RECT_TX if (is_rect_tx(mbmi->tx_size)) { set_txfm_ctxs(mbmi->tx_size, xd->n8_w, xd->n8_h, xd); @@ -5419,7 +5436,7 @@ static void predict_b_extend(AV1_COMP *cpi, ThreadData *td, int mi_col_pred, int mi_row_top, int mi_col_top, uint8_t *dst_buf[3], int dst_stride[3], BLOCK_SIZE bsize_top, BLOCK_SIZE bsize_pred, - int output_enabled, int b_sub8x8, int bextend) { + RUN_TYPE dry_run, int b_sub8x8, int bextend) { // Used in supertx // (mi_row_ori, mi_col_ori): location for mv // (mi_row_pred, mi_col_pred, bsize_pred): region to predict @@ -5463,13 +5480,13 @@ static void predict_b_extend(AV1_COMP *cpi, ThreadData *td, #endif // CONFIG_EXT_INTER mi_row_pred, mi_col_pred, bsize_pred, b_sub8x8, block); - if (output_enabled && !bextend) update_stats(&cpi->common, td, 1); + if (!dry_run && !bextend) update_stats(&cpi->common, td, 1); } static void extend_dir(AV1_COMP *cpi, ThreadData *td, const TileInfo *const tile, int block, BLOCK_SIZE bsize, BLOCK_SIZE top_bsize, int mi_row, int mi_col, - int mi_row_top, int mi_col_top, int output_enabled, + int mi_row_top, int mi_col_top, RUN_TYPE dry_run, uint8_t *dst_buf[3], int dst_stride[3], int dir) { // dir: 0-lower, 1-upper, 2-left, 3-right // 4-lowerleft, 5-upperleft, 6-lowerright, 7-upperright @@ -5493,7 +5510,7 @@ static void extend_dir(AV1_COMP *cpi, ThreadData *td, predict_b_extend(cpi, td, 
tile, block, mi_row, mi_col, mi_row_pred, mi_col_pred, mi_row_top, mi_col_top, dst_buf, dst_stride, - top_bsize, extend_bsize, output_enabled, b_sub8x8, 1); + top_bsize, extend_bsize, dry_run, b_sub8x8, 1); if (mi_width > unit) { int i; @@ -5501,8 +5518,8 @@ static void extend_dir(AV1_COMP *cpi, ThreadData *td, mi_col_pred += unit; predict_b_extend(cpi, td, tile, block, mi_row, mi_col, mi_row_pred, mi_col_pred, mi_row_top, mi_col_top, dst_buf, - dst_stride, top_bsize, extend_bsize, output_enabled, - b_sub8x8, 1); + dst_stride, top_bsize, extend_bsize, dry_run, b_sub8x8, + 1); } } } else if (dir == 2 || dir == 3) { // left and right @@ -5515,7 +5532,7 @@ static void extend_dir(AV1_COMP *cpi, ThreadData *td, predict_b_extend(cpi, td, tile, block, mi_row, mi_col, mi_row_pred, mi_col_pred, mi_row_top, mi_col_top, dst_buf, dst_stride, - top_bsize, extend_bsize, output_enabled, b_sub8x8, 1); + top_bsize, extend_bsize, dry_run, b_sub8x8, 1); if (mi_height > unit) { int i; @@ -5523,8 +5540,8 @@ static void extend_dir(AV1_COMP *cpi, ThreadData *td, mi_row_pred += unit; predict_b_extend(cpi, td, tile, block, mi_row, mi_col, mi_row_pred, mi_col_pred, mi_row_top, mi_col_top, dst_buf, - dst_stride, top_bsize, extend_bsize, output_enabled, - b_sub8x8, 1); + dst_stride, top_bsize, extend_bsize, dry_run, b_sub8x8, + 1); } } } else { @@ -5534,32 +5551,32 @@ static void extend_dir(AV1_COMP *cpi, ThreadData *td, predict_b_extend(cpi, td, tile, block, mi_row, mi_col, mi_row_pred, mi_col_pred, mi_row_top, mi_col_top, dst_buf, dst_stride, - top_bsize, extend_bsize, output_enabled, b_sub8x8, 1); + top_bsize, extend_bsize, dry_run, b_sub8x8, 1); } } static void extend_all(AV1_COMP *cpi, ThreadData *td, const TileInfo *const tile, int block, BLOCK_SIZE bsize, BLOCK_SIZE top_bsize, int mi_row, int mi_col, - int mi_row_top, int mi_col_top, int output_enabled, + int mi_row_top, int mi_col_top, RUN_TYPE dry_run, uint8_t *dst_buf[3], int dst_stride[3]) { assert(block >= 0 && block < 4); 
extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row, mi_col, mi_row_top, - mi_col_top, output_enabled, dst_buf, dst_stride, 0); + mi_col_top, dry_run, dst_buf, dst_stride, 0); extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row, mi_col, mi_row_top, - mi_col_top, output_enabled, dst_buf, dst_stride, 1); + mi_col_top, dry_run, dst_buf, dst_stride, 1); extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row, mi_col, mi_row_top, - mi_col_top, output_enabled, dst_buf, dst_stride, 2); + mi_col_top, dry_run, dst_buf, dst_stride, 2); extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row, mi_col, mi_row_top, - mi_col_top, output_enabled, dst_buf, dst_stride, 3); + mi_col_top, dry_run, dst_buf, dst_stride, 3); extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row, mi_col, mi_row_top, - mi_col_top, output_enabled, dst_buf, dst_stride, 4); + mi_col_top, dry_run, dst_buf, dst_stride, 4); extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row, mi_col, mi_row_top, - mi_col_top, output_enabled, dst_buf, dst_stride, 5); + mi_col_top, dry_run, dst_buf, dst_stride, 5); extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row, mi_col, mi_row_top, - mi_col_top, output_enabled, dst_buf, dst_stride, 6); + mi_col_top, dry_run, dst_buf, dst_stride, 6); extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row, mi_col, mi_row_top, - mi_col_top, output_enabled, dst_buf, dst_stride, 7); + mi_col_top, dry_run, dst_buf, dst_stride, 7); } // This function generates prediction for multiple blocks, between which @@ -5573,7 +5590,7 @@ static void extend_all(AV1_COMP *cpi, ThreadData *td, static void predict_sb_complex(AV1_COMP *cpi, ThreadData *td, const TileInfo *const tile, int mi_row, int mi_col, int mi_row_top, int mi_col_top, - int output_enabled, BLOCK_SIZE bsize, + RUN_TYPE dry_run, BLOCK_SIZE bsize, BLOCK_SIZE top_bsize, uint8_t *dst_buf[3], int dst_stride[3], PC_TREE *pc_tree) { AV1_COMMON *const cm = &cpi->common; @@ -5628,8 +5645,7 @@ static void 
predict_sb_complex(AV1_COMP *cpi, ThreadData *td, } #endif // CONFIG_AOM_HIGHBITDEPTH - if (output_enabled && bsize < top_bsize) - cm->counts.partition[ctx][partition]++; + if (!dry_run && bsize < top_bsize) cm->counts.partition[ctx][partition]++; for (i = 0; i < MAX_MB_PLANE; i++) { xd->plane[i].dst.buf = dst_buf[i]; @@ -5641,29 +5657,27 @@ static void predict_sb_complex(AV1_COMP *cpi, ThreadData *td, assert(bsize < top_bsize); predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col, mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize, - bsize, output_enabled, 0, 0); + bsize, dry_run, 0, 0); extend_all(cpi, td, tile, 0, bsize, top_bsize, mi_row, mi_col, mi_row_top, - mi_col_top, output_enabled, dst_buf, dst_stride); + mi_col_top, dry_run, dst_buf, dst_stride); break; case PARTITION_HORZ: if (bsize == BLOCK_8X8) { // Fisrt half predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col, mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize, - BLOCK_8X8, output_enabled, 1, 0); + BLOCK_8X8, dry_run, 1, 0); if (bsize < top_bsize) extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col, - mi_row_top, mi_col_top, output_enabled, dst_buf, - dst_stride); + mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride); // Second half predict_b_extend(cpi, td, tile, 2, mi_row, mi_col, mi_row, mi_col, mi_row_top, mi_col_top, dst_buf1, dst_stride1, - top_bsize, BLOCK_8X8, output_enabled, 1, 1); + top_bsize, BLOCK_8X8, dry_run, 1, 1); if (bsize < top_bsize) extend_all(cpi, td, tile, 2, subsize, top_bsize, mi_row, mi_col, - mi_row_top, mi_col_top, output_enabled, dst_buf1, - dst_stride1); + mi_row_top, mi_col_top, dry_run, dst_buf1, dst_stride1); // Smooth xd->plane[0].dst.buf = dst_buf[0]; @@ -5676,29 +5690,26 @@ static void predict_sb_complex(AV1_COMP *cpi, ThreadData *td, // First half predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col, mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize, - subsize, output_enabled, 0, 0); + subsize, 
dry_run, 0, 0); if (bsize < top_bsize) extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col, - mi_row_top, mi_col_top, output_enabled, dst_buf, - dst_stride); + mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride); else extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col, - mi_row_top, mi_col_top, output_enabled, dst_buf, - dst_stride, 0); + mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride, 0); if (mi_row + hbs < cm->mi_rows) { // Second half predict_b_extend(cpi, td, tile, 0, mi_row + hbs, mi_col, mi_row + hbs, mi_col, mi_row_top, mi_col_top, dst_buf1, - dst_stride1, top_bsize, subsize, output_enabled, 0, - 0); + dst_stride1, top_bsize, subsize, dry_run, 0, 0); if (bsize < top_bsize) extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row + hbs, - mi_col, mi_row_top, mi_col_top, output_enabled, dst_buf1, + mi_col, mi_row_top, mi_col_top, dry_run, dst_buf1, dst_stride1); else extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row + hbs, - mi_col, mi_row_top, mi_col_top, output_enabled, dst_buf1, + mi_col, mi_row_top, mi_col_top, dry_run, dst_buf1, dst_stride1, 1); // Smooth @@ -5718,20 +5729,18 @@ static void predict_sb_complex(AV1_COMP *cpi, ThreadData *td, // First half predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col, mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize, - BLOCK_8X8, output_enabled, 1, 0); + BLOCK_8X8, dry_run, 1, 0); if (bsize < top_bsize) extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col, - mi_row_top, mi_col_top, output_enabled, dst_buf, - dst_stride); + mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride); // Second half predict_b_extend(cpi, td, tile, 1, mi_row, mi_col, mi_row, mi_col, mi_row_top, mi_col_top, dst_buf1, dst_stride1, - top_bsize, BLOCK_8X8, output_enabled, 1, 1); + top_bsize, BLOCK_8X8, dry_run, 1, 1); if (bsize < top_bsize) extend_all(cpi, td, tile, 1, subsize, top_bsize, mi_row, mi_col, - mi_row_top, mi_col_top, output_enabled, dst_buf1, - dst_stride1); + 
mi_row_top, mi_col_top, dry_run, dst_buf1, dst_stride1); // Smooth xd->plane[0].dst.buf = dst_buf[0]; @@ -5744,29 +5753,26 @@ static void predict_sb_complex(AV1_COMP *cpi, ThreadData *td, // bsize: not important, not useful predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col, mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize, - subsize, output_enabled, 0, 0); + subsize, dry_run, 0, 0); if (bsize < top_bsize) extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col, - mi_row_top, mi_col_top, output_enabled, dst_buf, - dst_stride); + mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride); else extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col, - mi_row_top, mi_col_top, output_enabled, dst_buf, - dst_stride, 3); + mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride, 3); if (mi_col + hbs < cm->mi_cols) { predict_b_extend(cpi, td, tile, 0, mi_row, mi_col + hbs, mi_row, mi_col + hbs, mi_row_top, mi_col_top, dst_buf1, - dst_stride1, top_bsize, subsize, output_enabled, 0, - 0); + dst_stride1, top_bsize, subsize, dry_run, 0, 0); if (bsize < top_bsize) extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, - mi_col + hbs, mi_row_top, mi_col_top, output_enabled, - dst_buf1, dst_stride1); + mi_col + hbs, mi_row_top, mi_col_top, dry_run, dst_buf1, + dst_stride1); else extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row, - mi_col + hbs, mi_row_top, mi_col_top, output_enabled, - dst_buf1, dst_stride1, 2); + mi_col + hbs, mi_row_top, mi_col_top, dry_run, dst_buf1, + dst_stride1, 2); for (i = 0; i < MAX_MB_PLANE; i++) { xd->plane[i].dst.buf = dst_buf[i]; @@ -5783,46 +5789,42 @@ static void predict_sb_complex(AV1_COMP *cpi, ThreadData *td, if (bsize == BLOCK_8X8) { predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col, mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize, - BLOCK_8X8, output_enabled, 1, 0); + BLOCK_8X8, dry_run, 1, 0); predict_b_extend(cpi, td, tile, 1, mi_row, mi_col, mi_row, mi_col, mi_row_top, 
mi_col_top, dst_buf1, dst_stride1, - top_bsize, BLOCK_8X8, output_enabled, 1, 1); + top_bsize, BLOCK_8X8, dry_run, 1, 1); predict_b_extend(cpi, td, tile, 2, mi_row, mi_col, mi_row, mi_col, mi_row_top, mi_col_top, dst_buf2, dst_stride2, - top_bsize, BLOCK_8X8, output_enabled, 1, 1); + top_bsize, BLOCK_8X8, dry_run, 1, 1); predict_b_extend(cpi, td, tile, 3, mi_row, mi_col, mi_row, mi_col, mi_row_top, mi_col_top, dst_buf3, dst_stride3, - top_bsize, BLOCK_8X8, output_enabled, 1, 1); + top_bsize, BLOCK_8X8, dry_run, 1, 1); if (bsize < top_bsize) { extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col, - mi_row_top, mi_col_top, output_enabled, dst_buf, - dst_stride); + mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride); extend_all(cpi, td, tile, 1, subsize, top_bsize, mi_row, mi_col, - mi_row_top, mi_col_top, output_enabled, dst_buf1, - dst_stride1); + mi_row_top, mi_col_top, dry_run, dst_buf1, dst_stride1); extend_all(cpi, td, tile, 2, subsize, top_bsize, mi_row, mi_col, - mi_row_top, mi_col_top, output_enabled, dst_buf2, - dst_stride2); + mi_row_top, mi_col_top, dry_run, dst_buf2, dst_stride2); extend_all(cpi, td, tile, 3, subsize, top_bsize, mi_row, mi_col, - mi_row_top, mi_col_top, output_enabled, dst_buf3, - dst_stride3); + mi_row_top, mi_col_top, dry_run, dst_buf3, dst_stride3); } } else { predict_sb_complex(cpi, td, tile, mi_row, mi_col, mi_row_top, - mi_col_top, output_enabled, subsize, top_bsize, - dst_buf, dst_stride, pc_tree->split[0]); + mi_col_top, dry_run, subsize, top_bsize, dst_buf, + dst_stride, pc_tree->split[0]); if (mi_row < cm->mi_rows && mi_col + hbs < cm->mi_cols) predict_sb_complex(cpi, td, tile, mi_row, mi_col + hbs, mi_row_top, - mi_col_top, output_enabled, subsize, top_bsize, - dst_buf1, dst_stride1, pc_tree->split[1]); + mi_col_top, dry_run, subsize, top_bsize, dst_buf1, + dst_stride1, pc_tree->split[1]); if (mi_row + hbs < cm->mi_rows && mi_col < cm->mi_cols) predict_sb_complex(cpi, td, tile, mi_row + hbs, mi_col, mi_row_top, - 
mi_col_top, output_enabled, subsize, top_bsize, - dst_buf2, dst_stride2, pc_tree->split[2]); + mi_col_top, dry_run, subsize, top_bsize, dst_buf2, + dst_stride2, pc_tree->split[2]); if (mi_row + hbs < cm->mi_rows && mi_col + hbs < cm->mi_cols) predict_sb_complex(cpi, td, tile, mi_row + hbs, mi_col + hbs, - mi_row_top, mi_col_top, output_enabled, subsize, + mi_row_top, mi_col_top, dry_run, subsize, top_bsize, dst_buf3, dst_stride3, pc_tree->split[3]); } @@ -5856,27 +5858,25 @@ static void predict_sb_complex(AV1_COMP *cpi, ThreadData *td, case PARTITION_HORZ_A: predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col, mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize, - bsize2, output_enabled, 0, 0); + bsize2, dry_run, 0, 0); extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row, mi_col, - mi_row_top, mi_col_top, output_enabled, dst_buf, dst_stride); + mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride); predict_b_extend(cpi, td, tile, 0, mi_row, mi_col + hbs, mi_row, mi_col + hbs, mi_row_top, mi_col_top, dst_buf1, - dst_stride1, top_bsize, bsize2, output_enabled, 0, 0); + dst_stride1, top_bsize, bsize2, dry_run, 0, 0); extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row, mi_col + hbs, - mi_row_top, mi_col_top, output_enabled, dst_buf1, dst_stride1); + mi_row_top, mi_col_top, dry_run, dst_buf1, dst_stride1); predict_b_extend(cpi, td, tile, 0, mi_row + hbs, mi_col, mi_row + hbs, mi_col, mi_row_top, mi_col_top, dst_buf2, dst_stride2, - top_bsize, subsize, output_enabled, 0, 0); + top_bsize, subsize, dry_run, 0, 0); if (bsize < top_bsize) extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row + hbs, mi_col, - mi_row_top, mi_col_top, output_enabled, dst_buf2, - dst_stride2); + mi_row_top, mi_col_top, dry_run, dst_buf2, dst_stride2); else extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row + hbs, mi_col, - mi_row_top, mi_col_top, output_enabled, dst_buf2, - dst_stride2, 1); + mi_row_top, mi_col_top, dry_run, dst_buf2, dst_stride2, 1); for (i = 0; i 
< MAX_MB_PLANE; i++) { xd->plane[i].dst.buf = dst_buf[i]; @@ -5898,27 +5898,25 @@ static void predict_sb_complex(AV1_COMP *cpi, ThreadData *td, predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col, mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize, - bsize2, output_enabled, 0, 0); + bsize2, dry_run, 0, 0); extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row, mi_col, - mi_row_top, mi_col_top, output_enabled, dst_buf, dst_stride); + mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride); predict_b_extend(cpi, td, tile, 0, mi_row + hbs, mi_col, mi_row + hbs, mi_col, mi_row_top, mi_col_top, dst_buf1, dst_stride1, - top_bsize, bsize2, output_enabled, 0, 0); + top_bsize, bsize2, dry_run, 0, 0); extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row + hbs, mi_col, - mi_row_top, mi_col_top, output_enabled, dst_buf1, dst_stride1); + mi_row_top, mi_col_top, dry_run, dst_buf1, dst_stride1); predict_b_extend(cpi, td, tile, 0, mi_row, mi_col + hbs, mi_row, mi_col + hbs, mi_row_top, mi_col_top, dst_buf2, - dst_stride2, top_bsize, subsize, output_enabled, 0, 0); + dst_stride2, top_bsize, subsize, dry_run, 0, 0); if (bsize < top_bsize) extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col + hbs, - mi_row_top, mi_col_top, output_enabled, dst_buf2, - dst_stride2); + mi_row_top, mi_col_top, dry_run, dst_buf2, dst_stride2); else extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col + hbs, - mi_row_top, mi_col_top, output_enabled, dst_buf2, - dst_stride2, 2); + mi_row_top, mi_col_top, dry_run, dst_buf2, dst_stride2, 2); for (i = 0; i < MAX_MB_PLANE; i++) { xd->plane[i].dst.buf = dst_buf[i]; @@ -5939,27 +5937,25 @@ static void predict_sb_complex(AV1_COMP *cpi, ThreadData *td, predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col, mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize, - subsize, output_enabled, 0, 0); + subsize, dry_run, 0, 0); if (bsize < top_bsize) extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col, - 
mi_row_top, mi_col_top, output_enabled, dst_buf, dst_stride); + mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride); else extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col, - mi_row_top, mi_col_top, output_enabled, dst_buf, dst_stride, - 0); + mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride, 0); predict_b_extend(cpi, td, tile, 0, mi_row + hbs, mi_col, mi_row + hbs, mi_col, mi_row_top, mi_col_top, dst_buf1, dst_stride1, - top_bsize, bsize2, output_enabled, 0, 0); + top_bsize, bsize2, dry_run, 0, 0); extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row + hbs, mi_col, - mi_row_top, mi_col_top, output_enabled, dst_buf1, dst_stride1); + mi_row_top, mi_col_top, dry_run, dst_buf1, dst_stride1); predict_b_extend(cpi, td, tile, 0, mi_row + hbs, mi_col + hbs, mi_row + hbs, mi_col + hbs, mi_row_top, mi_col_top, - dst_buf2, dst_stride2, top_bsize, bsize2, output_enabled, - 0, 0); + dst_buf2, dst_stride2, top_bsize, bsize2, dry_run, 0, 0); extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row + hbs, - mi_col + hbs, mi_row_top, mi_col_top, output_enabled, dst_buf2, + mi_col + hbs, mi_row_top, mi_col_top, dry_run, dst_buf2, dst_stride2); for (i = 0; i < MAX_MB_PLANE; i++) { @@ -5983,27 +5979,25 @@ static void predict_sb_complex(AV1_COMP *cpi, ThreadData *td, predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col, mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize, - subsize, output_enabled, 0, 0); + subsize, dry_run, 0, 0); if (bsize < top_bsize) extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col, - mi_row_top, mi_col_top, output_enabled, dst_buf, dst_stride); + mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride); else extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col, - mi_row_top, mi_col_top, output_enabled, dst_buf, dst_stride, - 3); + mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride, 3); predict_b_extend(cpi, td, tile, 0, mi_row, mi_col + hbs, mi_row, mi_col + hbs, mi_row_top, mi_col_top, dst_buf1, - 
dst_stride1, top_bsize, bsize2, output_enabled, 0, 0); + dst_stride1, top_bsize, bsize2, dry_run, 0, 0); extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row, mi_col + hbs, - mi_row_top, mi_col_top, output_enabled, dst_buf1, dst_stride1); + mi_row_top, mi_col_top, dry_run, dst_buf1, dst_stride1); predict_b_extend(cpi, td, tile, 0, mi_row + hbs, mi_col + hbs, mi_row + hbs, mi_col + hbs, mi_row_top, mi_col_top, - dst_buf2, dst_stride2, top_bsize, bsize2, output_enabled, - 0, 0); + dst_buf2, dst_stride2, top_bsize, bsize2, dry_run, 0, 0); extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row + hbs, - mi_col + hbs, mi_row_top, mi_col_top, output_enabled, dst_buf2, + mi_col + hbs, mi_row_top, mi_col_top, dry_run, dst_buf2, dst_stride2); for (i = 0; i < MAX_MB_PLANE; i++) { @@ -6059,13 +6053,13 @@ static void rd_supertx_sb(AV1_COMP *cpi, ThreadData *td, set_skip_context(xd, mi_row, mi_col); set_mode_info_offsets(cpi, x, xd, mi_row, mi_col); - update_state_sb_supertx(cpi, td, tile, mi_row, mi_col, bsize, 0, pc_tree); + update_state_sb_supertx(cpi, td, tile, mi_row, mi_col, bsize, 1, pc_tree); av1_setup_dst_planes(xd->plane, get_frame_new_buffer(cm), mi_row, mi_col); for (plane = 0; plane < MAX_MB_PLANE; plane++) { dst_buf[plane] = xd->plane[plane].dst.buf; dst_stride[plane] = xd->plane[plane].dst.stride; } - predict_sb_complex(cpi, td, tile, mi_row, mi_col, mi_row, mi_col, 0, bsize, + predict_sb_complex(cpi, td, tile, mi_row, mi_col, mi_row, mi_col, 1, bsize, bsize, dst_buf, dst_stride, pc_tree); set_offsets_without_segment_id(cpi, tile, x, mi_row, mi_col, bsize); diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c index 86f0c8d81..8be7310cc 100644 --- a/av1/encoder/rdopt.c +++ b/av1/encoder/rdopt.c @@ -46,6 +46,7 @@ #include "av1/encoder/ratectrl.h" #include "av1/encoder/rd.h" #include "av1/encoder/rdopt.h" +#include "av1/encoder/tokenize.h" #if CONFIG_DUAL_FILTER #if CONFIG_EXT_INTERP @@ -865,14 +866,14 @@ int64_t av1_highbd_block_error_c(const tran_low_t *coeff, 
} #endif // CONFIG_AOM_HIGHBITDEPTH -/* The trailing '0' is a terminator which is used inside cost_coeffs() to +/* The trailing '0' is a terminator which is used inside av1_cost_coeffs() to * decide whether to include cost of a trailing EOB node or not (i.e. we * can skip this if the last coefficient in this transform block, e.g. the * 16th coefficient in a 4x4 block or the 64th coefficient in a 8x8 block, * were non-zero). */ -static int cost_coeffs(MACROBLOCK *x, int plane, int block, int coeff_ctx, - TX_SIZE tx_size, const int16_t *scan, const int16_t *nb, - int use_fast_coef_costing) { +int av1_cost_coeffs(MACROBLOCK *x, int plane, int block, int coeff_ctx, + TX_SIZE tx_size, const int16_t *scan, const int16_t *nb, + int use_fast_coef_costing) { MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; const struct macroblock_plane *p = &x->plane[plane]; @@ -1064,8 +1065,9 @@ static void dist_block(const AV1_COMP *cpi, MACROBLOCK *x, int plane, int block, static int rate_block(int plane, int block, int coeff_ctx, TX_SIZE tx_size, struct rdcost_block_args *args) { - return cost_coeffs(args->x, plane, block, coeff_ctx, tx_size, args->so->scan, - args->so->neighbors, args->use_fast_coef_costing); + return av1_cost_coeffs(args->x, plane, block, coeff_ctx, tx_size, + args->so->scan, args->so->neighbors, + args->use_fast_coef_costing); } static uint64_t sum_squares_2d(const int16_t *diff, int diff_stride, @@ -1946,8 +1948,9 @@ static int64_t rd_pick_intra4x4block(AV1_COMP *cpi, MACROBLOCK *x, int row, av1_xform_quant(x, 0, block, row + idy, col + idx, BLOCK_8X8, TX_4X4, AV1_XFORM_QUANT_FP); #endif // CONFIG_NEW_QUANT - ratey += cost_coeffs(x, 0, block, coeff_ctx, TX_4X4, so->scan, - so->neighbors, cpi->sf.use_fast_coef_costing); + ratey += + av1_cost_coeffs(x, 0, block, coeff_ctx, TX_4X4, so->scan, + so->neighbors, cpi->sf.use_fast_coef_costing); *(tempa + idx) = !(p->eobs[block] == 0); *(templ + idy) = !(p->eobs[block] == 0); can_skip &= 
(p->eobs[block] == 0); @@ -1971,8 +1974,9 @@ static int64_t rd_pick_intra4x4block(AV1_COMP *cpi, MACROBLOCK *x, int row, TX_4X4, AV1_XFORM_QUANT_FP); #endif // CONFIG_NEW_QUANT av1_optimize_b(x, 0, block, TX_4X4, coeff_ctx); - ratey += cost_coeffs(x, 0, block, coeff_ctx, TX_4X4, so->scan, - so->neighbors, cpi->sf.use_fast_coef_costing); + ratey += + av1_cost_coeffs(x, 0, block, coeff_ctx, TX_4X4, so->scan, + so->neighbors, cpi->sf.use_fast_coef_costing); *(tempa + idx) = !(p->eobs[block] == 0); *(templ + idy) = !(p->eobs[block] == 0); can_skip &= (p->eobs[block] == 0); @@ -2064,8 +2068,9 @@ static int64_t rd_pick_intra4x4block(AV1_COMP *cpi, MACROBLOCK *x, int row, av1_xform_quant(x, 0, block, row + idy, col + idx, BLOCK_8X8, TX_4X4, AV1_XFORM_QUANT_B); #endif // CONFIG_NEW_QUANT - ratey += cost_coeffs(x, 0, block, coeff_ctx, TX_4X4, so->scan, - so->neighbors, cpi->sf.use_fast_coef_costing); + ratey += + av1_cost_coeffs(x, 0, block, coeff_ctx, TX_4X4, so->scan, + so->neighbors, cpi->sf.use_fast_coef_costing); *(tempa + idx) = !(p->eobs[block] == 0); *(templ + idy) = !(p->eobs[block] == 0); can_skip &= (p->eobs[block] == 0); @@ -2088,8 +2093,9 @@ static int64_t rd_pick_intra4x4block(AV1_COMP *cpi, MACROBLOCK *x, int row, AV1_XFORM_QUANT_FP); #endif // CONFIG_NEW_QUANT av1_optimize_b(x, 0, block, TX_4X4, coeff_ctx); - ratey += cost_coeffs(x, 0, block, coeff_ctx, TX_4X4, so->scan, - so->neighbors, cpi->sf.use_fast_coef_costing); + ratey += + av1_cost_coeffs(x, 0, block, coeff_ctx, TX_4X4, so->scan, + so->neighbors, cpi->sf.use_fast_coef_costing); *(tempa + idx) = !(p->eobs[block] == 0); *(templ + idy) = !(p->eobs[block] == 0); can_skip &= (p->eobs[block] == 0); @@ -2964,8 +2970,8 @@ void av1_tx_block_rd_b(const AV1_COMP *cpi, MACROBLOCK *x, TX_SIZE tx_size, } } *dist += tmp * 16; - *rate += cost_coeffs(x, plane, block, coeff_ctx, tx_size, scan_order->scan, - scan_order->neighbors, 0); + *rate += av1_cost_coeffs(x, plane, block, coeff_ctx, tx_size, + scan_order->scan, 
scan_order->neighbors, 0); *skip &= (p->eobs[block] == 0); } @@ -4374,8 +4380,8 @@ static int64_t encode_inter_mb_segment(AV1_COMP *cpi, MACROBLOCK *x, &dist, &ssz); thisdistortion += dist; thissse += ssz; - thisrate += cost_coeffs(x, 0, block, coeff_ctx, tx_size, so->scan, - so->neighbors, cpi->sf.use_fast_coef_costing); + thisrate += av1_cost_coeffs(x, 0, block, coeff_ctx, tx_size, so->scan, + so->neighbors, cpi->sf.use_fast_coef_costing); *(ta + (k & 1)) = !(p->eobs[block] == 0); *(tl + (k >> 1)) = !(p->eobs[block] == 0); #if CONFIG_EXT_TX diff --git a/av1/encoder/rdopt.h b/av1/encoder/rdopt.h index eb0ff9f5c..584c43913 100644 --- a/av1/encoder/rdopt.h +++ b/av1/encoder/rdopt.h @@ -26,6 +26,9 @@ struct AV1_COMP; struct macroblock; struct RD_COST; +int av1_cost_coeffs(MACROBLOCK *x, int plane, int block, int coeff_ctx, + TX_SIZE tx_size, const int16_t *scan, const int16_t *nb, + int use_fast_coef_costing); void av1_rd_pick_intra_mode_sb(struct AV1_COMP *cpi, struct macroblock *x, struct RD_COST *rd_cost, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, int64_t best_rd); diff --git a/av1/encoder/tokenize.c b/av1/encoder/tokenize.c index 3bf24101a..d65960756 100644 --- a/av1/encoder/tokenize.c +++ b/av1/encoder/tokenize.c @@ -23,6 +23,7 @@ #include "av1/encoder/cost.h" #include "av1/encoder/encoder.h" +#include "av1/encoder/rdopt.h" #include "av1/encoder/tokenize.h" static const TOKENVALUE dct_cat_lt_10_value_tokens[] = { @@ -346,8 +347,31 @@ struct tokenize_b_args { AV1_COMP *cpi; ThreadData *td; TOKENEXTRA **tp; + int this_rate; }; +static void cost_coeffs_b(int plane, int block, int blk_row, int blk_col, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg) { + struct tokenize_b_args *const args = arg; + ThreadData *const td = args->td; + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; + struct macroblock_plane *p = &x->plane[plane]; + struct macroblockd_plane *pd = &xd->plane[plane]; + const PLANE_TYPE type 
= pd->plane_type; + const int ref = is_inter_block(mbmi); + const TX_TYPE tx_type = get_tx_type(type, xd, block, tx_size); + const scan_order *const so = get_scan(tx_size, tx_type, ref); + int pt = get_entropy_context(tx_size, pd->above_context + blk_col, + pd->left_context + blk_row); + int rate = + av1_cost_coeffs(x, plane, block, pt, tx_size, so->scan, so->neighbors, 0); + args->this_rate += rate; + av1_set_contexts(xd, pd, plane_bsize, tx_size, p->eobs[block] > 0, blk_col, + blk_row); +} + static void set_entropy_context_b(int plane, int block, int blk_row, int blk_col, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg) { @@ -395,8 +419,9 @@ static INLINE int get_tx_eob(const struct segmentation *seg, int segment_id, return segfeature_active(seg, segment_id, SEG_LVL_SKIP) ? 0 : eob_max; } -void av1_tokenize_palette_sb(struct ThreadData *const td, BLOCK_SIZE bsize, - int plane, TOKENEXTRA **t) { +void av1_tokenize_palette_sb(AV1_COMP *cpi, struct ThreadData *const td, + int plane, TOKENEXTRA **t, RUN_TYPE dry_run, + BLOCK_SIZE bsize, int *rate) { MACROBLOCK *const x = &td->mb; MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; @@ -404,7 +429,8 @@ void av1_tokenize_palette_sb(struct ThreadData *const td, BLOCK_SIZE bsize, PALETTE_MODE_INFO *pmi = &mbmi->palette_mode_info; int n = pmi->palette_size[plane != 0]; int i, j, k; - int color_new_idx = -1, color_ctx, color_order[PALETTE_MAX_SIZE]; + int this_rate = 0; + int color_idx = -1, color_ctx, color_order[PALETTE_MAX_SIZE]; const int rows = (4 * num_4x4_blocks_high_lookup[bsize]) >> (xd->plane[plane != 0].subsampling_y); const int cols = (4 * num_4x4_blocks_wide_lookup[bsize]) >> @@ -419,16 +445,19 @@ void av1_tokenize_palette_sb(struct ThreadData *const td, BLOCK_SIZE bsize, av1_get_palette_color_context(color_map, cols, i, j, n, color_order); for (k = 0; k < n; ++k) if (color_map[i * cols + j] == color_order[k]) { - color_new_idx = k; + color_idx = k; break; } - assert(color_new_idx >= 0 
&& color_new_idx < n); - (*t)->token = color_new_idx; + assert(color_idx >= 0 && color_idx < n); + if (dry_run == DRY_RUN_COSTCOEFFS) + this_rate += cpi->palette_y_color_cost[n - 2][color_ctx][color_idx]; + (*t)->token = color_idx; (*t)->context_tree = probs[n - 2][color_ctx]; (*t)->skip_eob_node = 0; ++(*t); } } + if (rate) *rate += this_rate; } static void tokenize_b(int plane, int block, int blk_row, int blk_col, @@ -560,7 +589,7 @@ int av1_has_high_freq_in_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) { } #if CONFIG_VAR_TX -void tokenize_vartx(ThreadData *td, TOKENEXTRA **t, int dry_run, +void tokenize_vartx(ThreadData *td, TOKENEXTRA **t, RUN_TYPE dry_run, TX_SIZE tx_size, BLOCK_SIZE plane_bsize, int blk_row, int blk_col, int block, int plane, void *arg) { MACROBLOCK *const x = &td->mb; @@ -593,9 +622,11 @@ void tokenize_vartx(ThreadData *td, TOKENEXTRA **t, int dry_run, BLOCK_SIZE plane_bsize = get_plane_block_size(mbmi->sb_type, pd); if (!dry_run) tokenize_b(plane, block, blk_row, blk_col, plane_bsize, tx_size, arg); - else + else if (dry_run == DRY_RUN_NORMAL) set_entropy_context_b(plane, block, blk_row, blk_col, plane_bsize, tx_size, arg); + else if (dry_run == DRY_RUN_COSTCOEFFS) + cost_coeffs_b(plane, block, blk_row, blk_col, plane_bsize, tx_size, arg); } else { int bsl = b_width_log2_lookup[bsize]; int i; @@ -617,8 +648,8 @@ void tokenize_vartx(ThreadData *td, TOKENEXTRA **t, int dry_run, } void av1_tokenize_sb_vartx(AV1_COMP *cpi, ThreadData *td, TOKENEXTRA **t, - int dry_run, int mi_row, int mi_col, - BLOCK_SIZE bsize) { + RUN_TYPE dry_run, int mi_row, int mi_col, + BLOCK_SIZE bsize, int *rate) { AV1_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &td->mb; MACROBLOCKD *const xd = &x->e_mbd; @@ -627,7 +658,7 @@ void av1_tokenize_sb_vartx(AV1_COMP *cpi, ThreadData *td, TOKENEXTRA **t, const int ctx = av1_get_skip_context(xd); const int skip_inc = !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP); - struct tokenize_b_args arg = { 
cpi, td, t }; + struct tokenize_b_args arg = { cpi, td, t, 0 }; int plane; if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; @@ -667,11 +698,12 @@ void av1_tokenize_sb_vartx(AV1_COMP *cpi, ThreadData *td, TOKENEXTRA **t, (*t)++; } } + if (rate) *rate += arg.this_rate; } #endif // CONFIG_VAR_TX -void av1_tokenize_sb(AV1_COMP *cpi, ThreadData *td, TOKENEXTRA **t, int dry_run, - BLOCK_SIZE bsize) { +void av1_tokenize_sb(AV1_COMP *cpi, ThreadData *td, TOKENEXTRA **t, + RUN_TYPE dry_run, BLOCK_SIZE bsize, int *rate) { AV1_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &td->mb; MACROBLOCKD *const xd = &x->e_mbd; @@ -679,7 +711,7 @@ void av1_tokenize_sb(AV1_COMP *cpi, ThreadData *td, TOKENEXTRA **t, int dry_run, const int ctx = av1_get_skip_context(xd); const int skip_inc = !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP); - struct tokenize_b_args arg = { cpi, td, t }; + struct tokenize_b_args arg = { cpi, td, t, 0 }; if (mbmi->skip) { if (!dry_run) td->counts->skip[ctx][1] += skip_inc; reset_skip_context(xd, bsize); @@ -697,14 +729,17 @@ void av1_tokenize_sb(AV1_COMP *cpi, ThreadData *td, TOKENEXTRA **t, int dry_run, (*t)->token = EOSB_TOKEN; (*t)++; } - } else { + } else if (dry_run == DRY_RUN_NORMAL) { av1_foreach_transformed_block(xd, bsize, set_entropy_context_b, &arg); + } else if (dry_run == DRY_RUN_COSTCOEFFS) { + av1_foreach_transformed_block(xd, bsize, cost_coeffs_b, &arg); } + if (rate) *rate += arg.this_rate; } #if CONFIG_SUPERTX void av1_tokenize_sb_supertx(AV1_COMP *cpi, ThreadData *td, TOKENEXTRA **t, - int dry_run, BLOCK_SIZE bsize) { + RUN_TYPE dry_run, BLOCK_SIZE bsize, int *rate) { AV1_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &td->mb.e_mbd; MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; @@ -712,7 +747,7 @@ void av1_tokenize_sb_supertx(AV1_COMP *cpi, ThreadData *td, TOKENEXTRA **t, const int ctx = av1_get_skip_context(xd); const int skip_inc = !segfeature_active(&cm->seg, mbmi->segment_id_supertx, 
SEG_LVL_SKIP); - struct tokenize_b_args arg = { cpi, td, t }; + struct tokenize_b_args arg = { cpi, td, t, 0 }; if (mbmi->skip) { if (!dry_run) td->counts->skip[ctx][1] += skip_inc; reset_skip_context(xd, bsize); @@ -730,9 +765,12 @@ void av1_tokenize_sb_supertx(AV1_COMP *cpi, ThreadData *td, TOKENEXTRA **t, (*t)->token = EOSB_TOKEN; (*t)++; } - } else { + } else if (dry_run == DRY_RUN_NORMAL) { av1_foreach_transformed_block(xd, bsize, set_entropy_context_b, &arg); *t = t_backup; + } else if (dry_run == DRY_RUN_COSTCOEFFS) { + av1_foreach_transformed_block(xd, bsize, cost_coeffs_b, &arg); } + if (rate) *rate += arg.this_rate; } #endif // CONFIG_SUPERTX diff --git a/av1/encoder/tokenize.h b/av1/encoder/tokenize.h index a7e30d545..520e1b676 100644 --- a/av1/encoder/tokenize.h +++ b/av1/encoder/tokenize.h @@ -56,19 +56,31 @@ int av1_has_high_freq_in_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane); struct AV1_COMP; struct ThreadData; +typedef enum { + OUTPUT_ENABLED = 0, + DRY_RUN_NORMAL, + DRY_RUN_COSTCOEFFS, +} RUN_TYPE; + +// Note: in all the tokenize functions, rate, if non-NULL, is incremented +// with the coefficient token cost only if dry_run == DRY_RUN_COSTCOEFFS; +// otherwise rate is not incremented.
#if CONFIG_VAR_TX void av1_tokenize_sb_vartx(struct AV1_COMP *cpi, struct ThreadData *td, - TOKENEXTRA **t, int dry_run, int mi_row, int mi_col, - BLOCK_SIZE bsize); + TOKENEXTRA **t, RUN_TYPE dry_run, int mi_row, + int mi_col, BLOCK_SIZE bsize, int *rate); #endif -void av1_tokenize_palette_sb(struct ThreadData *const td, BLOCK_SIZE bsize, - int plane, TOKENEXTRA **t); +void av1_tokenize_palette_sb(struct AV1_COMP *cpi, struct ThreadData *const td, + int plane, TOKENEXTRA **t, RUN_TYPE dry_run, + BLOCK_SIZE bsize, int *rate); void av1_tokenize_sb(struct AV1_COMP *cpi, struct ThreadData *td, - TOKENEXTRA **t, int dry_run, BLOCK_SIZE bsize); + TOKENEXTRA **t, RUN_TYPE dry_run, BLOCK_SIZE bsize, + int *rate); #if CONFIG_SUPERTX void av1_tokenize_sb_supertx(struct AV1_COMP *cpi, struct ThreadData *td, - TOKENEXTRA **t, int dry_run, BLOCK_SIZE bsize); + TOKENEXTRA **t, RUN_TYPE dry_run, BLOCK_SIZE bsize, + int *rate); #endif extern const int16_t *av1_dct_value_cost_ptr;