From 71061e9332c05324007e7f6c900285273793366d Mon Sep 17 00:00:00 2001 From: Ranjit Kumar Tulabandu Date: Fri, 10 Feb 2017 16:25:50 +0530 Subject: [PATCH] Row based multi-threading of encoding stage (Yunqing Wang) This patch implements row-based multi-threading within tiles in the encoding pass and substantially speeds up the multi-threaded VP9 encoder. Speed tests at speed 1 on the STDHD set (using 4 tiles) show that the average speedup of the encoding pass (the second pass in 2-pass encoding) is 7% with 2 threads, 16% with 4 threads, 85% with 8 threads, and 116% with 16 threads. Change-Id: I12e41dbc171951958af9e6d098efd6e2c82827de --- vp9/encoder/vp9_bitstream.c | 5 +- vp9/encoder/vp9_block.h | 5 ++ vp9/encoder/vp9_encodeframe.c | 48 ++++++++++++---- vp9/encoder/vp9_encodeframe.h | 3 + vp9/encoder/vp9_encoder.c | 26 +++++---- vp9/encoder/vp9_encoder.h | 6 ++ vp9/encoder/vp9_ethread.c | 99 +++++++++++++++++++++++++++++++- vp9/encoder/vp9_ethread.h | 2 + vp9/encoder/vp9_mcomp.c | 45 +++++++++++++-- vp9/encoder/vp9_multi_thread.c | 40 +++++++++++++ vp9/encoder/vp9_pickmode.c | 10 +++- vp9/encoder/vp9_rd.c | 14 ++++- vp9/encoder/vp9_rd.h | 25 +++++++- vp9/encoder/vp9_rdopt.c | 48 ++++++++++++++-- vp9/encoder/vp9_speed_features.c | 18 ++++++ vp9/vp9_cx_iface.c | 3 + 16 files changed, 357 insertions(+), 40 deletions(-) diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c index 43c5eaed0..71f85bbe7 100644 --- a/vp9/encoder/vp9_bitstream.c +++ b/vp9/encoder/vp9_bitstream.c @@ -925,10 +925,11 @@ int vp9_get_refresh_mask(VP9_COMP *cpi) { static int encode_tile_worker(VP9_COMP *cpi, VP9BitstreamWorkerData *data) { MACROBLOCKD *const xd = &data->xd; + const int tile_row = 0; vpx_start_encode(&data->bit_writer, data->dest); write_modes(cpi, xd, &cpi->tile_data[data->tile_idx].tile_info, - &data->bit_writer, 0, data->tile_idx, &data->max_mv_magnitude, - data->interp_filter_selected); + &data->bit_writer, tile_row, data->tile_idx, + &data->max_mv_magnitude, data->interp_filter_selected); vpx_stop_encode(&data->bit_writer); return 1; } diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h index 91d07e3a0..c0c69f6b5 100644 --- a/vp9/encoder/vp9_block.h +++ b/vp9/encoder/vp9_block.h @@ -11,6 +11,8 @@ #ifndef VP9_ENCODER_VP9_BLOCK_H_ #define VP9_ENCODER_VP9_BLOCK_H_ +#include "vpx_util/vpx_thread.h" + #include "vp9/common/vp9_entropymv.h" #include "vp9/common/vp9_entropy.h" @@ -88,6 +90,9 @@ struct macroblock { int mb_energy; int *m_search_count_ptr; int *ex_search_count_ptr; +#if CONFIG_MULTITHREAD + pthread_mutex_t *search_count_mutex; +#endif // These are set to their default values at the beginning, and then adjusted // further in the encoding process.
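The vp9_encodeframe.c changes that follow gate each superblock on the progress of the SB row above it: encode_rd_sb_row() calls the row_mt_sync_read hook before running the partition search for a superblock column and the row_mt_sync_write hook after finishing it, reusing the row synchronization hooks already present in vp9_ethread.c. Below is a minimal sketch of that top-right dependency pattern; the struct layout and helper names are illustrative assumptions for this example, not the actual VP9RowMTSync implementation.

/* Illustrative sketch only -- not part of the patch. */
#include <pthread.h>

typedef struct {
  pthread_mutex_t *mutex_; /* one mutex per SB row */
  pthread_cond_t *cond_;   /* one condition variable per SB row */
  int *cur_col;            /* last SB column published by each row */
  int sync_range;          /* how far ahead the upper row must be */
} RowSyncSketch;

/* Block until row r-1 has encoded far enough past column c. */
static void row_sync_read(RowSyncSketch *s, int r, int c) {
  const int nsync = s->sync_range;
  if (r == 0) return; /* the first SB row of a tile has no dependency */
  pthread_mutex_lock(&s->mutex_[r - 1]);
  while (c > s->cur_col[r - 1] - nsync)
    pthread_cond_wait(&s->cond_[r - 1], &s->mutex_[r - 1]);
  pthread_mutex_unlock(&s->mutex_[r - 1]);
}

/* Publish that row r has finished SB column c out of cols columns. */
static void row_sync_write(RowSyncSketch *s, int r, int c, int cols) {
  const int nsync = s->sync_range;
  int cur = c;
  int sig = 1;
  if (c < cols - 1) {
    if (c % nsync) sig = 0; /* only signal every nsync columns */
  } else {
    cur = cols + nsync; /* row complete: release any remaining waiter */
  }
  if (sig) {
    pthread_mutex_lock(&s->mutex_[r]);
    s->cur_col[r] = cur;
    pthread_cond_signal(&s->cond_[r]);
    pthread_mutex_unlock(&s->mutex_[r]);
  }
}

With this kind of gating, a worker assigned SB row r can start as soon as row r-1 is sync_range superblocks ahead, which is what lets several rows of the same tile be encoded in parallel.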
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index 1bbdeece5..215f8b8f6 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -3095,13 +3095,18 @@ static void encode_rd_sb_row(VP9_COMP *cpi, ThreadData *td, const int mi_col_start = tile_info->mi_col_start; const int mi_col_end = tile_info->mi_col_end; int mi_col; + const int sb_row = mi_row >> MI_BLOCK_SIZE_LOG2; + const int num_sb_cols = + get_num_cols(tile_data->tile_info, MI_BLOCK_SIZE_LOG2); + int sb_col_in_tile; // Initialize the left context for the new SB row memset(&xd->left_context, 0, sizeof(xd->left_context)); memset(xd->left_seg_context, 0, sizeof(xd->left_seg_context)); // Code each SB in the row - for (mi_col = mi_col_start; mi_col < mi_col_end; mi_col += MI_BLOCK_SIZE) { + for (mi_col = mi_col_start, sb_col_in_tile = 0; mi_col < mi_col_end; + mi_col += MI_BLOCK_SIZE, sb_col_in_tile++) { const struct segmentation *const seg = &cm->seg; int dummy_rate; int64_t dummy_dist; @@ -3112,6 +3117,9 @@ static void encode_rd_sb_row(VP9_COMP *cpi, ThreadData *td, const int idx_str = cm->mi_stride * mi_row + mi_col; MODE_INFO **mi = cm->mi_grid_visible + idx_str; + (*(cpi->row_mt_sync_read_ptr))(&tile_data->row_mt_sync, sb_row, + sb_col_in_tile - 1); + if (sf->adaptive_pred_interp_filter) { for (i = 0; i < 64; ++i) td->leaf_tree[i].pred_interp_filter = SWITCHABLE; @@ -3163,6 +3171,8 @@ static void encode_rd_sb_row(VP9_COMP *cpi, ThreadData *td, rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, BLOCK_64X64, &dummy_rdc, INT64_MAX, td->pc_root); } + (*(cpi->row_mt_sync_write_ptr))(&tile_data->row_mt_sync, sb_row, + sb_col_in_tile, num_sb_cols); } } @@ -4109,13 +4119,17 @@ void vp9_init_tile_data(VP9_COMP *cpi) { tile_data->mode_map[i][j] = j; } } +#if CONFIG_MULTITHREAD + tile_data->search_count_mutex = NULL; + tile_data->enc_row_mt_mutex = NULL; +#endif } } for (tile_row = 0; tile_row < tile_rows; ++tile_row) { for (tile_col = 0; tile_col < tile_cols; ++tile_col) { - TileInfo *tile_info = - &cpi->tile_data[tile_row * tile_cols + tile_col].tile_info; + TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col]; + TileInfo *tile_info = &this_tile->tile_info; vp9_tile_init(tile_info, cm, tile_row, tile_col); cpi->tile_tok[tile_row][tile_col] = pre_tok + tile_tok; @@ -4125,6 +4139,10 @@ void vp9_init_tile_data(VP9_COMP *cpi) { cpi->tplist[tile_row][tile_col] = tplist + tplist_count; tplist = cpi->tplist[tile_row][tile_col]; tplist_count = get_num_vert_units(*tile_info, MI_BLOCK_SIZE_LOG2); + + // Set up pointers to per thread motion search counters. + this_tile->m_search_count = 0; // Count of motion search hits. + this_tile->ex_search_count = 0; // Exhaustive mesh search hits. } } } @@ -4170,10 +4188,11 @@ void vp9_encode_tile(VP9_COMP *cpi, ThreadData *td, int tile_row, int mi_row; // Set up pointers to per thread motion search counters. - this_tile->m_search_count = 0; // Count of motion search hits. - this_tile->ex_search_count = 0; // Exhaustive mesh search hits. td->mb.m_search_count_ptr = &this_tile->m_search_count; td->mb.ex_search_count_ptr = &this_tile->ex_search_count; +#if CONFIG_MULTITHREAD + td->mb.search_count_mutex = this_tile->search_count_mutex; +#endif for (mi_row = mi_row_start; mi_row < mi_row_end; mi_row += MI_BLOCK_SIZE) vp9_encode_sb_row(cpi, td, tile_row, tile_col, mi_row); @@ -4289,11 +4308,20 @@ static void encode_frame_internal(VP9_COMP *cpi) { } #endif - // If allowed, encoding tiles in parallel with one thread handling one tile. 
- if (VPXMIN(cpi->oxcf.max_threads, 1 << cm->log2_tile_cols) > 1) - vp9_encode_tiles_mt(cpi); - else - encode_tiles(cpi); + if (!cpi->new_mt) { + cpi->row_mt_sync_read_ptr = vp9_row_mt_sync_read_dummy; + cpi->row_mt_sync_write_ptr = vp9_row_mt_sync_write_dummy; + // If allowed, encoding tiles in parallel with one thread handling one + // tile when row based multi-threading is disabled. + if (VPXMIN(cpi->oxcf.max_threads, 1 << cm->log2_tile_cols) > 1) + vp9_encode_tiles_mt(cpi); + else + encode_tiles(cpi); + } else { + cpi->row_mt_sync_read_ptr = vp9_row_mt_sync_read; + cpi->row_mt_sync_write_ptr = vp9_row_mt_sync_write; + vp9_encode_tiles_row_mt(cpi); + } vpx_usec_timer_mark(&emr_timer); cpi->time_encode_sb_row += vpx_usec_timer_elapsed(&emr_timer); diff --git a/vp9/encoder/vp9_encodeframe.h b/vp9/encoder/vp9_encodeframe.h index aa5494785..2b9b65dcd 100644 --- a/vp9/encoder/vp9_encodeframe.h +++ b/vp9/encoder/vp9_encodeframe.h @@ -39,6 +39,9 @@ void vp9_init_tile_data(struct VP9_COMP *cpi); void vp9_encode_tile(struct VP9_COMP *cpi, struct ThreadData *td, int tile_row, int tile_col); +void vp9_encode_sb_row(struct VP9_COMP *cpi, struct ThreadData *td, + int tile_row, int tile_col, int mi_row); + void vp9_set_variance_partition_thresholds(struct VP9_COMP *cpi, int q); #ifdef __cplusplus diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 50fa8c682..2ce46c657 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -1575,17 +1575,7 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { highbd_set_var_fns(cpi); #endif - // Enable multi-threading for first pass. - cpi->new_mt = 0; - if (((cpi->oxcf.mode == GOOD || cpi->oxcf.mode == BEST) && - cpi->oxcf.speed < 5 && cpi->oxcf.pass == 1) && - cpi->oxcf.new_mt && !cpi->use_svc) - cpi->new_mt = 1; - - if (cpi->oxcf.mode == GOOD && cpi->oxcf.speed < 5 && - (cpi->oxcf.pass == 0 || cpi->oxcf.pass == 2) && cpi->oxcf.new_mt && - !cpi->use_svc) - cpi->new_mt = 1; + vp9_set_new_mt(cpi); } #ifndef M_LOG2_E @@ -5213,3 +5203,17 @@ void vp9_apply_encoding_flags(VP9_COMP *cpi, vpx_enc_frame_flags_t flags) { vp9_update_entropy(cpi, 0); } } + +void vp9_set_new_mt(VP9_COMP *cpi) { + // Enable row based multi-threading for supported modes of encoding + cpi->new_mt = 0; + if (((cpi->oxcf.mode == GOOD || cpi->oxcf.mode == BEST) && + cpi->oxcf.speed < 5 && cpi->oxcf.pass == 1) && + cpi->oxcf.new_mt && !cpi->use_svc) + cpi->new_mt = 1; + + if (cpi->oxcf.mode == GOOD && cpi->oxcf.speed < 5 && + (cpi->oxcf.pass == 0 || cpi->oxcf.pass == 2) && cpi->oxcf.new_mt && + !cpi->use_svc) + cpi->new_mt = 1; +} diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 65f3f86de..675512618 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -276,6 +276,10 @@ typedef struct TileDataEnc { int ex_search_count; FIRSTPASS_DATA fp_data; VP9RowMTSync row_mt_sync; +#if CONFIG_MULTITHREAD + pthread_mutex_t *search_count_mutex; + pthread_mutex_t *enc_row_mt_mutex; +#endif } TileDataEnc; typedef struct RowMTInfo { @@ -897,6 +901,8 @@ VP9_LEVEL vp9_get_level(const Vp9LevelSpec *const level_spec); void vp9_new_framerate(VP9_COMP *cpi, double framerate); +void vp9_set_new_mt(VP9_COMP *cpi); + #define LAYER_IDS_TO_IDX(sl, tl, num_tl) ((sl) * (num_tl) + (tl)) #ifdef __cplusplus diff --git a/vp9/encoder/vp9_ethread.c b/vp9/encoder/vp9_ethread.c index 1bffc4030..bf8108416 100644 --- a/vp9/encoder/vp9_ethread.c +++ b/vp9/encoder/vp9_ethread.c @@ -341,7 +341,7 @@ void vp9_row_mt_sync_write(VP9RowMTSync 
*const row_mt_sync, int r, int c, #if CONFIG_MULTITHREAD const int nsync = row_mt_sync->sync_range; int cur; - // Only signal when there are enough filtered SB for next row to run. + // Only signal when there are enough encoded blocks for next row to run. int sig = 1; if (c < cols - 1) { @@ -542,3 +542,100 @@ void vp9_temporal_filter_row_mt(VP9_COMP *cpi) { launch_enc_workers(cpi, (VPxWorkerHook)temporal_filter_worker_hook, multi_thread_ctxt, num_workers); } + +static int enc_row_mt_worker_hook(EncWorkerData *const thread_data, + MultiThreadHandle *multi_thread_ctxt) { + VP9_COMP *const cpi = thread_data->cpi; + const VP9_COMMON *const cm = &cpi->common; + const int tile_cols = 1 << cm->log2_tile_cols; + int tile_row, tile_col; + TileDataEnc *this_tile; + int end_of_frame; + int thread_id = thread_data->thread_id; + int cur_tile_id = multi_thread_ctxt->thread_id_to_tile_id[thread_id]; + JobNode *proc_job = NULL; + int mi_row; + + end_of_frame = 0; + while (0 == end_of_frame) { + // Get the next job in the queue + proc_job = + (JobNode *)vp9_enc_grp_get_next_job(multi_thread_ctxt, cur_tile_id); + if (NULL == proc_job) { + // Query for the status of other tiles + end_of_frame = vp9_get_tiles_proc_status( + multi_thread_ctxt, thread_data->tile_completion_status, &cur_tile_id, + tile_cols); + } else { + tile_col = proc_job->tile_col_id; + tile_row = proc_job->tile_row_id; + mi_row = proc_job->vert_unit_row_num * MI_BLOCK_SIZE; + + this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col]; + thread_data->td->mb.m_search_count_ptr = &this_tile->m_search_count; + thread_data->td->mb.ex_search_count_ptr = &this_tile->ex_search_count; +#if CONFIG_MULTITHREAD + thread_data->td->mb.search_count_mutex = this_tile->search_count_mutex; +#endif + + vp9_encode_sb_row(cpi, thread_data->td, tile_row, tile_col, mi_row); + } + } + return 0; +} + +void vp9_encode_tiles_row_mt(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + const int tile_cols = 1 << cm->log2_tile_cols; + const int tile_rows = 1 << cm->log2_tile_rows; + MultiThreadHandle *multi_thread_ctxt = &cpi->multi_thread_ctxt; + int num_workers = VPXMAX(cpi->oxcf.max_threads, 1); + int i; + + if (multi_thread_ctxt->allocated_tile_cols < tile_cols || + multi_thread_ctxt->allocated_tile_rows < tile_rows || + multi_thread_ctxt->allocated_vert_unit_rows < cm->mb_rows) { + vp9_row_mt_mem_dealloc(cpi); + vp9_init_tile_data(cpi); + vp9_row_mt_mem_alloc(cpi); + } else { + vp9_init_tile_data(cpi); + } + + create_enc_workers(cpi, num_workers); + + vp9_assign_tile_to_thread(multi_thread_ctxt, tile_cols, cpi->num_workers); + + vp9_prepare_job_queue(cpi, ENCODE_JOB); + + vp9_multi_thread_tile_init(cpi); + + for (i = 0; i < num_workers; i++) { + EncWorkerData *thread_data; + thread_data = &cpi->tile_thr_data[i]; + + // Before encoding a frame, copy the thread data from cpi. + if (thread_data->td != &cpi->td) { + thread_data->td->mb = cpi->td.mb; + thread_data->td->rd_counts = cpi->td.rd_counts; + } + if (thread_data->td->counts != &cpi->common.counts) { + memcpy(thread_data->td->counts, &cpi->common.counts, + sizeof(cpi->common.counts)); + } + } + + launch_enc_workers(cpi, (VPxWorkerHook)enc_row_mt_worker_hook, + multi_thread_ctxt, num_workers); + + for (i = 0; i < num_workers; i++) { + VPxWorker *const worker = &cpi->workers[i]; + EncWorkerData *const thread_data = (EncWorkerData *)worker->data1; + + // Accumulate counters. 
+ if (i < cpi->num_workers - 1) { + vp9_accumulate_frame_counts(&cm->counts, thread_data->td->counts, 0); + accumulate_rd_opt(&cpi->td, thread_data->td); + } + } +} diff --git a/vp9/encoder/vp9_ethread.h b/vp9/encoder/vp9_ethread.h index 908bb6ff6..a396e621d 100644 --- a/vp9/encoder/vp9_ethread.h +++ b/vp9/encoder/vp9_ethread.h @@ -44,6 +44,8 @@ typedef struct VP9RowMTSyncData { void vp9_encode_tiles_mt(struct VP9_COMP *cpi); +void vp9_encode_tiles_row_mt(struct VP9_COMP *cpi); + void vp9_encode_fp_row_mt(struct VP9_COMP *cpi); void vp9_row_mt_sync_read(VP9RowMTSync *const row_mt_sync, int r, int c); diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c index 03877e9aa..300cda648 100644 --- a/vp9/encoder/vp9_mcomp.c +++ b/vp9/encoder/vp9_mcomp.c @@ -1993,9 +1993,18 @@ static int full_pixel_exhaustive(VP9_COMP *cpi, MACROBLOCK *x, int range = sf->mesh_patterns[0].range; int baseline_interval_divisor; +#if CONFIG_MULTITHREAD + if (NULL != x->search_count_mutex) pthread_mutex_lock(x->search_count_mutex); +#endif + // Keep track of number of exhaustive calls (this frame in this thread). ++(*x->ex_search_count_ptr); +#if CONFIG_MULTITHREAD + if (NULL != x->search_count_mutex) + pthread_mutex_unlock(x->search_count_mutex); +#endif + // Trap illegal values for interval and range for this function. if ((range < MIN_RANGE) || (range > MAX_RANGE) || (interval < MIN_INTERVAL) || (interval > range)) @@ -2356,13 +2365,27 @@ int vp9_refining_search_8p_c(const MACROBLOCK *x, MV *ref_mv, int error_per_bit, #define MIN_EX_SEARCH_LIMIT 128 static int is_exhaustive_allowed(VP9_COMP *cpi, MACROBLOCK *x) { const SPEED_FEATURES *const sf = &cpi->sf; - const int max_ex = - VPXMAX(MIN_EX_SEARCH_LIMIT, - (*x->m_search_count_ptr * sf->max_exaustive_pct) / 100); + int is_exhaustive_allowed; + int max_ex; + +#if CONFIG_MULTITHREAD + if (NULL != x->search_count_mutex) pthread_mutex_lock(x->search_count_mutex); +#endif + + max_ex = VPXMAX(MIN_EX_SEARCH_LIMIT, + (*x->m_search_count_ptr * sf->max_exaustive_pct) / 100); - return sf->allow_exhaustive_searches && - (sf->exhaustive_searches_thresh < INT_MAX) && - (*x->ex_search_count_ptr <= max_ex) && !cpi->rc.is_src_frame_alt_ref; + is_exhaustive_allowed = sf->allow_exhaustive_searches && + (sf->exhaustive_searches_thresh < INT_MAX) && + (*x->ex_search_count_ptr <= max_ex) && + !cpi->rc.is_src_frame_alt_ref; + +#if CONFIG_MULTITHREAD + if (NULL != x->search_count_mutex) + pthread_mutex_unlock(x->search_count_mutex); +#endif + + return is_exhaustive_allowed; } int vp9_full_pixel_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, @@ -2407,9 +2430,19 @@ int vp9_full_pixel_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, MAX_MVSEARCH_STEPS - 1 - step_param, 1, cost_list, fn_ptr, ref_mv, tmp_mv); +#if CONFIG_MULTITHREAD + if (NULL != x->search_count_mutex) + pthread_mutex_lock(x->search_count_mutex); +#endif + // Keep track of number of searches (this frame in this thread). ++(*x->m_search_count_ptr); +#if CONFIG_MULTITHREAD + if (NULL != x->search_count_mutex) + pthread_mutex_unlock(x->search_count_mutex); +#endif + // Should we allow a follow on exhaustive search? 
if (is_exhaustive_allowed(cpi, x)) { int64_t exhuastive_thr = sf->exhaustive_searches_thresh; diff --git a/vp9/encoder/vp9_multi_thread.c b/vp9/encoder/vp9_multi_thread.c index 23b0b4276..e27b1ed3a 100644 --- a/vp9/encoder/vp9_multi_thread.c +++ b/vp9/encoder/vp9_multi_thread.c @@ -100,11 +100,32 @@ void vp9_row_mt_mem_alloc(VP9_COMP *cpi) { multi_thread_ctxt->num_tile_vert_sbs[tile_row] = get_num_vert_units(*tile_info, MI_BLOCK_SIZE_LOG2); } + +#if CONFIG_MULTITHREAD + for (tile_row = 0; tile_row < tile_rows; tile_row++) { + for (tile_col = 0; tile_col < tile_cols; tile_col++) { + TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col]; + + CHECK_MEM_ERROR(cm, this_tile->search_count_mutex, + vpx_malloc(sizeof(*this_tile->search_count_mutex))); + + pthread_mutex_init(this_tile->search_count_mutex, NULL); + + CHECK_MEM_ERROR(cm, this_tile->enc_row_mt_mutex, + vpx_malloc(sizeof(*this_tile->enc_row_mt_mutex))); + + pthread_mutex_init(this_tile->enc_row_mt_mutex, NULL); + } + } +#endif } void vp9_row_mt_mem_dealloc(VP9_COMP *cpi) { MultiThreadHandle *multi_thread_ctxt = &cpi->multi_thread_ctxt; int tile_col; +#if CONFIG_MULTITHREAD + int tile_row; +#endif // Deallocate memory for job queue if (multi_thread_ctxt->job_queue) vpx_free(multi_thread_ctxt->job_queue); @@ -124,6 +145,25 @@ void vp9_row_mt_mem_dealloc(VP9_COMP *cpi) { TileDataEnc *this_tile = &cpi->tile_data[tile_col]; vp9_row_mt_sync_mem_dealloc(&this_tile->row_mt_sync); } + +#if CONFIG_MULTITHREAD + for (tile_row = 0; tile_row < multi_thread_ctxt->allocated_tile_rows; + tile_row++) { + for (tile_col = 0; tile_col < multi_thread_ctxt->allocated_tile_cols; + tile_col++) { + TileDataEnc *this_tile = + &cpi->tile_data[tile_row * multi_thread_ctxt->allocated_tile_cols + + tile_col]; + pthread_mutex_destroy(this_tile->search_count_mutex); + vpx_free(this_tile->search_count_mutex); + this_tile->search_count_mutex = NULL; + + pthread_mutex_destroy(this_tile->enc_row_mt_mutex); + vpx_free(this_tile->enc_row_mt_mutex); + this_tile->enc_row_mt_mutex = NULL; + } + } +#endif } void vp9_multi_thread_tile_init(VP9_COMP *cpi) { diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c index cff8a3fa9..9f2e93adc 100644 --- a/vp9/encoder/vp9_pickmode.c +++ b/vp9/encoder/vp9_pickmode.c @@ -1657,7 +1657,10 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, mode_rd_thresh = mode_rd_thresh << 3; if (rd_less_than_thresh(best_rdc.rdcost, mode_rd_thresh, - rd_thresh_freq_fact[mode_index])) +#if CONFIG_MULTITHREAD + tile_data->enc_row_mt_mutex, +#endif + &rd_thresh_freq_fact[mode_index])) continue; if (this_mode == NEWMV) { @@ -2018,7 +2021,10 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, continue; if (rd_less_than_thresh(best_rdc.rdcost, mode_rd_thresh, - rd_thresh_freq_fact[mode_index])) +#if CONFIG_MULTITHREAD + tile_data->enc_row_mt_mutex, +#endif + &rd_thresh_freq_fact[mode_index])) continue; mi->mode = this_mode; diff --git a/vp9/encoder/vp9_rd.c b/vp9/encoder/vp9_rd.c index 3bbfa1aac..21e3b1f63 100644 --- a/vp9/encoder/vp9_rd.c +++ b/vp9/encoder/vp9_rd.c @@ -610,7 +610,15 @@ void vp9_set_rd_speed_thresholds_sub8x8(VP9_COMP *cpi) { } void vp9_update_rd_thresh_fact(int (*factor_buf)[MAX_MODES], int rd_thresh, - int bsize, int best_mode_index) { + int bsize, +#if CONFIG_MULTITHREAD + pthread_mutex_t *enc_row_mt_mutex, +#endif + int best_mode_index) { +#if CONFIG_MULTITHREAD + if (NULL != enc_row_mt_mutex) pthread_mutex_lock(enc_row_mt_mutex); +#endif + if 
(rd_thresh > 0) { const int top_mode = bsize < BLOCK_8X8 ? MAX_REFS : MAX_MODES; int mode; @@ -628,6 +636,10 @@ void vp9_update_rd_thresh_fact(int (*factor_buf)[MAX_MODES], int rd_thresh, } } } + +#if CONFIG_MULTITHREAD + if (NULL != enc_row_mt_mutex) pthread_mutex_unlock(enc_row_mt_mutex); +#endif } int vp9_get_intra_cost_penalty(int qindex, int qdelta, diff --git a/vp9/encoder/vp9_rd.h b/vp9/encoder/vp9_rd.h index 1c6831358..74a2f5d95 100644 --- a/vp9/encoder/vp9_rd.h +++ b/vp9/encoder/vp9_rd.h @@ -164,11 +164,32 @@ void vp9_set_rd_speed_thresholds(struct VP9_COMP *cpi); void vp9_set_rd_speed_thresholds_sub8x8(struct VP9_COMP *cpi); void vp9_update_rd_thresh_fact(int (*fact)[MAX_MODES], int rd_thresh, int bsize, +#if CONFIG_MULTITHREAD + pthread_mutex_t *enc_row_mt_mutex, +#endif int best_mode_index); static INLINE int rd_less_than_thresh(int64_t best_rd, int thresh, - int thresh_fact) { - return best_rd < ((int64_t)thresh * thresh_fact >> 5) || thresh == INT_MAX; +#if CONFIG_MULTITHREAD + pthread_mutex_t *enc_row_mt_mutex, +#endif + const int *const thresh_fact) { + int is_rd_less_than_thresh; + +#if CONFIG_MULTITHREAD + // Synchronize to ensure data coherency as thresh_freq_fact is maintained at + // tile level and not thread-safe with row based multi-threading + if (NULL != enc_row_mt_mutex) pthread_mutex_lock(enc_row_mt_mutex); +#endif + + is_rd_less_than_thresh = + best_rd < ((int64_t)thresh * (*thresh_fact) >> 5) || thresh == INT_MAX; + +#if CONFIG_MULTITHREAD + if (NULL != enc_row_mt_mutex) pthread_mutex_unlock(enc_row_mt_mutex); +#endif + + return is_rd_less_than_thresh; } static INLINE void set_error_per_bit(MACROBLOCK *x, int rdmult) { diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index 4e1ca328c..8d1006b6e 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -3043,7 +3043,10 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, const int *const rd_threshes = rd_opt->threshes[segment_id][bsize]; const int *const rd_thresh_freq_fact = tile_data->thresh_freq_fact[bsize]; int64_t mode_threshold[MAX_MODES]; - int *mode_map = tile_data->mode_map[bsize]; + int *tile_mode_map = tile_data->mode_map[bsize]; + int mode_map[MAX_MODES]; // Maintain mode_map information locally to avoid + // lock mechanism involved with reads from + // tile_mode_map const int mode_search_skip_flags = sf->mode_search_skip_flags; int64_t mask_filter = 0; int64_t filter_cache[SWITCHABLE_FILTER_CONTEXTS]; @@ -3155,10 +3158,19 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, ~(sf->intra_y_mode_mask[max_txsize_lookup[bsize]]); for (i = 0; i <= LAST_NEW_MV_INDEX; ++i) mode_threshold[i] = 0; + +#if CONFIG_MULTITHREAD + if (NULL != tile_data->enc_row_mt_mutex) + pthread_mutex_lock(tile_data->enc_row_mt_mutex); +#endif + for (i = LAST_NEW_MV_INDEX + 1; i < MAX_MODES; ++i) mode_threshold[i] = ((int64_t)rd_threshes[i] * rd_thresh_freq_fact[i]) >> 5; midx = sf->schedule_mode_search ? 
mode_skip_start : 0; + + memcpy(mode_map, tile_mode_map, sizeof(mode_map)); + while (midx > 4) { uint8_t end_pos = 0; for (i = 5; i < midx; ++i) { @@ -3172,6 +3184,13 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, midx = end_pos; } + memcpy(tile_mode_map, mode_map, sizeof(mode_map)); + +#if CONFIG_MULTITHREAD + if (NULL != tile_data->enc_row_mt_mutex) + pthread_mutex_unlock(tile_data->enc_row_mt_mutex); +#endif + for (midx = 0; midx < MAX_MODES; ++midx) { int mode_index = mode_map[midx]; int mode_excluded = 0; @@ -3573,6 +3592,9 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, } if (best_mode_index < 0 || best_rd >= best_rd_so_far) { + // If adaptive interp filter is enabled, then the current leaf node of 8x8 + // data is needed for sub8x8. Hence preserve the context. + if (cpi->new_mt && bsize == BLOCK_8X8) ctx->mic = *xd->mi[0]; rd_cost->rate = INT_MAX; rd_cost->rdcost = INT64_MAX; return; @@ -3599,7 +3621,11 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, if (!cpi->rc.is_src_frame_alt_ref) vp9_update_rd_thresh_fact(tile_data->thresh_freq_fact, - sf->adaptive_rd_thresh, bsize, best_mode_index); + sf->adaptive_rd_thresh, bsize, +#if CONFIG_MULTITHREAD + tile_data->enc_row_mt_mutex, +#endif + best_mode_index); // macroblock modes *mi = best_mbmode; @@ -3737,7 +3763,11 @@ void vp9_rd_pick_inter_mode_sb_seg_skip(VP9_COMP *cpi, TileDataEnc *tile_data, (cm->interp_filter == mi->interp_filter)); vp9_update_rd_thresh_fact(tile_data->thresh_freq_fact, - cpi->sf.adaptive_rd_thresh, bsize, THR_ZEROMV); + cpi->sf.adaptive_rd_thresh, bsize, +#if CONFIG_MULTITHREAD + tile_data->enc_row_mt_mutex, +#endif + THR_ZEROMV); vp9_zero(best_pred_diff); vp9_zero(best_filter_diff); @@ -3789,6 +3819,7 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data, int64_t filter_cache[SWITCHABLE_FILTER_CONTEXTS]; int internal_active_edge = vp9_active_edge_sb(cpi, mi_row, mi_col) && vp9_internal_image_edge(cpi); + const int *const rd_thresh_freq_fact = tile_data->thresh_freq_fact[bsize]; x->skip_encode = sf->skip_encode_frame && x->q_index < QIDX_SKIP_THRESH; memset(x->zcoeff_blk[TX_4X4], 0, 4); @@ -3880,7 +3911,10 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data, if (!internal_active_edge && rd_less_than_thresh(best_rd, rd_opt->threshes[segment_id][bsize][ref_index], - tile_data->thresh_freq_fact[bsize][ref_index])) +#if CONFIG_MULTITHREAD + tile_data->enc_row_mt_mutex, +#endif + &rd_thresh_freq_fact[ref_index])) continue; comp_pred = second_ref_frame > INTRA_FRAME; @@ -4324,7 +4358,11 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data, !is_inter_block(&best_mbmode)); vp9_update_rd_thresh_fact(tile_data->thresh_freq_fact, sf->adaptive_rd_thresh, - bsize, best_ref_index); + bsize, +#if CONFIG_MULTITHREAD + tile_data->enc_row_mt_mutex, +#endif + best_ref_index); // macroblock modes *mi = best_mbmode; diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c index 9ce756efe..09e91fc17 100644 --- a/vp9/encoder/vp9_speed_features.c +++ b/vp9/encoder/vp9_speed_features.c @@ -585,6 +585,15 @@ void vp9_set_speed_features_framesize_dependent(VP9_COMP *cpi) { rd->thresh_mult_sub8x8[i] = INT_MAX; } } + + // With row based multi-threading, the following speed features + // have to be disabled to guarantee that bitstreams encoded with single thread + // and multiple threads match + if (cpi->oxcf.ethread_bit_match) { + sf->adaptive_rd_thresh = 0; + 
sf->allow_exhaustive_searches = 0; + sf->adaptive_pred_interp_filter = 0; + } } void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) { @@ -747,4 +756,13 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) { if (!cpi->oxcf.frame_periodic_boost) { sf->max_delta_qindex = 0; } + + // With row based multi-threading, the following speed features + // have to be disabled to guarantee that bitstreams encoded with single thread + // and multiple threads match + if (cpi->oxcf.ethread_bit_match) { + sf->adaptive_rd_thresh = 0; + sf->allow_exhaustive_searches = 0; + sf->adaptive_pred_interp_filter = 0; + } } diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index 481189020..cc946dfd6 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -1459,6 +1459,9 @@ static vpx_codec_err_t ctrl_set_svc(vpx_codec_alg_priv_t *ctx, va_list args) { cfg->ss_number_layers > 1 && cfg->ts_number_layers > 1) { return VPX_CODEC_INVALID_PARAM; } + + vp9_set_new_mt(ctx->cpi); + return VPX_CODEC_OK; } -- 2.40.0
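The per-tile mutexes introduced by this patch (search_count_mutex and enc_row_mt_mutex) are allocated in vp9_row_mt_mem_alloc(), released in vp9_row_mt_mem_dealloc(), and left NULL otherwise, so every use site checks the pointer before locking and the single-threaded path pays no locking cost. Wherever several row workers of one tile can touch shared tile-level state (the motion-search counters in vp9_mcomp.c, thresh_freq_fact in vp9_rd.h/vp9_rd.c, the mode_map reordering in vp9_rdopt.c), the access is wrapped in that NULL-checked lock/unlock pair. A minimal sketch of the pattern, using hypothetical type and field names:

/* Illustrative sketch only -- not part of the patch. */
#include <pthread.h>
#include <stddef.h>

typedef struct {
  int m_search_count;                  /* shared by all row workers of a tile */
  pthread_mutex_t *search_count_mutex; /* NULL when row multi-threading is off */
} TileCountersSketch;

/* Bump a shared per-tile counter, locking only when a mutex was allocated. */
static void increment_search_count(TileCountersSketch *tile) {
  if (tile->search_count_mutex != NULL)
    pthread_mutex_lock(tile->search_count_mutex);

  ++tile->m_search_count;

  if (tile->search_count_mutex != NULL)
    pthread_mutex_unlock(tile->search_count_mutex);
}

Because the mutexes exist only after vp9_row_mt_mem_alloc() has run, the NULL check is what keeps the tile-threaded and single-threaded encode paths unchanged.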