From 02b3ef7faee5be5ee519856fbb3523d3ab49f6e7 Mon Sep 17 00:00:00 2001
From: Ritu Baldwa <ritu.baldwa@ittiam.com>
Date: Tue, 18 Dec 2018 17:39:38 +0530
Subject: [PATCH] Add Tile-SB-Row based Multi-threading in Decoder

Add a multi-threaded decode path that works one superblock (SB) row at a
time instead of one tile at a time. A job queue holds the parse, recon and
loop-filter jobs; parse and recon jobs are queued per SB row of each tile.

Performance Improvement:

Platform        Resolution      3 Threads       4 Threads
ARM             720p            36.81%          18.37%
                1080p           32.27%          14.76%

ARM improvement measured on a Nexus 6 (Snapdragon 805, quad-core @ 2.65 GHz).

Change-Id: I3d4dd7a932fc2904c90d9546b2de99c809afd29e
---
 vp9/common/vp9_enums.h         |   2 +
 vp9/common/vp9_thread_common.c |   6 +
 vp9/common/vp9_thread_common.h |   2 +-
 vp9/decoder/vp9_decodeframe.c  | 628 +++++++++++++++++++++++++++------
 vp9/decoder/vp9_decoder.c      |  21 +-
 vp9/decoder/vp9_decoder.h      |  33 +-
 vp9/decoder/vp9_job_queue.c    | 124 +++++++
 vp9/decoder/vp9_job_queue.h    |  45 +++
 vp9/vp9dx.mk                   |   2 +
 9 files changed, 746 insertions(+), 117 deletions(-)
 create mode 100644 vp9/decoder/vp9_job_queue.c
 create mode 100644 vp9/decoder/vp9_job_queue.h

diff --git a/vp9/common/vp9_enums.h b/vp9/common/vp9_enums.h
index bc665534d..b33a3a297 100644
--- a/vp9/common/vp9_enums.h
+++ b/vp9/common/vp9_enums.h
@@ -41,6 +41,8 @@ typedef enum BITSTREAM_PROFILE {
   MAX_PROFILES
 } BITSTREAM_PROFILE;
 
+// Pass selector for process_partition(); tested there as a bit mask.
+typedef enum PARSE_RECON_FLAG { PARSE = 1, RECON = 2 } PARSE_RECON_FLAG;
 #define BLOCK_4X4 0
 #define BLOCK_4X8 1
 #define BLOCK_8X4 2
diff --git a/vp9/common/vp9_thread_common.c b/vp9/common/vp9_thread_common.c
index ba9aa69d0..33fe25637 100644
--- a/vp9/common/vp9_thread_common.c
+++ b/vp9/common/vp9_thread_common.c
@@ -470,6 +470,12 @@ void vp9_set_row(VP9LfSync *lf_sync, int num_tiles, int row, int is_last_row,
 #endif  // CONFIG_MULTITHREAD
 }
 
+void vp9_loopfilter_job(LFWorkerData *lf_data, VP9LfSync *lf_sync) {
+  thread_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes,
+                          lf_data->start, lf_data->stop, lf_data->y_only,
+                          lf_sync);  // other args come from lf_data
+}
+
 // Accumulate frame counts.
 void vp9_accumulate_frame_counts(FRAME_COUNTS *accum,
                                  const FRAME_COUNTS *counts, int is_dec) {
diff --git a/vp9/common/vp9_thread_common.h b/vp9/common/vp9_thread_common.h
index b97e9ee13..1a2d79abd 100644
--- a/vp9/common/vp9_thread_common.h
+++ b/vp9/common/vp9_thread_common.h
@@ -70,7 +70,7 @@ void vp9_loopfilter_rows(LFWorkerData *lf_data, VP9LfSync *lf_sync);
 void vp9_set_row(VP9LfSync *lf_sync, int num_tiles, int row, int is_last_row,
                  int corrupted);
 
-void vp9_set_last_decoded_row(struct VP9Common *cm, int tile_col, int mi_row);
+void vp9_loopfilter_job(LFWorkerData *lf_data, VP9LfSync *lf_sync);
 
 void vp9_accumulate_frame_counts(struct FRAME_COUNTS *accum,
                                  const struct FRAME_COUNTS *counts, int is_dec);
diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c
index c9c85053d..f5050bcd8 100644
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@@ -42,6 +42,7 @@
 #include "vp9/decoder/vp9_decodemv.h"
 #include "vp9/decoder/vp9_decoder.h"
 #include "vp9/decoder/vp9_dsubexp.h"
+#include "vp9/decoder/vp9_job_queue.h"
 
 #define MAX_VP9_HEADER_SIZE 80
 
@@ -1027,7 +1028,6 @@ static void recon_block(TileWorkerData *twd, VP9Decoder *const pbi, int mi_row,
 static void parse_block(TileWorkerData *twd, VP9Decoder *const pbi, int mi_row,
                         int mi_col, BLOCK_SIZE bsize, int bwl, int bhl) {
   VP9_COMMON *const cm = &pbi->common;
-  const int less8x8 = bsize < BLOCK_8X8;
   const int bw = 1 << (bwl - 1);
   const int bh = 1 << (bhl - 1);
   const int x_mis = VPXMIN(bw, cm->mi_cols - mi_col);
@@ -1059,7 +1059,7 @@ static void parse_block(TileWorkerData *twd, VP9Decoder *const pbi, int mi_row,
       const int eobtotal =
           predict_recon_inter(xd, mi, twd, parse_inter_block_row_mt);
 
-      if (!less8x8 && eobtotal == 0) mi->skip = 1;  // skip loopfilter
+      if (bsize >= BLOCK_8X8 && eobtotal == 0) mi->skip = 1;  // skip loopfilter
     }
   }
 
@@ -1172,9 +1172,10 @@ static void decode_partition(TileWorkerData *twd, VP9Decoder *const pbi,
     dec_update_partition_context(twd, mi_row, mi_col, subsize, num_8x8_wh);
 }
 
-static void recon_partition(TileWorkerData *twd, VP9Decoder *const pbi,
-                            int mi_row, int mi_col, BLOCK_SIZE bsize,
-                            int n4x4_l2) {
+static void process_partition(TileWorkerData *twd, VP9Decoder *const pbi,
+                              int mi_row, int mi_col, BLOCK_SIZE bsize,
+                              int n4x4_l2, int parse_recon_flag,
+                              process_block_fn_t process_block) {
   VP9_COMMON *const cm = &pbi->common;
   const int n8x8_l2 = n4x4_l2 - 1;
   const int num_8x8_wh = 1 << n8x8_l2;
@@ -1187,60 +1188,10 @@ static void recon_partition(TileWorkerData *twd, VP9Decoder *const pbi,
 
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
 
-  partition = *xd->partition;
-  xd->partition++;
-
-  subsize = get_subsize(bsize, partition);
-  if (!hbs) {
-    // calculate bmode block dimensions (log 2)
-    xd->bmode_blocks_wl = 1 >> !!(partition & PARTITION_VERT);
-    xd->bmode_blocks_hl = 1 >> !!(partition & PARTITION_HORZ);
-    recon_block(twd, pbi, mi_row, mi_col, subsize, 1, 1);
-  } else {
-    switch (partition) {
-      case PARTITION_NONE:
-        recon_block(twd, pbi, mi_row, mi_col, subsize, n4x4_l2, n4x4_l2);
-        break;
-      case PARTITION_HORZ:
-        recon_block(twd, pbi, mi_row, mi_col, subsize, n4x4_l2, n8x8_l2);
-        if (has_rows)
-          recon_block(twd, pbi, mi_row + hbs, mi_col, subsize, n4x4_l2,
-                      n8x8_l2);
-        break;
-      case PARTITION_VERT:
-        recon_block(twd, pbi, mi_row, mi_col, subsize, n8x8_l2, n4x4_l2);
-        if (has_cols)
-          recon_block(twd, pbi, mi_row, mi_col + hbs, subsize, n8x8_l2,
-                      n4x4_l2);
-        break;
-      case PARTITION_SPLIT:
-        recon_partition(twd, pbi, mi_row, mi_col, subsize, n8x8_l2);
-        recon_partition(twd, pbi, mi_row, mi_col + hbs, subsize, n8x8_l2);
-        recon_partition(twd, pbi, mi_row + hbs, mi_col, subsize, n8x8_l2);
-        recon_partition(twd, pbi, mi_row + hbs, mi_col + hbs, subsize, n8x8_l2);
-        break;
-      default: assert(0 && "Invalid partition type");
-    }
+  if (parse_recon_flag & PARSE) {
+    *xd->partition =
+        read_partition(twd, mi_row, mi_col, has_rows, has_cols, n8x8_l2);
   }
-}
-
-static void parse_partition(TileWorkerData *twd, VP9Decoder *const pbi,
-                            int mi_row, int mi_col, BLOCK_SIZE bsize,
-                            int n4x4_l2) {
-  VP9_COMMON *const cm = &pbi->common;
-  const int n8x8_l2 = n4x4_l2 - 1;
-  const int num_8x8_wh = 1 << n8x8_l2;
-  const int hbs = num_8x8_wh >> 1;
-  PARTITION_TYPE partition;
-  BLOCK_SIZE subsize;
-  const int has_rows = (mi_row + hbs) < cm->mi_rows;
-  const int has_cols = (mi_col + hbs) < cm->mi_cols;
-  MACROBLOCKD *const xd = &twd->xd;
-
-  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
-
-  *xd->partition =
-      read_partition(twd, mi_row, mi_col, has_rows, has_cols, n8x8_l2);
 
   partition = *xd->partition;
   xd->partition++;
@@ -1250,38 +1201,44 @@ static void parse_partition(TileWorkerData *twd, VP9Decoder *const pbi,
     // calculate bmode block dimensions (log 2)
     xd->bmode_blocks_wl = 1 >> !!(partition & PARTITION_VERT);
     xd->bmode_blocks_hl = 1 >> !!(partition & PARTITION_HORZ);
-    parse_block(twd, pbi, mi_row, mi_col, subsize, 1, 1);
+    process_block(twd, pbi, mi_row, mi_col, subsize, 1, 1);
   } else {
     switch (partition) {
       case PARTITION_NONE:
-        parse_block(twd, pbi, mi_row, mi_col, subsize, n4x4_l2, n4x4_l2);
+        process_block(twd, pbi, mi_row, mi_col, subsize, n4x4_l2, n4x4_l2);
         break;
       case PARTITION_HORZ:
-        parse_block(twd, pbi, mi_row, mi_col, subsize, n4x4_l2, n8x8_l2);
+        process_block(twd, pbi, mi_row, mi_col, subsize, n4x4_l2, n8x8_l2);
         if (has_rows)
-          parse_block(twd, pbi, mi_row + hbs, mi_col, subsize, n4x4_l2,
-                      n8x8_l2);
+          process_block(twd, pbi, mi_row + hbs, mi_col, subsize, n4x4_l2,
+                        n8x8_l2);
         break;
       case PARTITION_VERT:
-        parse_block(twd, pbi, mi_row, mi_col, subsize, n8x8_l2, n4x4_l2);
+        process_block(twd, pbi, mi_row, mi_col, subsize, n8x8_l2, n4x4_l2);
         if (has_cols)
-          parse_block(twd, pbi, mi_row, mi_col + hbs, subsize, n8x8_l2,
-                      n4x4_l2);
+          process_block(twd, pbi, mi_row, mi_col + hbs, subsize, n8x8_l2,
+                        n4x4_l2);
         break;
       case PARTITION_SPLIT:
-        parse_partition(twd, pbi, mi_row, mi_col, subsize, n8x8_l2);
-        parse_partition(twd, pbi, mi_row, mi_col + hbs, subsize, n8x8_l2);
-        parse_partition(twd, pbi, mi_row + hbs, mi_col, subsize, n8x8_l2);
-        parse_partition(twd, pbi, mi_row + hbs, mi_col + hbs, subsize, n8x8_l2);
+        process_partition(twd, pbi, mi_row, mi_col, subsize, n8x8_l2,
+                          parse_recon_flag, process_block);
+        process_partition(twd, pbi, mi_row, mi_col + hbs, subsize, n8x8_l2,
+                          parse_recon_flag, process_block);
+        process_partition(twd, pbi, mi_row + hbs, mi_col, subsize, n8x8_l2,
+                          parse_recon_flag, process_block);
+        process_partition(twd, pbi, mi_row + hbs, mi_col + hbs, subsize,
+                          n8x8_l2, parse_recon_flag, process_block);
         break;
       default: assert(0 && "Invalid partition type");
     }
   }
 
-  // update partition context
-  if (bsize >= BLOCK_8X8 &&
-      (bsize == BLOCK_8X8 || partition != PARTITION_SPLIT))
-    dec_update_partition_context(twd, mi_row, mi_col, subsize, num_8x8_wh);
+  if (parse_recon_flag & PARSE) {
+    // update partition context
+    if ((bsize == BLOCK_8X8 || partition != PARTITION_SPLIT) &&
+        bsize >= BLOCK_8X8)
+      dec_update_partition_context(twd, mi_row, mi_col, subsize, num_8x8_wh);
+  }
 }
 
 static void setup_token_decoder(const uint8_t *data, const uint8_t *data_end,
@@ -1688,6 +1645,306 @@ static void get_tile_buffers(VP9Decoder *pbi, const uint8_t *data,
   }
 }
 
+// Sets recon_map[idx] under map_mutex so waiters in map_read() can proceed.
+static void map_write(RowMTWorkerData *row_mt_worker_data, int idx) {
+#if !CONFIG_MULTITHREAD
+  (void)row_mt_worker_data, (void)idx;
+#else
+  pthread_mutex_lock(&row_mt_worker_data->map_mutex);
+  row_mt_worker_data->recon_map[idx] = 1;
+  pthread_mutex_unlock(&row_mt_worker_data->map_mutex);
+#endif
+}
+
+// Polls recon_map[idx] under map_mutex, yielding until it becomes nonzero.
+static void map_read(RowMTWorkerData *row_mt_worker_data, int idx) {
+#if !CONFIG_MULTITHREAD
+  (void)row_mt_worker_data, (void)idx;
+#else
+  volatile int8_t *const flag = &row_mt_worker_data->recon_map[idx];
+  pthread_mutex_lock(&row_mt_worker_data->map_mutex);
+  while (*flag == 0) {
+    pthread_mutex_unlock(&row_mt_worker_data->map_mutex);
+    sched_yield();
+    pthread_mutex_lock(&row_mt_worker_data->map_mutex);
+  }
+  pthread_mutex_unlock(&row_mt_worker_data->map_mutex);
+#endif
+}
+
+// Counts one more tile as finished for SB row `row`. Returns 1 when that makes
+// all `num_tile_cols` tiles done, 0 otherwise (also 0 once lf_sync is flagged
+// corrupted, or when built without CONFIG_MULTITHREAD).
+static int lpf_map_write_check(VP9LfSync *lf_sync, int row, int num_tile_cols) {
+  int all_tiles_done = 0;
+#if !CONFIG_MULTITHREAD
+  (void)lf_sync, (void)row, (void)num_tile_cols;
+#else
+  int frame_corrupted;
+  pthread_mutex_lock(&lf_sync->lf_mutex);
+  frame_corrupted = lf_sync->corrupted;
+  pthread_mutex_unlock(&lf_sync->lf_mutex);
+  if (!frame_corrupted) {
+    pthread_mutex_lock(&lf_sync->recon_done_mutex[row]);
+    all_tiles_done = ++lf_sync->num_tiles_done[row] == num_tile_cols;
+    pthread_mutex_unlock(&lf_sync->recon_done_mutex[row]);
+  }
+#endif
+  return all_tiles_done;
+}
+
+// Records that one more tile column has finished (or was abandoned after an
+// error) and terminates the job queue once every tile column has reported.
+static void vp9_tile_done(VP9Decoder *pbi) {
+#if !CONFIG_MULTITHREAD
+  (void)pbi;
+#else
+  RowMTWorkerData *const rmt = pbi->row_mt_worker_data;
+  const int num_tile_cols = 1 << pbi->common.log2_tile_cols;
+  int is_last_tile;
+  pthread_mutex_lock(&rmt->recon_mutex);
+  is_last_tile = ++rmt->num_tiles_done == num_tile_cols;
+  pthread_mutex_unlock(&rmt->recon_mutex);
+  // Terminate after releasing recon_mutex, and only from the last tile.
+  if (is_last_tile) vp9_jobq_terminate(&rmt->jobq);
+#endif
+}
+
+// Grows the job queue buffer (never shrinks it) so it can hold a parse and a
+// recon job per tile column per SB row plus one loop-filter job per SB row.
+void vp9_jobq_alloc(VP9Decoder *pbi) {
+  VP9_COMMON *const cm = &pbi->common;
+  RowMTWorkerData *const row_mt_worker_data = pbi->row_mt_worker_data;
+  const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2;
+  const int tile_cols = 1 << cm->log2_tile_cols;
+  const size_t jobq_size = (tile_cols * sb_rows * 2 + sb_rows) * sizeof(Job);
+  if (jobq_size <= row_mt_worker_data->jobq_size) return;
+  vpx_free(row_mt_worker_data->jobq_buf);
+  row_mt_worker_data->jobq_size = 0;  // no stale size if the alloc fails
+  CHECK_MEM_ERROR(cm, row_mt_worker_data->jobq_buf, vpx_calloc(1, jobq_size));
+  vp9_jobq_init(&row_mt_worker_data->jobq, row_mt_worker_data->jobq_buf,
+                jobq_size);
+  row_mt_worker_data->jobq_size = jobq_size;
+}
+
+// Reconstructs one SB row (starting at mi_row) of the tile set in tile_data,
+// using the eobs, coefficients and partitions stored by the parse pass.
+// Every SB first waits for the SB directly above it and is published in the
+// recon map afterwards. When loop filtering is on, the tile that completes
+// an SB row last queues the LPF job for the row above it (and, on the last
+// SB row, for that row as well).
+static void recon_tile_row(TileWorkerData *tile_data, VP9Decoder *pbi,
+                           int mi_row, int is_last_row, VP9LfSync *lf_sync) {
+  VP9_COMMON *const cm = &pbi->common;
+  RowMTWorkerData *const rmt = pbi->row_mt_worker_data;
+  MACROBLOCKD *const xd = &tile_data->xd;
+  const int num_tile_cols = 1 << cm->log2_tile_cols;
+  const int sb_cols = mi_cols_aligned_to_sb(cm->mi_cols) >> MI_BLOCK_SIZE_LOG2;
+  const int sb_row = mi_row >> MI_BLOCK_SIZE_LOG2;
+  const int col_begin = xd->tile.mi_col_start;
+  const int col_end = xd->tile.mi_col_end;
+  int mi_col;
+
+  vp9_zero(xd->left_context);
+  vp9_zero(xd->left_seg_context);
+  for (mi_col = col_begin; mi_col < col_end; mi_col += MI_BLOCK_SIZE) {
+    const int sb_col = mi_col >> MI_BLOCK_SIZE_LOG2;
+    const int sb_idx = sb_row * sb_cols + sb_col;
+    int plane;
+
+    // Top dependency: the SB above must already be reconstructed.
+    if (sb_row > 0) map_read(rmt, sb_idx - sb_cols);
+
+    // Point xd at this SB's slot in the buffers filled by the parse pass.
+    for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+      xd->plane[plane].eob = rmt->eob[plane] + (sb_idx << EOBS_PER_SB_LOG2);
+      xd->plane[plane].dqcoeff =
+          rmt->dqcoeff[plane] + (sb_idx << DQCOEFFS_PER_SB_LOG2);
+    }
+    xd->partition = rmt->partition + sb_idx * PARTITIONS_PER_SB;
+    process_partition(tile_data, pbi, mi_row, mi_col, BLOCK_64X64, 4, RECON,
+                      recon_block);
+
+    // Only the tile's last SB can complete the row, and LPF jobs are only
+    // wanted when loop filtering is enabled for the frame.
+    if (mi_col + MI_BLOCK_SIZE >= col_end && cm->lf.filter_level &&
+        !cm->skip_loop_filter &&
+        lpf_map_write_check(lf_sync, sb_row, num_tile_cols)) {
+      Job lpf_job;
+      lpf_job.job_type = LPF_JOB;
+      if (sb_row > 0) {
+        // The row above can be filtered now that all tiles finished this row.
+        lpf_job.row_num = mi_row - MI_BLOCK_SIZE;
+        vp9_jobq_queue(&rmt->jobq, &lpf_job, sizeof(lpf_job));
+      }
+      if (is_last_row) {
+        // No later row will trigger it, so queue the last row's own job too.
+        lpf_job.row_num = mi_row;
+        vp9_jobq_queue(&rmt->jobq, &lpf_job, sizeof(lpf_job));
+      }
+    }
+
+    // Publish this SB so the SB below it may start.
+    map_write(rmt, sb_idx);
+  }
+}
+
+// Parses one SB row (starting at mi_row) of tile column cur_tile_col, storing
+// each SB's eobs, coefficients and partitions in the row-MT buffers for the
+// recon pass. *data_end is only read when the tile's bit reader is set up.
+static void parse_tile_row(TileWorkerData *tile_data, VP9Decoder *pbi,
+                           int mi_row, int cur_tile_col, uint8_t **data_end) {
+  VP9_COMMON *const cm = &pbi->common;
+  RowMTWorkerData *const rmt = pbi->row_mt_worker_data;
+  MACROBLOCKD *const xd = &tile_data->xd;
+  TileInfo *const tile = &xd->tile;
+  const int sb_cols = mi_cols_aligned_to_sb(cm->mi_cols) >> MI_BLOCK_SIZE_LOG2;
+  const int sb_row = mi_row >> MI_BLOCK_SIZE_LOG2;
+  int mi_col;
+
+  vp9_zero(tile_data->dqcoeff);
+  vp9_tile_init(tile, cm, 0, cur_tile_col);
+  // The bit reader is set up only once per tile, on its first SB row.
+  if (mi_row == 0) {
+    TileBuffer *const buf = &pbi->tile_buffers[cur_tile_col];
+    setup_token_decoder(buf->data, *data_end, buf->size, &tile_data->error_info,
+                        &tile_data->bit_reader, pbi->decrypt_cb,
+                        pbi->decrypt_state);
+  }
+  vp9_init_macroblockd(cm, xd, tile_data->dqcoeff);
+  xd->error_info = &tile_data->error_info;
+  vp9_zero(xd->left_context);
+  vp9_zero(xd->left_seg_context);
+  for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
+       mi_col += MI_BLOCK_SIZE) {
+    const int sb_idx = sb_row * sb_cols + (mi_col >> MI_BLOCK_SIZE_LOG2);
+    int plane;
+    // Direct this SB's parse output to its slot in the row-MT buffers.
+    for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+      xd->plane[plane].eob = rmt->eob[plane] + (sb_idx << EOBS_PER_SB_LOG2);
+      xd->plane[plane].dqcoeff =
+          rmt->dqcoeff[plane] + (sb_idx << DQCOEFFS_PER_SB_LOG2);
+    }
+    xd->partition = rmt->partition + sb_idx * PARTITIONS_PER_SB;
+    process_partition(tile_data, pbi, mi_row, mi_col, BLOCK_64X64, 4, PARSE,
+                      parse_block);
+  }
+}
+
+// Worker entry point for row-based MT: pulls jobs off the shared queue until
+// vp9_jobq_dequeue() reports nonzero, running each as an LPF, recon or parse
+// job. Returns 0 if this worker hit an error in any job, 1 otherwise.
+static int row_decode_worker_hook(ThreadData *const thread_data,
+                                  uint8_t **data_end) {
+  VP9Decoder *const pbi = thread_data->pbi;
+  VP9_COMMON *const cm = &pbi->common;
+  RowMTWorkerData *const row_mt_worker_data = pbi->row_mt_worker_data;
+  const int aligned_cols = mi_cols_aligned_to_sb(cm->mi_cols);
+  const int aligned_rows = mi_cols_aligned_to_sb(cm->mi_rows);
+  const int sb_rows = aligned_rows >> MI_BLOCK_SIZE_LOG2;
+  Job job;
+  LFWorkerData *lf_data = thread_data->lf_data;
+  VP9LfSync *lf_sync = thread_data->lf_sync;
+  volatile int corrupted = 0;  // volatile: value must survive longjmp()
+
+  while (!vp9_jobq_dequeue(&row_mt_worker_data->jobq, &job, sizeof(job), 1)) {
+    int mi_col;
+    const int mi_row = job.row_num;
+
+    if (job.job_type == LPF_JOB) {
+      lf_data->start = mi_row;
+      lf_data->stop = lf_data->start + MI_BLOCK_SIZE;
+      if (cm->lf.filter_level && !cm->skip_loop_filter &&
+          mi_row < cm->mi_rows) {
+        vp9_loopfilter_job(lf_data, lf_sync);
+      }
+    } else if (job.job_type == RECON_JOB) {
+      const int cur_sb_row = mi_row >> MI_BLOCK_SIZE_LOG2;
+      const int is_last_row = sb_rows - 1 == cur_sb_row;
+      TileWorkerData twd_recon;
+      TileWorkerData *const tile_data_recon = &twd_recon;
+      int mi_col_start, mi_col_end;
+
+      tile_data_recon->xd = pbi->mb;
+      vp9_tile_init(&tile_data_recon->xd.tile, cm, 0, job.tile_col);
+      vp9_init_macroblockd(cm, &tile_data_recon->xd, tile_data_recon->dqcoeff);
+      mi_col_start = tile_data_recon->xd.tile.mi_col_start;
+      mi_col_end = tile_data_recon->xd.tile.mi_col_end;
+      // Error path (entered via longjmp): release waiters on this SB row.
+      if (setjmp(tile_data_recon->error_info.jmp)) {
+        const int sb_cols = aligned_cols >> MI_BLOCK_SIZE_LOG2;
+        tile_data_recon->error_info.setjmp = 0;
+        corrupted = 1;
+        for (mi_col = mi_col_start; mi_col < mi_col_end;
+             mi_col += MI_BLOCK_SIZE) {
+          const int c = mi_col >> MI_BLOCK_SIZE_LOG2;
+          map_write(row_mt_worker_data, (cur_sb_row * sb_cols) + c);
+        }
+        if (is_last_row) {
+          vp9_tile_done(pbi);
+        }
+        continue;
+      }
+
+      tile_data_recon->error_info.setjmp = 1;
+      tile_data_recon->xd.error_info = &tile_data_recon->error_info;
+
+      recon_tile_row(tile_data_recon, pbi, mi_row, is_last_row, lf_sync);
+      // `corrupted` is sticky: once set, later recon jobs here also error out.
+      if (corrupted)
+        vpx_internal_error(&tile_data_recon->error_info,
+                           VPX_CODEC_CORRUPT_FRAME,
+                           "Failed to decode tile data");
+
+      if (is_last_row) {
+        vp9_tile_done(pbi);
+      }
+    } else if (job.job_type == PARSE_JOB) {
+      TileWorkerData *const tile_data = &pbi->tile_worker_data[job.tile_col];
+      // A parse error ends this tile: mark it done and queue no more jobs.
+      if (setjmp(tile_data->error_info.jmp)) {
+        tile_data->error_info.setjmp = 0;
+        corrupted = 1;
+        vp9_tile_done(pbi);
+        continue;
+      }
+
+      tile_data->xd = pbi->mb;
+      tile_data->xd.counts =
+          cm->frame_parallel_decoding_mode ? 0 : &tile_data->counts;
+      tile_data->error_info.setjmp = 1;
+      parse_tile_row(tile_data, pbi, mi_row, job.tile_col, data_end);
+
+      corrupted |= tile_data->xd.corrupted;
+      if (corrupted)
+        vpx_internal_error(&tile_data->error_info, VPX_CODEC_CORRUPT_FRAME,
+                           "Failed to decode tile data");
+
+      /* Queue in the recon_job for this row */
+      {
+        Job recon_job;
+        recon_job.row_num = mi_row;
+        recon_job.tile_col = job.tile_col;
+        recon_job.job_type = RECON_JOB;
+        vp9_jobq_queue(&row_mt_worker_data->jobq, &recon_job,
+                       sizeof(recon_job));
+      }
+
+      /* Queue next parse job */
+      if (mi_row + MI_BLOCK_SIZE < cm->mi_rows) {
+        Job parse_job;
+        parse_job.row_num = mi_row + MI_BLOCK_SIZE;
+        parse_job.tile_col = job.tile_col;
+        parse_job.job_type = PARSE_JOB;
+        vp9_jobq_queue(&row_mt_worker_data->jobq, &parse_job,
+                       sizeof(parse_job));
+      }
+    }
+  }
+
+  return !corrupted;
+}
+
 static const uint8_t *decode_tiles(VP9Decoder *pbi, const uint8_t *data,
                                    const uint8_t *data_end) {
   VP9_COMMON *const cm = &pbi->common;
@@ -1775,7 +2032,8 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi, const uint8_t *data,
                   row_mt_worker_data->dqcoeff[plane];
             }
             tile_data->xd.partition = row_mt_worker_data->partition;
-            parse_partition(tile_data, pbi, mi_row, mi_col, BLOCK_64X64, 4);
+            process_partition(tile_data, pbi, mi_row, mi_col, BLOCK_64X64, 4,
+                              PARSE, parse_block);
 
             for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
               tile_data->xd.plane[plane].eob = row_mt_worker_data->eob[plane];
@@ -1783,7 +2041,8 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi, const uint8_t *data,
                   row_mt_worker_data->dqcoeff[plane];
             }
             tile_data->xd.partition = row_mt_worker_data->partition;
-            recon_partition(tile_data, pbi, mi_row, mi_col, BLOCK_64X64, 4);
+            process_partition(tile_data, pbi, mi_row, mi_col, BLOCK_64X64, 4,
+                              RECON, recon_block);
           } else {
             decode_partition(tile_data, pbi, mi_row, mi_col, BLOCK_64X64, 4);
           }
@@ -1951,22 +2210,12 @@ static int compare_tile_buffers(const void *a, const void *b) {
   return (buf_a->size < buf_b->size) - (buf_a->size > buf_b->size);
 }
 
-static const uint8_t *decode_tiles_mt(VP9Decoder *pbi, const uint8_t *data,
-                                      const uint8_t *data_end) {
+static INLINE void init_mt(VP9Decoder *pbi) {
+  int n;
   VP9_COMMON *const cm = &pbi->common;
-  const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
-  const uint8_t *bit_reader_end = NULL;
   VP9LfSync *lf_row_sync = &pbi->lf_row_sync;
-  YV12_BUFFER_CONFIG *const new_fb = get_frame_new_buffer(cm);
   const int aligned_mi_cols = mi_cols_aligned_to_sb(cm->mi_cols);
-  const int tile_cols = 1 << cm->log2_tile_cols;
-  const int tile_rows = 1 << cm->log2_tile_rows;
-  const int num_workers = VPXMIN(pbi->max_threads, tile_cols);
-  int n;
-
-  assert(tile_cols <= (1 << 6));
-  assert(tile_rows == 1);
-  (void)tile_rows;
+  const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
 
   if (pbi->num_tile_workers == 0) {
     const int num_threads = pbi->max_threads;
@@ -1985,11 +2234,159 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi, const uint8_t *data,
   }
 
   // Initialize LPF
-  if (pbi->lpf_mt_opt && cm->lf.filter_level && !cm->skip_loop_filter) {
+  if ((pbi->lpf_mt_opt || pbi->row_mt) && cm->lf.filter_level &&
+      !cm->skip_loop_filter) {
     vp9_lpf_mt_init(lf_row_sync, cm, cm->lf.filter_level,
                     pbi->num_tile_workers);
   }
 
+  // Note: this memset assumes above_context[0], [1] and [2]
+  // are allocated as part of the same buffer.
+  memset(cm->above_context, 0,
+         sizeof(*cm->above_context) * MAX_MB_PLANE * 2 * aligned_mi_cols);
+
+  memset(cm->above_seg_context, 0,
+         sizeof(*cm->above_seg_context) * aligned_mi_cols);
+
+  vp9_reset_lfm(cm);
+}
+
+// Row-MT tile decode: queues a parse job for SB row 0 of every tile column,
+// runs row_decode_worker_hook on all workers and waits for them, then
+// accumulates the per-tile counts. Returns the last tile's bit-reader end.
+const uint8_t *decode_tiles_row_wise_mt(VP9Decoder *pbi, const uint8_t *data,
+                                        const uint8_t *data_end) {
+  VP9_COMMON *const cm = &pbi->common;
+  RowMTWorkerData *const row_mt_worker_data = pbi->row_mt_worker_data;
+  const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
+  const int tile_cols = 1 << cm->log2_tile_cols;
+  const int tile_rows = 1 << cm->log2_tile_rows;
+  const int num_workers = pbi->max_threads;
+  int i, n;
+  int col;
+  int corrupted = 0;
+  const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2;
+  const int sb_cols = mi_cols_aligned_to_sb(cm->mi_cols) >> MI_BLOCK_SIZE_LOG2;
+  VP9LfSync *lf_row_sync = &pbi->lf_row_sync;
+  YV12_BUFFER_CONFIG *const new_fb = get_frame_new_buffer(cm);
+
+  assert(tile_cols <= (1 << 6));
+  assert(tile_rows == 1);
+  (void)tile_rows;
+
+  memset(row_mt_worker_data->recon_map, 0,
+         sb_rows * sb_cols * sizeof(*row_mt_worker_data->recon_map));
+  init_mt(pbi);
+
+  // Reset tile decoding hook
+  for (n = 0; n < num_workers; ++n) {
+    VPxWorker *const worker = &pbi->tile_workers[n];
+    ThreadData *const thread_data = &pbi->row_mt_worker_data->thread_data[n];
+    winterface->sync(worker);
+    if (cm->lf.filter_level && !cm->skip_loop_filter) {
+      thread_data->lf_sync = lf_row_sync;
+      thread_data->lf_data = &thread_data->lf_sync->lfdata[n];
+      vp9_loop_filter_data_reset(thread_data->lf_data, new_fb, cm,
+                                 pbi->mb.plane);
+    }
+    thread_data->pbi = pbi;
+    worker->hook = (VPxWorkerHook)row_decode_worker_hook;
+    worker->data1 = thread_data;
+    worker->data2 = &row_mt_worker_data->data_end;
+  }
+
+  for (col = 0; col < tile_cols; ++col) {
+    TileWorkerData *const tile_data = &pbi->tile_worker_data[col];
+    tile_data->xd = pbi->mb;
+    tile_data->xd.counts =
+        cm->frame_parallel_decoding_mode ? NULL : &tile_data->counts;
+  }
+
+  /* Reset the jobq to start of the jobq buffer */
+  vp9_jobq_reset(&row_mt_worker_data->jobq);
+  row_mt_worker_data->num_tiles_done = 0;
+  // NOTE(review): data2 points at this field, so parse jobs hand this NULL to
+  // setup_token_decoder() as its data_end argument -- confirm this is intended.
+  row_mt_worker_data->data_end = NULL;
+
+  // Load tile data into tile_buffers
+  get_tile_buffers(pbi, data, data_end, tile_cols, tile_rows,
+                   &pbi->tile_buffers);
+
+  // Initialize thread frame counts.
+  if (!cm->frame_parallel_decoding_mode) {
+    for (col = 0; col < tile_cols; ++col) {
+      TileWorkerData *const tile_data =
+          (TileWorkerData *)&pbi->tile_worker_data[col];
+      vp9_zero(tile_data->counts);
+    }
+  }
+
+  // queue parse jobs for 0th row of every tile
+  for (col = 0; col < tile_cols; ++col) {
+    Job parse_job;
+    parse_job.row_num = 0;
+    parse_job.tile_col = col;
+    parse_job.job_type = PARSE_JOB;
+    vp9_jobq_queue(&row_mt_worker_data->jobq, &parse_job, sizeof(parse_job));
+  }
+
+  for (i = 0; i < num_workers; ++i) {
+    VPxWorker *const worker = &pbi->tile_workers[i];
+    worker->had_error = 0;
+    if (i == num_workers - 1) {
+      winterface->execute(worker);
+    } else {
+      winterface->launch(worker);
+    }
+  }
+
+  for (; n > 0; --n) {  // n still equals num_workers from the setup loop
+    VPxWorker *const worker = &pbi->tile_workers[n - 1];
+    // TODO(jzern): The tile may have specific error data associated with
+    // its vpx_internal_error_info which could be propagated to the main info
+    // in cm. Additionally once the threads have been synced and an error is
+    // detected, there's no point in continuing to decode tiles.
+    corrupted |= !winterface->sync(worker);
+  }
+  pbi->mb.corrupted = corrupted;
+
+  {
+    /* Set data end */
+    TileWorkerData *const tile_data = &pbi->tile_worker_data[tile_cols - 1];
+    row_mt_worker_data->data_end = vpx_reader_find_end(&tile_data->bit_reader);
+  }
+
+  // Accumulate thread frame counts.
+  if (!cm->frame_parallel_decoding_mode) {
+    for (i = 0; i < tile_cols; ++i) {
+      TileWorkerData *const tile_data =
+          (TileWorkerData *)&pbi->tile_worker_data[i];
+      vp9_accumulate_frame_counts(&cm->counts, &tile_data->counts, 1);
+    }
+  }
+
+  return row_mt_worker_data->data_end;
+}
+
+static const uint8_t *decode_tiles_mt(VP9Decoder *pbi, const uint8_t *data,
+                                      const uint8_t *data_end) {
+  VP9_COMMON *const cm = &pbi->common;
+  const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
+  const uint8_t *bit_reader_end = NULL;
+  VP9LfSync *lf_row_sync = &pbi->lf_row_sync;
+  YV12_BUFFER_CONFIG *const new_fb = get_frame_new_buffer(cm);
+  const int tile_cols = 1 << cm->log2_tile_cols;
+  const int tile_rows = 1 << cm->log2_tile_rows;
+  const int num_workers = VPXMIN(pbi->max_threads, tile_cols);
+  int n;
+
+  assert(tile_cols <= (1 << 6));
+  assert(tile_rows == 1);
+  (void)tile_rows;
+
+  init_mt(pbi);
+
   // Reset tile decoding hook
   for (n = 0; n < num_workers; ++n) {
     VPxWorker *const worker = &pbi->tile_workers[n];
@@ -2012,15 +2409,6 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi, const uint8_t *data,
     worker->data2 = pbi;
   }
 
-  // Note: this memset assumes above_context[0], [1] and [2]
-  // are allocated as part of the same buffer.
-  memset(cm->above_context, 0,
-         sizeof(*cm->above_context) * MAX_MB_PLANE * 2 * aligned_mi_cols);
-  memset(cm->above_seg_context, 0,
-         sizeof(*cm->above_seg_context) * aligned_mi_cols);
-
-  vp9_reset_lfm(cm);
-
   // Load tile data into tile_buffers
   get_tile_buffers(pbi, data, data_end, tile_cols, tile_rows,
                    &pbi->tile_buffers);
@@ -2370,6 +2758,10 @@ static size_t read_uncompressed_header(VP9Decoder *pbi,
     if (pbi->row_mt_worker_data == NULL) {
       CHECK_MEM_ERROR(cm, pbi->row_mt_worker_data,
                       vpx_calloc(1, sizeof(*pbi->row_mt_worker_data)));
+#if CONFIG_MULTITHREAD
+      pthread_mutex_init(&pbi->row_mt_worker_data->recon_mutex, NULL);
+      pthread_mutex_init(&pbi->row_mt_worker_data->map_mutex, NULL);
+#endif
     }
 
     if (pbi->max_threads > 1) {
@@ -2383,8 +2775,10 @@ static size_t read_uncompressed_header(VP9Decoder *pbi,
 
     if (num_sbs > pbi->row_mt_worker_data->num_sbs) {
       vp9_dec_free_row_mt_mem(pbi->row_mt_worker_data);
-      vp9_dec_alloc_row_mt_mem(pbi->row_mt_worker_data, cm, num_sbs);
+      vp9_dec_alloc_row_mt_mem(pbi->row_mt_worker_data, cm, num_sbs,
+                               pbi->max_threads);
     }
+    vp9_jobq_alloc(pbi);
   }
   sz = vpx_rb_read_literal(rb, 16);
 
@@ -2544,21 +2938,27 @@ void vp9_decode_frame(VP9Decoder *pbi, const uint8_t *data,
     pbi->total_tiles = tile_rows * tile_cols;
   }
 
-  if (pbi->max_threads > 1 && tile_rows == 1 && tile_cols > 1) {
-    // Multi-threaded tile decoder
-    *p_data_end = decode_tiles_mt(pbi, data + first_partition_size, data_end);
-    if (!pbi->lpf_mt_opt) {
-      if (!xd->corrupted) {
-        if (!cm->skip_loop_filter) {
-          // If multiple threads are used to decode tiles, then we use those
-          // threads to do parallel loopfiltering.
-          vp9_loop_filter_frame_mt(new_fb, cm, pbi->mb.plane,
-                                   cm->lf.filter_level, 0, 0, pbi->tile_workers,
-                                   pbi->num_tile_workers, &pbi->lf_row_sync);
+  if (pbi->max_threads > 1 && tile_rows == 1 &&
+      (tile_cols > 1 || pbi->row_mt == 1)) {
+    if (pbi->row_mt == 1) {
+      *p_data_end =
+          decode_tiles_row_wise_mt(pbi, data + first_partition_size, data_end);
+    } else {
+      // Multi-threaded tile decoder
+      *p_data_end = decode_tiles_mt(pbi, data + first_partition_size, data_end);
+      if (!pbi->lpf_mt_opt) {
+        if (!xd->corrupted) {
+          if (!cm->skip_loop_filter) {
+            // If multiple threads are used to decode tiles, then we use those
+            // threads to do parallel loopfiltering.
+            vp9_loop_filter_frame_mt(
+                new_fb, cm, pbi->mb.plane, cm->lf.filter_level, 0, 0,
+                pbi->tile_workers, pbi->num_tile_workers, &pbi->lf_row_sync);
+          }
+        } else {
+          vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+                             "Decode failed. Frame data is corrupted.");
         }
-      } else {
-        vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
-                           "Decode failed. Frame data is corrupted.");
       }
     }
   } else {
diff --git a/vp9/decoder/vp9_decoder.c b/vp9/decoder/vp9_decoder.c
index 1e2a44293..a352cffb9 100644
--- a/vp9/decoder/vp9_decoder.c
+++ b/vp9/decoder/vp9_decoder.c
@@ -56,7 +56,7 @@ static void vp9_dec_setup_mi(VP9_COMMON *cm) {
 }
 
 void vp9_dec_alloc_row_mt_mem(RowMTWorkerData *row_mt_worker_data,
-                              VP9_COMMON *cm, int num_sbs) {
+                              VP9_COMMON *cm, int num_sbs, int max_threads) {
   int plane;
   const size_t dqcoeff_size = (num_sbs << DQCOEFFS_PER_SB_LOG2) *
                               sizeof(*row_mt_worker_data->dqcoeff[0]);
@@ -74,6 +74,14 @@ void vp9_dec_alloc_row_mt_mem(RowMTWorkerData *row_mt_worker_data,
                              sizeof(*row_mt_worker_data->partition)));
   CHECK_MEM_ERROR(cm, row_mt_worker_data->recon_map,
                   vpx_calloc(num_sbs, sizeof(*row_mt_worker_data->recon_map)));
+
+  // allocate memory for thread_data
+  if (row_mt_worker_data->thread_data == NULL) {
+    const size_t thread_size =
+        max_threads * sizeof(*row_mt_worker_data->thread_data);
+    CHECK_MEM_ERROR(cm, row_mt_worker_data->thread_data,
+                    vpx_memalign(32, thread_size));
+  }
 }
 
 void vp9_dec_free_row_mt_mem(RowMTWorkerData *row_mt_worker_data) {
@@ -89,6 +97,8 @@ void vp9_dec_free_row_mt_mem(RowMTWorkerData *row_mt_worker_data) {
     row_mt_worker_data->partition = NULL;
     vpx_free(row_mt_worker_data->recon_map);
     row_mt_worker_data->recon_map = NULL;
+    vpx_free(row_mt_worker_data->thread_data);
+    row_mt_worker_data->thread_data = NULL;
   }
 }
 
@@ -179,8 +189,17 @@ void vp9_decoder_remove(VP9Decoder *pbi) {
 
   if (pbi->row_mt == 1) {
     vp9_dec_free_row_mt_mem(pbi->row_mt_worker_data);
+    if (pbi->row_mt_worker_data != NULL) {
+      vp9_jobq_deinit(&pbi->row_mt_worker_data->jobq);
+      vpx_free(pbi->row_mt_worker_data->jobq_buf);
+#if CONFIG_MULTITHREAD
+      pthread_mutex_destroy(&pbi->row_mt_worker_data->recon_mutex);
+      pthread_mutex_destroy(&pbi->row_mt_worker_data->map_mutex);
+#endif
+    }
     vpx_free(pbi->row_mt_worker_data);
   }
+
   vp9_remove_common(&pbi->common);
   vpx_free(pbi);
 }
diff --git a/vp9/decoder/vp9_decoder.h b/vp9/decoder/vp9_decoder.h
index 9a582fffb..ad39bc03d 100644
--- a/vp9/decoder/vp9_decoder.h
+++ b/vp9/decoder/vp9_decoder.h
@@ -21,6 +21,7 @@
 #include "vp9/common/vp9_thread_common.h"
 #include "vp9/common/vp9_onyxc_int.h"
 #include "vp9/common/vp9_ppflags.h"
+#include "./vp9_job_queue.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -30,6 +31,14 @@ extern "C" {
 #define DQCOEFFS_PER_SB_LOG2 12
 #define PARTITIONS_PER_SB 85
 
+typedef enum JobType { PARSE_JOB, RECON_JOB, LPF_JOB } JobType;
+
+typedef struct ThreadData {
+  struct VP9Decoder *pbi;  // decoder instance
+  LFWorkerData *lf_data;   // NOTE(review): presumably the arguments for
+  VP9LfSync *lf_sync;      // vp9_loopfilter_job(lf_data, lf_sync) - confirm
+} ThreadData;
+
 typedef struct TileBuffer {
   const uint8_t *data;
   size_t size;
@@ -49,14 +58,36 @@ typedef struct TileWorkerData {
   struct vpx_internal_error_info error_info;
 } TileWorkerData;
 
+typedef void (*process_block_fn_t)(TileWorkerData *twd,
+                                   struct VP9Decoder *const pbi, int mi_row,
+                                   int mi_col, BLOCK_SIZE bsize, int bwl,
+                                   int bhl);
+
 typedef struct RowMTWorkerData {
   int num_sbs;
   int *eob[MAX_MB_PLANE];
   PARTITION_TYPE *partition;
   tran_low_t *dqcoeff[MAX_MB_PLANE];
   int8_t *recon_map;
+  const uint8_t *data_end;  // NOTE(review): presumably end of frame data
+  uint8_t *jobq_buf;        // released with vpx_free() in vp9_decoder_remove()
+  JobQueueRowMt jobq;       // deinitialized in vp9_decoder_remove()
+  size_t jobq_size;         // NOTE(review): presumably size of jobq_buf
+  int num_tiles_done;
+#if CONFIG_MULTITHREAD
+  pthread_mutex_t recon_mutex;  // both mutexes are initialized in
+  pthread_mutex_t map_mutex;    // read_uncompressed_header()
+#endif
+  ThreadData *thread_data;  // array of max_threads entries
 } RowMTWorkerData;
 
+/* Record describing one row decode job, as queued to and dequeued from jobq */
+typedef struct Job {
+  int row_num;       // NOTE(review): presumably the SB row index - confirm
+  int tile_col;      // NOTE(review): presumably the tile column - confirm
+  JobType job_type;  // PARSE_JOB, RECON_JOB or LPF_JOB
+} Job;
+
 typedef struct VP9Decoder {
   DECLARE_ALIGNED(16, MACROBLOCKD, mb);
 
@@ -128,7 +159,7 @@ struct VP9Decoder *vp9_decoder_create(BufferPool *const pool);
 void vp9_decoder_remove(struct VP9Decoder *pbi);
 
 void vp9_dec_alloc_row_mt_mem(RowMTWorkerData *row_mt_worker_data,
-                              VP9_COMMON *cm, int num_sbs);
+                              VP9_COMMON *cm, int num_sbs, int max_threads);
 void vp9_dec_free_row_mt_mem(RowMTWorkerData *row_mt_worker_data);
 
 static INLINE void decrease_ref_count(int idx, RefCntBuffer *const frame_bufs,
diff --git a/vp9/decoder/vp9_job_queue.c b/vp9/decoder/vp9_job_queue.c
new file mode 100644
index 000000000..9a31f5a6d
--- /dev/null
+++ b/vp9/decoder/vp9_job_queue.c
@@ -0,0 +1,155 @@
+/*
+ *  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <string.h>
+
+#include "vpx/vpx_integer.h"
+
+#include "vp9/decoder/vp9_job_queue.h"
+
+// Sets up |jobq| as an empty queue that stores jobs in the caller-provided
+// |buf| of |buf_size| bytes. Nothing in this file frees |buf|; it must stay
+// valid for as long as the queue is used.
+void vp9_jobq_init(JobQueueRowMt *jobq, uint8_t *buf, size_t buf_size) {
+#if CONFIG_MULTITHREAD
+  pthread_mutex_init(&jobq->mutex, NULL);
+  pthread_cond_init(&jobq->cond, NULL);
+#endif
+  jobq->buf_base = buf;
+  jobq->buf_wr = buf;
+  jobq->buf_rd = buf;
+  jobq->buf_end = buf + buf_size;
+  jobq->terminate = 0;
+}
+
+// Empties the queue: moves the read and write positions back to the start of
+// the buffer and clears the terminate flag.
+void vp9_jobq_reset(JobQueueRowMt *jobq) {
+#if CONFIG_MULTITHREAD
+  pthread_mutex_lock(&jobq->mutex);
+#endif
+  jobq->buf_wr = jobq->buf_base;
+  jobq->buf_rd = jobq->buf_base;
+  jobq->terminate = 0;
+#if CONFIG_MULTITHREAD
+  pthread_mutex_unlock(&jobq->mutex);
+#endif
+}
+
+// Resets |jobq|, then destroys its mutex and condition variable. The job
+// buffer is left untouched.
+void vp9_jobq_deinit(JobQueueRowMt *jobq) {
+  vp9_jobq_reset(jobq);
+#if CONFIG_MULTITHREAD
+  pthread_mutex_destroy(&jobq->mutex);
+  pthread_cond_destroy(&jobq->cond);
+#endif
+}
+
+// Flags the queue as terminated and wakes all threads waiting in
+// vp9_jobq_dequeue(). Jobs already queued can still be dequeued; once the
+// queue is empty, vp9_jobq_dequeue() returns 1 instead of waiting.
+void vp9_jobq_terminate(JobQueueRowMt *jobq) {
+#if CONFIG_MULTITHREAD
+  pthread_mutex_lock(&jobq->mutex);
+#endif
+  jobq->terminate = 1;
+#if CONFIG_MULTITHREAD
+  pthread_cond_broadcast(&jobq->cond);
+  pthread_mutex_unlock(&jobq->mutex);
+#endif
+}
+
+// Appends a copy of the |job_size| bytes at |job| to the queue and signals one
+// waiting consumer. Returns 0 on success. The buffer is linear, not circular:
+// when fewer than |job_size| bytes remain before its end, the job is not
+// stored, assert(0) fires, and (if assertions are disabled) 1 is returned.
+int vp9_jobq_queue(JobQueueRowMt *jobq, void *job, size_t job_size) {
+  int ret = 0;
+#if CONFIG_MULTITHREAD
+  pthread_mutex_lock(&jobq->mutex);
+#endif
+  // Compare the free space as a byte count so that no pointer past buf_end
+  // is ever computed.
+  if ((size_t)(jobq->buf_end - jobq->buf_wr) >= job_size) {
+    memcpy(jobq->buf_wr, job, job_size);
+    jobq->buf_wr = jobq->buf_wr + job_size;
+#if CONFIG_MULTITHREAD
+    pthread_cond_signal(&jobq->cond);
+#endif
+    ret = 0;
+  } else {
+    /* Wrap around case is not supported */
+    assert(0);
+    ret = 1;
+  }
+#if CONFIG_MULTITHREAD
+  pthread_mutex_unlock(&jobq->mutex);
+#endif
+  return ret;
+}
+
+// Copies the oldest queued job (|job_size| bytes) into |job| and advances the
+// read position. Returns 0 on success and 1 when no job was returned, which
+// happens when:
+//  - fewer than |job_size| bytes lie between the read position and the end of
+//    the buffer (the buffer is linear, so it has been read to the end),
+//  - the queue is empty and vp9_jobq_terminate() has been called, or
+//  - the queue is empty and the call cannot wait: |blocking| is not 1, or the
+//    build has no CONFIG_MULTITHREAD.
+// With |blocking| == 1 in a CONFIG_MULTITHREAD build the call waits on the
+// queue's condition variable until a job arrives or the queue is terminated.
+int vp9_jobq_dequeue(JobQueueRowMt *jobq, void *job, size_t job_size,
+                     int blocking) {
+  int ret = 0;
+#if CONFIG_MULTITHREAD
+  pthread_mutex_lock(&jobq->mutex);
+#endif
+  if ((size_t)(jobq->buf_end - jobq->buf_rd) >= job_size) {
+    while (1) {
+      if (jobq->buf_wr >= jobq->buf_rd + job_size) {
+        memcpy(job, jobq->buf_rd, job_size);
+        jobq->buf_rd = jobq->buf_rd + job_size;
+        ret = 0;
+        break;
+      } else {
+        /* If all the entries have been dequeued, then break and return */
+        if (jobq->terminate == 1) {
+          ret = 1;
+          break;
+        }
+        if (blocking == 1) {
+#if CONFIG_MULTITHREAD
+          pthread_cond_wait(&jobq->cond, &jobq->mutex);
+#else
+          /* Without threads nothing can queue a job while we wait, so
+           * looping here would never end. Report that no job is ready. */
+          ret = 1;
+          break;
+#endif
+        } else {
+          /* If there is no job available,
+           * and this is non blocking call then return fail */
+          ret = 1;
+          break;
+        }
+      }
+    }
+  } else {
+    /* Wrap around case is not supported */
+    ret = 1;
+  }
+#if CONFIG_MULTITHREAD
+  pthread_mutex_unlock(&jobq->mutex);
+#endif
+
+  return ret;
+}
diff --git a/vp9/decoder/vp9_job_queue.h b/vp9/decoder/vp9_job_queue.h
new file mode 100644
index 000000000..bc23bf9c2
--- /dev/null
+++ b/vp9/decoder/vp9_job_queue.h
@@ -0,0 +1,74 @@
+/*
+ *  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP9_DECODER_VP9_JOB_QUEUE_H_
+#define VPX_VP9_DECODER_VP9_JOB_QUEUE_H_
+
+#include "vpx/vpx_integer.h"
+#include "vpx_util/vpx_thread.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Queue of job records stored back to back in a caller-provided byte buffer.
+// The buffer is used linearly (no wrap-around): it must be large enough for
+// every job queued between two calls to vp9_jobq_reset().
+typedef struct {
+  // Pointer to buffer base which contains the jobs
+  uint8_t *buf_base;
+
+  // Pointer to current address where new job can be added
+  uint8_t *volatile buf_wr;
+
+  // Pointer to current address from where next job can be obtained
+  uint8_t *volatile buf_rd;
+
+  // Pointer to end of job buffer
+  uint8_t *buf_end;
+
+  // Set to 1 by vp9_jobq_terminate(); makes vp9_jobq_dequeue() return instead
+  // of waiting once the queue is empty
+  int terminate;
+
+#if CONFIG_MULTITHREAD
+  // Guards the fields above; |cond| wakes threads waiting for a job
+  pthread_mutex_t mutex;
+  pthread_cond_t cond;
+#endif
+} JobQueueRowMt;
+
+// Initializes |jobq| to use |buf| (|buf_size| bytes) as job storage.
+void vp9_jobq_init(JobQueueRowMt *jobq, uint8_t *buf, size_t buf_size);
+
+// Discards all queued jobs and clears the terminate flag.
+void vp9_jobq_reset(JobQueueRowMt *jobq);
+
+// Resets |jobq| and destroys its mutex and condition variable. Does not free
+// the job buffer.
+void vp9_jobq_deinit(JobQueueRowMt *jobq);
+
+// Marks the queue as terminated and wakes all threads waiting for a job.
+void vp9_jobq_terminate(JobQueueRowMt *jobq);
+
+// Appends a copy of |job| (|job_size| bytes). Returns 0 on success; if the
+// buffer has no room left, asserts and returns 1.
+int vp9_jobq_queue(JobQueueRowMt *jobq, void *job, size_t job_size);
+
+// Removes the oldest job into |job| (|job_size| bytes). Returns 0 on success,
+// 1 if no job was returned. |blocking| == 1 requests waiting for a job.
+int vp9_jobq_dequeue(JobQueueRowMt *jobq, void *job, size_t job_size,
+                     int blocking);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_VP9_DECODER_VP9_JOB_QUEUE_H_
diff --git a/vp9/vp9dx.mk b/vp9/vp9dx.mk
index 59f612b94..93a5f368b 100644
--- a/vp9/vp9dx.mk
+++ b/vp9/vp9dx.mk
@@ -28,5 +28,7 @@ VP9_DX_SRCS-yes += decoder/vp9_decoder.c
 VP9_DX_SRCS-yes += decoder/vp9_decoder.h
 VP9_DX_SRCS-yes += decoder/vp9_dsubexp.c
 VP9_DX_SRCS-yes += decoder/vp9_dsubexp.h
+VP9_DX_SRCS-yes += decoder/vp9_job_queue.c
+VP9_DX_SRCS-yes += decoder/vp9_job_queue.h
 
 VP9_DX_SRCS-yes := $(filter-out $(VP9_DX_SRCS_REMOVE-yes),$(VP9_DX_SRCS-yes))
-- 
2.49.0