From 15149484ec75e77a6fa4b0ce2e4ede5cb6a63c39 Mon Sep 17 00:00:00 2001
From: Ryan Lei <ryan.z.lei@intel.com>
Date: Tue, 25 Oct 2016 18:48:43 -0700
Subject: [PATCH] Add parallel-deblocking experiment

This commit is a manual cherry-pick from aom/master:
42ff3881ace1564aac9debae86ef37a8deb8d381

Change-Id: I4a3cdb939b7b96a3aa27f6a00da7a0e73222f3f3
---
 av1/common/loopfilter.c    | 156 +++++++++++++++----
 av1/common/loopfilter.h    |  31 ++--
 av1/common/thread_common.c | 299 ++++++++++++++++++++++++++++++-------
 configure                  |   1 +
 4 files changed, 392 insertions(+), 95 deletions(-)

diff --git a/av1/common/loopfilter.c b/av1/common/loopfilter.c
index d0b897c74..dc7ee188d 100644
--- a/av1/common/loopfilter.c
+++ b/av1/common/loopfilter.c
@@ -1183,9 +1183,10 @@ static void highbd_filter_selectively_vert(
 }
 #endif  // CONFIG_AOM_HIGHBITDEPTH
 
-void av1_filter_block_plane_non420(AV1_COMMON *cm,
-                                   struct macroblockd_plane *plane,
-                                   MODE_INFO **mib, int mi_row, int mi_col) {
+void av1_filter_block_plane_non420_ver(AV1_COMMON *cm,
+                                       struct macroblockd_plane *plane,
+                                       MODE_INFO **mib, int mi_row,
+                                       int mi_col) {
   const int ss_x = plane->subsampling_x;
   const int ss_y = plane->subsampling_y;
   const int row_step = 1 << ss_y;
@@ -1369,6 +1370,22 @@ void av1_filter_block_plane_non420(AV1_COMMON *cm,
 
   // Now do horizontal pass
   dst->buf = dst0;
+}
+
+void av1_filter_block_plane_non420_hor(AV1_COMMON *cm,
+                                       struct macroblockd_plane *plane,
+                                       int mi_row) {
+  const int ss_y = plane->subsampling_y;
+  const int row_step = 1 << ss_y;
+  struct buf_2d *const dst = &plane->dst;
+  uint8_t *const dst0 = dst->buf;
+  unsigned int mask_16x16[MAX_MIB_SIZE] = { 0 };
+  unsigned int mask_8x8[MAX_MIB_SIZE] = { 0 };
+  unsigned int mask_4x4[MAX_MIB_SIZE] = { 0 };
+  unsigned int mask_4x4_int[MAX_MIB_SIZE] = { 0 };
+  uint8_t lfl[MAX_MIB_SIZE][MAX_MIB_SIZE];
+  int r;
+
   for (r = 0; r < cm->mib_size && mi_row + r < cm->mi_rows; r += row_step) {
     const int skip_border_4x4_r = ss_y && mi_row + r == cm->mi_rows - 1;
     const unsigned int mask_4x4_int_r = skip_border_4x4_r ? 0 : mask_4x4_int[r];
@@ -1404,11 +1421,12 @@ void av1_filter_block_plane_non420(AV1_COMMON *cm,
 #endif  // CONFIG_AOM_HIGHBITDEPTH
     dst->buf += MI_SIZE * dst->stride;
   }
+  dst->buf = dst0;
 }
 
-void av1_filter_block_plane_ss00(AV1_COMMON *const cm,
-                                 struct macroblockd_plane *const plane,
-                                 int mi_row, LOOP_FILTER_MASK *lfm) {
+void av1_filter_block_plane_ss00_ver(AV1_COMMON *const cm,
+                                     struct macroblockd_plane *const plane,
+                                     int mi_row, LOOP_FILTER_MASK *lfm) {
   struct buf_2d *const dst = &plane->dst;
   uint8_t *const dst0 = dst->buf;
   int r;
@@ -1452,10 +1470,20 @@ void av1_filter_block_plane_ss00(AV1_COMMON *const cm,
 
   // Horizontal pass
   dst->buf = dst0;
-  mask_16x16 = lfm->above_y[TX_16X16];
-  mask_8x8 = lfm->above_y[TX_8X8];
-  mask_4x4 = lfm->above_y[TX_4X4];
-  mask_4x4_int = lfm->int_4x4_y;
+}
+
+void av1_filter_block_plane_ss00_hor(AV1_COMMON *const cm,
+                                     struct macroblockd_plane *const plane,
+                                     int mi_row, LOOP_FILTER_MASK *lfm) {
+  struct buf_2d *const dst = &plane->dst;
+  uint8_t *const dst0 = dst->buf;
+  int r;
+  uint64_t mask_16x16 = lfm->above_y[TX_16X16];
+  uint64_t mask_8x8 = lfm->above_y[TX_8X8];
+  uint64_t mask_4x4 = lfm->above_y[TX_4X4];
+  uint64_t mask_4x4_int = lfm->int_4x4_y;
+
+  assert(plane->subsampling_x == 0 && plane->subsampling_y == 0);
 
   for (r = 0; r < cm->mib_size && mi_row + r < cm->mi_rows; r++) {
     unsigned int mask_16x16_r;
@@ -1495,11 +1523,13 @@ void av1_filter_block_plane_ss00(AV1_COMMON *const cm,
     mask_4x4 >>= MI_SIZE;
     mask_4x4_int >>= MI_SIZE;
   }
+  // Restore the buf pointer in case there is an additional filter pass.
+  dst->buf = dst0;
 }
 
-void av1_filter_block_plane_ss11(AV1_COMMON *const cm,
-                                 struct macroblockd_plane *const plane,
-                                 int mi_row, LOOP_FILTER_MASK *lfm) {
+void av1_filter_block_plane_ss11_ver(AV1_COMMON *const cm,
+                                     struct macroblockd_plane *const plane,
+                                     int mi_row, LOOP_FILTER_MASK *lfm) {
   struct buf_2d *const dst = &plane->dst;
   uint8_t *const dst0 = dst->buf;
   int r, c;
@@ -1554,10 +1584,20 @@ void av1_filter_block_plane_ss11(AV1_COMMON *const cm,
 
   // Horizontal pass
   dst->buf = dst0;
-  mask_16x16 = lfm->above_uv[TX_16X16];
-  mask_8x8 = lfm->above_uv[TX_8X8];
-  mask_4x4 = lfm->above_uv[TX_4X4];
-  mask_4x4_int = lfm->above_int_4x4_uv;
+}
+
+void av1_filter_block_plane_ss11_hor(AV1_COMMON *const cm,
+                                     struct macroblockd_plane *const plane,
+                                     int mi_row, LOOP_FILTER_MASK *lfm) {
+  struct buf_2d *const dst = &plane->dst;
+  uint8_t *const dst0 = dst->buf;
+  int r;
+  uint64_t mask_16x16 = lfm->above_uv[TX_16X16];
+  uint64_t mask_8x8 = lfm->above_uv[TX_8X8];
+  uint64_t mask_4x4 = lfm->above_uv[TX_4X4];
+  uint64_t mask_4x4_int = lfm->above_int_4x4_uv;
+
+  assert(plane->subsampling_x == 1 && plane->subsampling_y == 1);
 
   for (r = 0; r < cm->mib_size && mi_row + r < cm->mi_rows; r += 2) {
     const int skip_border_4x4_r = mi_row + r == cm->mi_rows - 1;
@@ -1600,6 +1640,8 @@ void av1_filter_block_plane_ss11(AV1_COMMON *const cm,
     mask_4x4 >>= MI_SIZE / 2;
     mask_4x4_int >>= MI_SIZE / 2;
   }
+  // Restore the buf pointer in case there is an additional filter pass.
+  dst->buf = dst0;
 }
 
 void av1_loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, AV1_COMMON *cm,
@@ -1622,12 +1664,14 @@ void av1_loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, AV1_COMMON *cm,
 
       av1_setup_dst_planes(planes, frame_buffer, mi_row, mi_col);
 
-      for (plane = 0; plane < num_planes; ++plane)
-        av1_filter_block_plane_non420(cm, &planes[plane], mi + mi_col, mi_row,
-                                      mi_col);
+      for (plane = 0; plane < num_planes; ++plane) {
+        av1_filter_block_plane_non420_ver(cm, &planes[plane], mi + mi_col,
+                                          mi_row, mi_col);
+        av1_filter_block_plane_non420_hor(cm, &planes[plane], mi_row);
+      }
     }
   }
-#else
+#else  // CONFIG_VAR_TX || CONFIG_EXT_PARTITION || CONFIG_EXT_PARTITION_TYPES
   const int num_planes = y_only ? 1 : MAX_MB_PLANE;
   int mi_row, mi_col;
   enum lf_path path;
@@ -1641,7 +1685,34 @@ void av1_loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, AV1_COMMON *cm,
     path = LF_PATH_444;
   else
     path = LF_PATH_SLOW;
+#if CONFIG_PARALLEL_DEBLOCKING
+  for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) {
+    MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride;
+    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MAX_MIB_SIZE) {
+      int plane;
+
+      av1_setup_dst_planes(planes, frame_buffer, mi_row, mi_col);
 
+      // TODO(JBB): Make setup_mask work for non 420.
+      av1_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride, &lfm);
+
+      av1_filter_block_plane_ss00_ver(cm, &planes[0], mi_row, &lfm);
+      for (plane = 1; plane < num_planes; ++plane) {
+        switch (path) {
+          case LF_PATH_420:
+            av1_filter_block_plane_ss11_ver(cm, &planes[plane], mi_row, &lfm);
+            break;
+          case LF_PATH_444:
+            av1_filter_block_plane_ss00_ver(cm, &planes[plane], mi_row, &lfm);
+            break;
+          case LF_PATH_SLOW:
+            av1_filter_block_plane_non420_ver(cm, &planes[plane], mi + mi_col,
+                                              mi_row, mi_col);
+            break;
+        }
+      }
+    }
+  }
   for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) {
     MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride;
     for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MAX_MIB_SIZE) {
@@ -1652,23 +1723,56 @@ void av1_loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, AV1_COMMON *cm,
       // TODO(JBB): Make setup_mask work for non 420.
       av1_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride, &lfm);
 
-      av1_filter_block_plane_ss00(cm, &planes[0], mi_row, &lfm);
+      av1_filter_block_plane_ss00_hor(cm, &planes[0], mi_row, &lfm);
       for (plane = 1; plane < num_planes; ++plane) {
         switch (path) {
           case LF_PATH_420:
-            av1_filter_block_plane_ss11(cm, &planes[plane], mi_row, &lfm);
+            av1_filter_block_plane_ss11_hor(cm, &planes[plane], mi_row, &lfm);
             break;
           case LF_PATH_444:
-            av1_filter_block_plane_ss00(cm, &planes[plane], mi_row, &lfm);
+            av1_filter_block_plane_ss00_hor(cm, &planes[plane], mi_row, &lfm);
             break;
           case LF_PATH_SLOW:
-            av1_filter_block_plane_non420(cm, &planes[plane], mi + mi_col,
-                                          mi_row, mi_col);
+            av1_filter_block_plane_non420_hor(cm, &planes[plane], mi_row);
+            break;
+        }
+      }
+    }
+  }
+#else   // CONFIG_PARALLEL_DEBLOCKING
+  for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) {
+    MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride;
+    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MAX_MIB_SIZE) {
+      int plane;
+
+      av1_setup_dst_planes(planes, frame_buffer, mi_row, mi_col);
+
+      // TODO(JBB): Make setup_mask work for non 420.
+      av1_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride, &lfm);
+
+      av1_filter_block_plane_ss00_ver(cm, &planes[0], mi_row, &lfm);
+      av1_filter_block_plane_ss00_hor(cm, &planes[0], mi_row, &lfm);
+      for (plane = 1; plane < num_planes; ++plane) {
+        switch (path) {
+          case LF_PATH_420:
+            av1_filter_block_plane_ss11_ver(cm, &planes[plane], mi_row, &lfm);
+            av1_filter_block_plane_ss11_hor(cm, &planes[plane], mi_row, &lfm);
+            break;
+          case LF_PATH_444:
+            av1_filter_block_plane_ss00_ver(cm, &planes[plane], mi_row, &lfm);
+            av1_filter_block_plane_ss00_hor(cm, &planes[plane], mi_row, &lfm);
+            break;
+          case LF_PATH_SLOW:
+            av1_filter_block_plane_non420_ver(cm, &planes[plane], mi + mi_col,
+                                              mi_row, mi_col);
+            av1_filter_block_plane_non420_hor(cm, &planes[plane], mi_row);
+
             break;
         }
       }
     }
   }
+#endif  // CONFIG_PARALLEL_DEBLOCKING
 #endif  // CONFIG_VAR_TX || CONFIG_EXT_PARTITION || CONFIG_EXT_PARTITION_TYPES
 }
 
diff --git a/av1/common/loopfilter.h b/av1/common/loopfilter.h
index 975cbdf19..cdc251208 100644
--- a/av1/common/loopfilter.h
+++ b/av1/common/loopfilter.h
@@ -99,17 +99,26 @@ void av1_setup_mask(struct AV1Common *const cm, const int mi_row,
                     const int mi_col, MODE_INFO **mi_8x8,
                     const int mode_info_stride, LOOP_FILTER_MASK *lfm);
 
-void av1_filter_block_plane_ss00(struct AV1Common *const cm,
-                                 struct macroblockd_plane *const plane,
-                                 int mi_row, LOOP_FILTER_MASK *lfm);
-
-void av1_filter_block_plane_ss11(struct AV1Common *const cm,
-                                 struct macroblockd_plane *const plane,
-                                 int mi_row, LOOP_FILTER_MASK *lfm);
-
-void av1_filter_block_plane_non420(struct AV1Common *cm,
-                                   struct macroblockd_plane *plane,
-                                   MODE_INFO **mi_8x8, int mi_row, int mi_col);
+void av1_filter_block_plane_ss00_ver(struct AV1Common *const cm,
+                                     struct macroblockd_plane *const plane,
+                                     int mi_row, LOOP_FILTER_MASK *lfm);
+void av1_filter_block_plane_ss00_hor(struct AV1Common *const cm,
+                                     struct macroblockd_plane *const plane,
+                                     int mi_row, LOOP_FILTER_MASK *lfm);
+void av1_filter_block_plane_ss11_ver(struct AV1Common *const cm,
+                                     struct macroblockd_plane *const plane,
+                                     int mi_row, LOOP_FILTER_MASK *lfm);
+void av1_filter_block_plane_ss11_hor(struct AV1Common *const cm,
+                                     struct macroblockd_plane *const plane,
+                                     int mi_row, LOOP_FILTER_MASK *lfm);
+
+void av1_filter_block_plane_non420_ver(struct AV1Common *cm,
+                                       struct macroblockd_plane *plane,
+                                       MODE_INFO **mi_8x8, int mi_row,
+                                       int mi_col);
+void av1_filter_block_plane_non420_hor(struct AV1Common *cm,
+                                       struct macroblockd_plane *plane,
+                                       int mi_row);
 
 void av1_loop_filter_init(struct AV1Common *cm);
 
diff --git a/av1/common/thread_common.c b/av1/common/thread_common.c
index eeaeb21fe..11006715b 100644
--- a/av1/common/thread_common.c
+++ b/av1/common/thread_common.c
@@ -85,25 +85,153 @@ static INLINE void sync_write(AV1LfSync *const lf_sync, int r, int c,
 #endif  // CONFIG_MULTITHREAD
 }
 
-// Implement row loopfiltering for each thread.
-static INLINE void thread_loop_filter_rows(
-    const YV12_BUFFER_CONFIG *const frame_buffer, AV1_COMMON *const cm,
-    struct macroblockd_plane planes[MAX_MB_PLANE], int start, int stop,
-    int y_only, AV1LfSync *const lf_sync) {
-  const int num_planes = y_only ? 1 : MAX_MB_PLANE;
-  const int sb_cols = mi_cols_aligned_to_sb(cm) >> cm->mib_size_log2;
-  int mi_row, mi_col;
 #if !CONFIG_EXT_PARTITION_TYPES
-  enum lf_path path;
-  LOOP_FILTER_MASK lfm;
+static INLINE enum lf_path get_loop_filter_path(
+    int y_only, struct macroblockd_plane planes[MAX_MB_PLANE]) {
   if (y_only)
-    path = LF_PATH_444;
+    return LF_PATH_444;
   else if (planes[1].subsampling_y == 1 && planes[1].subsampling_x == 1)
-    path = LF_PATH_420;
+    return LF_PATH_420;
   else if (planes[1].subsampling_y == 0 && planes[1].subsampling_x == 0)
-    path = LF_PATH_444;
+    return LF_PATH_444;
   else
-    path = LF_PATH_SLOW;
+    return LF_PATH_SLOW;
+}
+
+static INLINE void loop_filter_block_plane_ver(
+    AV1_COMMON *cm, struct macroblockd_plane planes[MAX_MB_PLANE], int plane,
+    MODE_INFO **mi, int mi_row, int mi_col, enum lf_path path,
+    LOOP_FILTER_MASK *lfm) {
+  if (plane == 0) {
+    av1_filter_block_plane_ss00_ver(cm, &planes[0], mi_row, lfm);
+  } else {
+    switch (path) {
+      case LF_PATH_420:
+        av1_filter_block_plane_ss11_ver(cm, &planes[plane], mi_row, lfm);
+        break;
+      case LF_PATH_444:
+        av1_filter_block_plane_ss00_ver(cm, &planes[plane], mi_row, lfm);
+        break;
+      case LF_PATH_SLOW:
+        av1_filter_block_plane_non420_ver(cm, &planes[plane], mi + mi_col,
+                                          mi_row, mi_col);
+        break;
+    }
+  }
+}
+
+static INLINE void loop_filter_block_plane_hor(
+    AV1_COMMON *cm, struct macroblockd_plane planes[MAX_MB_PLANE], int plane,
+    int mi_row, enum lf_path path, LOOP_FILTER_MASK *lfm) {
+  if (plane == 0) {
+    av1_filter_block_plane_ss00_hor(cm, &planes[0], mi_row, lfm);
+  } else {
+    switch (path) {
+      case LF_PATH_420:
+        av1_filter_block_plane_ss11_hor(cm, &planes[plane], mi_row, lfm);
+        break;
+      case LF_PATH_444:
+        av1_filter_block_plane_ss00_hor(cm, &planes[plane], mi_row, lfm);
+        break;
+      case LF_PATH_SLOW:
+        av1_filter_block_plane_non420_hor(cm, &planes[plane], mi_row);
+        break;
+    }
+  }
+}
+#endif
+// Row-based multi-threaded loopfilter hooks.
+#if CONFIG_PARALLEL_DEBLOCKING
+static int loop_filter_ver_row_worker(AV1LfSync *const lf_sync,
+                                      LFWorkerData *const lf_data) {
+  const int num_planes = lf_data->y_only ? 1 : MAX_MB_PLANE;
+  int mi_row, mi_col;
+#if !CONFIG_EXT_PARTITION_TYPES
+  enum lf_path path = get_loop_filter_path(lf_data->y_only, lf_data->planes);
+#endif
+  for (mi_row = lf_data->start; mi_row < lf_data->stop;
+       mi_row += lf_sync->num_workers * lf_data->cm->mib_size) {
+    MODE_INFO **const mi =
+        lf_data->cm->mi_grid_visible + mi_row * lf_data->cm->mi_stride;
+
+    for (mi_col = 0; mi_col < lf_data->cm->mi_cols;
+         mi_col += lf_data->cm->mib_size) {
+      LOOP_FILTER_MASK lfm;
+      int plane;
+
+      av1_setup_dst_planes(lf_data->planes, lf_data->frame_buffer, mi_row,
+                           mi_col);
+      av1_setup_mask(lf_data->cm, mi_row, mi_col, mi + mi_col,
+                     lf_data->cm->mi_stride, &lfm);
+
+#if CONFIG_EXT_PARTITION_TYPES
+      for (plane = 0; plane < num_planes; ++plane)
+        av1_filter_block_plane_non420_ver(lf_data->cm, &lf_data->planes[plane],
+                                          mi + mi_col, mi_row, mi_col);
+#else
+
+      for (plane = 0; plane < num_planes; ++plane)
+        loop_filter_block_plane_ver(lf_data->cm, lf_data->planes, plane, mi,
+                                    mi_row, mi_col, path, &lfm);
+#endif
+    }
+  }
+  return 1;
+}
+
+static int loop_filter_hor_row_worker(AV1LfSync *const lf_sync,
+                                      LFWorkerData *const lf_data) {
+  const int num_planes = lf_data->y_only ? 1 : MAX_MB_PLANE;
+  const int sb_cols =
+      mi_cols_aligned_to_sb(lf_data->cm) >> lf_data->cm->mib_size_log2;
+  int mi_row, mi_col;
+#if !CONFIG_EXT_PARTITION_TYPES
+  enum lf_path path = get_loop_filter_path(lf_data->y_only, lf_data->planes);
+#endif
+
+  for (mi_row = lf_data->start; mi_row < lf_data->stop;
+       mi_row += lf_sync->num_workers * lf_data->cm->mib_size) {
+    MODE_INFO **const mi =
+        lf_data->cm->mi_grid_visible + mi_row * lf_data->cm->mi_stride;
+
+    for (mi_col = 0; mi_col < lf_data->cm->mi_cols;
+         mi_col += lf_data->cm->mib_size) {
+      const int r = mi_row >> lf_data->cm->mib_size_log2;
+      const int c = mi_col >> lf_data->cm->mib_size_log2;
+      LOOP_FILTER_MASK lfm;
+      int plane;
+
+      // TODO(wenhao.zhang@intel.com): For better parallelization, reorder
+      // the outer loop to column-based and remove the synchronizations here.
+      sync_read(lf_sync, r, c);
+
+      av1_setup_dst_planes(lf_data->planes, lf_data->frame_buffer, mi_row,
+                           mi_col);
+      av1_setup_mask(lf_data->cm, mi_row, mi_col, mi + mi_col,
+                     lf_data->cm->mi_stride, &lfm);
+#if CONFIG_EXT_PARTITION_TYPES
+      for (plane = 0; plane < num_planes; ++plane)
+        av1_filter_block_plane_non420_hor(lf_data->cm, &lf_data->planes[plane],
+                                          mi_row);
+#else
+      for (plane = 0; plane < num_planes; ++plane)
+        loop_filter_block_plane_hor(lf_data->cm, lf_data->planes, plane, mi_row,
+                                    path, &lfm);
+#endif
+      sync_write(lf_sync, r, c, sb_cols);
+    }
+  }
+  return 1;
+}
+#else  //  CONFIG_PARALLEL_DEBLOCKING
+static int loop_filter_row_worker(AV1LfSync *const lf_sync,
+                                  LFWorkerData *const lf_data) {
+  const int num_planes = lf_data->y_only ? 1 : MAX_MB_PLANE;
+  const int sb_cols =
+      mi_cols_aligned_to_sb(lf_data->cm) >> lf_data->cm->mib_size_log2;
+  int mi_row, mi_col;
+#if !CONFIG_EXT_PARTITION_TYPES
+  enum lf_path path = get_loop_filter_path(lf_data->y_only, lf_data->planes);
 #endif  // !CONFIG_EXT_PARTITION_TYPES
 
 #if CONFIG_EXT_PARTITION
@@ -113,56 +241,48 @@ static INLINE void thread_loop_filter_rows(
   exit(EXIT_FAILURE);
 #endif  // CONFIG_EXT_PARTITION
 
-  for (mi_row = start; mi_row < stop;
-       mi_row += lf_sync->num_workers * cm->mib_size) {
-    MODE_INFO **const mi = cm->mi_grid_visible + mi_row * cm->mi_stride;
+  for (mi_row = lf_data->start; mi_row < lf_data->stop;
+       mi_row += lf_sync->num_workers * lf_data->cm->mib_size) {
+    MODE_INFO **const mi =
+        lf_data->cm->mi_grid_visible + mi_row * lf_data->cm->mi_stride;
 
-    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += cm->mib_size) {
-      const int r = mi_row >> cm->mib_size_log2;
-      const int c = mi_col >> cm->mib_size_log2;
+    for (mi_col = 0; mi_col < lf_data->cm->mi_cols;
+         mi_col += lf_data->cm->mib_size) {
+      const int r = mi_row >> lf_data->cm->mib_size_log2;
+      const int c = mi_col >> lf_data->cm->mib_size_log2;
+#if !CONFIG_EXT_PARTITION_TYPES
+      LOOP_FILTER_MASK lfm;
+#endif
       int plane;
 
       sync_read(lf_sync, r, c);
 
-      av1_setup_dst_planes(planes, frame_buffer, mi_row, mi_col);
-
+      av1_setup_dst_planes(lf_data->planes, lf_data->frame_buffer, mi_row,
+                           mi_col);
 #if CONFIG_EXT_PARTITION_TYPES
-      for (plane = 0; plane < num_planes; ++plane)
-        av1_filter_block_plane_non420(cm, &planes[plane], mi + mi_col, mi_row,
-                                      mi_col);
+      for (plane = 0; plane < num_planes; ++plane) {
+        av1_filter_block_plane_non420_ver(lf_data->cm, &lf_data->planes[plane],
+                                          mi + mi_col, mi_row, mi_col);
+        av1_filter_block_plane_non420_hor(lf_data->cm, &lf_data->planes[plane],
+                                          mi_row);
+      }
 #else
-      // TODO(JBB): Make setup_mask work for non 420.
-      av1_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride, &lfm);
-
-      av1_filter_block_plane_ss00(cm, &planes[0], mi_row, &lfm);
-      for (plane = 1; plane < num_planes; ++plane) {
-        switch (path) {
-          case LF_PATH_420:
-            av1_filter_block_plane_ss11(cm, &planes[plane], mi_row, &lfm);
-            break;
-          case LF_PATH_444:
-            av1_filter_block_plane_ss00(cm, &planes[plane], mi_row, &lfm);
-            break;
-          case LF_PATH_SLOW:
-            av1_filter_block_plane_non420(cm, &planes[plane], mi + mi_col,
-                                          mi_row, mi_col);
-            break;
-        }
+      av1_setup_mask(lf_data->cm, mi_row, mi_col, mi + mi_col,
+                     lf_data->cm->mi_stride, &lfm);
+
+      for (plane = 0; plane < num_planes; ++plane) {
+        loop_filter_block_plane_ver(lf_data->cm, lf_data->planes, plane, mi,
+                                    mi_row, mi_col, path, &lfm);
+        loop_filter_block_plane_hor(lf_data->cm, lf_data->planes, plane, mi_row,
+                                    path, &lfm);
       }
 #endif  // CONFIG_EXT_PARTITION_TYPES
       sync_write(lf_sync, r, c, sb_cols);
     }
   }
-}
-
-// Row-based multi-threaded loopfilter hook
-static int loop_filter_row_worker(AV1LfSync *const lf_sync,
-                                  LFWorkerData *const lf_data) {
-  thread_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes,
-                          lf_data->start, lf_data->stop, lf_data->y_only,
-                          lf_sync);
   return 1;
 }
+#endif  //  CONFIG_PARALLEL_DEBLOCKING
 
 static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
                                 struct macroblockd_plane planes[MAX_MB_PLANE],
@@ -191,17 +311,79 @@ static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
     av1_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width, num_workers);
   }
 
+// Set up loopfilter thread data.
+// The decoder is capping num_workers because it has been observed that using
+// more threads on the loopfilter than there are cores will hurt performance
+// on Android. This is because the system will only schedule the tile decode
+// workers on cores equal to the number of tile columns. Then if the decoder
+// tries to use more threads for the loopfilter, it will hurt performance
+// because of contention. If the multithreading code changes in the future
+// then the number of workers used by the loopfilter should be revisited.
+
+#if CONFIG_PARALLEL_DEBLOCKING
+  // Initialize cur_sb_col to -1 for all SB rows.
+  memset(lf_sync->cur_sb_col, -1, sizeof(*lf_sync->cur_sb_col) * sb_rows);
+
+  // Filter all the vertical edges in the whole frame
+  for (i = 0; i < num_workers; ++i) {
+    AVxWorker *const worker = &workers[i];
+    LFWorkerData *const lf_data = &lf_sync->lfdata[i];
+
+    worker->hook = (AVxWorkerHook)loop_filter_ver_row_worker;
+    worker->data1 = lf_sync;
+    worker->data2 = lf_data;
+
+    // Loopfilter data
+    av1_loop_filter_data_reset(lf_data, frame, cm, planes);
+    lf_data->start = start + i * cm->mib_size;
+    lf_data->stop = stop;
+    lf_data->y_only = y_only;
+
+    // Start loopfiltering
+    if (i == num_workers - 1) {
+      winterface->execute(worker);
+    } else {
+      winterface->launch(worker);
+    }
+  }
+
+  // Wait till all rows are finished
+  for (i = 0; i < num_workers; ++i) {
+    winterface->sync(&workers[i]);
+  }
+
+  memset(lf_sync->cur_sb_col, -1, sizeof(*lf_sync->cur_sb_col) * sb_rows);
+  // Filter all the horizontal edges in the whole frame
+  for (i = 0; i < num_workers; ++i) {
+    AVxWorker *const worker = &workers[i];
+    LFWorkerData *const lf_data = &lf_sync->lfdata[i];
+
+    worker->hook = (AVxWorkerHook)loop_filter_hor_row_worker;
+    worker->data1 = lf_sync;
+    worker->data2 = lf_data;
+
+    // Loopfilter data
+    av1_loop_filter_data_reset(lf_data, frame, cm, planes);
+    lf_data->start = start + i * cm->mib_size;
+    lf_data->stop = stop;
+    lf_data->y_only = y_only;
+
+    // Start loopfiltering
+    if (i == num_workers - 1) {
+      winterface->execute(worker);
+    } else {
+      winterface->launch(worker);
+    }
+  }
+
+  // Wait till all rows are finished
+  for (i = 0; i < num_workers; ++i) {
+    winterface->sync(&workers[i]);
+  }
+#else   // CONFIG_PARALLEL_DEBLOCKING
   // Initialize cur_sb_col to -1 for all SB rows.
   memset(lf_sync->cur_sb_col, -1, sizeof(*lf_sync->cur_sb_col) * sb_rows);
 
-  // Set up loopfilter thread data.
-  // The decoder is capping num_workers because it has been observed that using
-  // more threads on the loopfilter than there are cores will hurt performance
-  // on Android. This is because the system will only schedule the tile decode
-  // workers on cores equal to the number of tile columns. Then if the decoder
-  // tries to use more threads for the loopfilter, it will hurt performance
-  // because of contention. If the multithreading code changes in the future
-  // then the number of workers used by the loopfilter should be revisited.
   for (i = 0; i < num_workers; ++i) {
     AVxWorker *const worker = &workers[i];
     LFWorkerData *const lf_data = &lf_sync->lfdata[i];
@@ -228,6 +410,7 @@ static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
   for (i = 0; i < num_workers; ++i) {
     winterface->sync(&workers[i]);
   }
+#endif  // CONFIG_PARALLEL_DEBLOCKING
 }
 
 void av1_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
diff --git a/configure b/configure
index 5bfce0416..fa458f7ad 100755
--- a/configure
+++ b/configure
@@ -289,6 +289,7 @@ EXPERIMENT_LIST="
     delta_q
     adapt_scan
     filter_7bit
+    parallel_deblocking
 "
 CONFIG_LIST="
     dependency_tracking
-- 
2.49.0