granicus.if.org Git - libvpx/commitdiff
Speed up of supertx
author: Shunyao Li <shunyaoli@google.com>
Fri, 17 Jul 2015 17:44:06 +0000 (10:44 -0700)
committer: Shunyao Li <shunyaoli@google.com>
Fri, 24 Jul 2015 18:19:19 +0000 (11:19 -0700)
Limited the prediction extension to 8 pixels at each edge
Fixed a bug in the combination of wedge prediction and supertx

~10% speed up in decoder
derflr:     -0.004
derflr+hbd: +0.002
hevcmr:     +0.015

Change-Id: I777518896894a612c9704d3de0e7902bf498b0ea

vp9/common/vp9_reconinter.c
vp9/common/vp9_reconinter.h
vp9/decoder/vp9_decodeframe.c
vp9/encoder/vp9_encodeframe.c

index cf3d7c2bc5aac0b1f39531ec343cc7baa19b839a..a6e5baf0c1f427a0e20b5022e94c1d198e6fd41b 100644 (file)
@@ -933,6 +933,7 @@ void vp9_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col,
 }
 
 #if CONFIG_SUPERTX
+
 static const uint8_t mask_8[8] = {
   64, 64, 62, 52, 12,  2,  0,  0
 };
@@ -955,20 +956,42 @@ static const uint8_t mask_64[64] = {
 };
 #endif
 
-static void generate_1dmask(int length, uint8_t *mask) {
+static const uint8_t mask_8_uv[8] = {
+  64, 64, 62, 52,  12,  2,  0,  0
+};
+
+static const uint8_t mask_16_uv[16] = {
+  64, 64, 64, 64, 61, 53, 45, 36, 28, 19, 11, 3, 0,  0,  0,  0
+};
+
+static const uint8_t mask_32_uv[32] = {
+  64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 60, 54, 46, 36,
+  28, 18, 10,  4,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
+};
+
+#if CONFIG_TX64X64
+static const uint8_t mask_64_uv[64] = {
+  64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+  64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 60, 54, 46, 36,
+  28, 18, 10,  4,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+  0,   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
+};
+#endif
+
+static void generate_1dmask(int length, uint8_t *mask, int plane) {
   switch (length) {
     case 8:
-      vpx_memcpy(mask, mask_8, length);
+      vpx_memcpy(mask, plane ? mask_8_uv : mask_8, length);
       break;
     case 16:
-      vpx_memcpy(mask, mask_16, length);
+      vpx_memcpy(mask, plane ? mask_16_uv : mask_16, length);
       break;
     case 32:
-      vpx_memcpy(mask, mask_32, length);
+      vpx_memcpy(mask, plane ? mask_32_uv : mask_32, length);
       break;
 #if CONFIG_TX64X64
     case 64:
-      vpx_memcpy(mask, mask_64, length);
+      vpx_memcpy(mask, plane ? mask_64_uv : mask_64, length);
       break;
 #endif
     default:
@@ -976,12 +999,13 @@ static void generate_1dmask(int length, uint8_t *mask) {
   }
 }
 
+
 void vp9_build_masked_inter_predictor_complex(
     MACROBLOCKD *xd,
     uint8_t *dst, int dst_stride, uint8_t *dst2, int dst2_stride,
     const struct macroblockd_plane *pd, int mi_row, int mi_col,
     int mi_row_ori, int mi_col_ori, BLOCK_SIZE bsize, BLOCK_SIZE top_bsize,
-    PARTITION_TYPE partition) {
+    PARTITION_TYPE partition, int plane) {
   int i, j;
   uint8_t mask[MAXTXLEN];
   int top_w = 4 << b_width_log2_lookup[top_bsize],
@@ -990,6 +1014,12 @@ void vp9_build_masked_inter_predictor_complex(
   int w_offset = (mi_col - mi_col_ori) << 3,
       h_offset = (mi_row - mi_row_ori) << 3;
 
+#if CONFIG_VP9_HIGHBITDEPTH
+  uint16_t *dst16= CONVERT_TO_SHORTPTR(dst);
+  uint16_t *dst216 = CONVERT_TO_SHORTPTR(dst2);
+  int b_hdb = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
   top_w >>= pd->subsampling_x;
   top_h >>= pd->subsampling_y;
   w >>= pd->subsampling_x;
@@ -999,52 +1029,169 @@ void vp9_build_masked_inter_predictor_complex(
 
   switch (partition) {
     case PARTITION_HORZ:
-      generate_1dmask(h, mask + h_offset);
-      vpx_memset(mask, 64, h_offset);
-      vpx_memset(mask + h_offset + h, 0, top_h - h_offset - h);
+    {
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (b_hdb) {
+        uint16_t *dst_tmp = dst16 + h_offset * dst_stride;
+        uint16_t *dst2_tmp = dst216 + h_offset * dst2_stride;
+        generate_1dmask(h, mask + h_offset,
+                        plane && xd->plane[plane].subsampling_y);
+
+        for (i = h_offset; i < h_offset + h; i++) {
+          for (j = 0; j < top_w; j++) {
+            const int m = mask[i];  assert(m >= 0 && m <= 64);
+            if (m == 64)
+              continue;
+
+            if (m == 0)
+              dst_tmp[j] = dst2_tmp[j];
+            else
+              dst_tmp[j] = (dst_tmp[j] * m + dst2_tmp[j] * (64 - m) + 32) >> 6;
+          }
+          dst_tmp += dst_stride;
+          dst2_tmp += dst2_stride;
+        }
+
+        for (; i < top_h; i ++) {
+          vpx_memcpy(dst_tmp, dst2_tmp, top_w * sizeof(uint16_t));
+          dst_tmp += dst_stride;
+          dst2_tmp += dst2_stride;
+        }
+      } else {
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+        uint8_t *dst_tmp = dst + h_offset * dst_stride;
+        uint8_t *dst2_tmp = dst2 + h_offset * dst2_stride;
+        generate_1dmask(h, mask + h_offset,
+                        plane && xd->plane[plane].subsampling_y);
+
+        for (i = h_offset; i < h_offset + h; i++) {
+          for (j = 0; j < top_w; j++) {
+            const int m = mask[i];  assert(m >= 0 && m <= 64);
+            if (m == 64)
+              continue;
+
+            if (m == 0)
+              dst_tmp[j] = dst2_tmp[j];
+            else
+              dst_tmp[j] = (dst_tmp[j] * m + dst2_tmp[j] * (64 - m) + 32) >> 6;
+          }
+          dst_tmp += dst_stride;
+          dst2_tmp += dst2_stride;
+        }
+
+        for (; i < top_h; i ++) {
+          vpx_memcpy(dst_tmp, dst2_tmp, top_w * sizeof(uint8_t));
+          dst_tmp += dst_stride;
+          dst2_tmp += dst2_stride;
+        }
+#if CONFIG_VP9_HIGHBITDEPTH
+      }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    }
+
       break;
     case PARTITION_VERT:
-      generate_1dmask(w, mask + w_offset);
-      vpx_memset(mask, 64, w_offset);
-      vpx_memset(mask + w_offset + w, 0, top_w - w_offset - w);
-      break;
-    default:
-      assert(0);
-  }
+    {
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (b_hdb) {
+        uint16_t *dst_tmp = dst16;
+        uint16_t *dst2_tmp = dst216;
+        generate_1dmask(w, mask + w_offset,
+                        plane && xd->plane[plane].subsampling_x);
+
+        for (i = 0; i < top_h; i++) {
+          for (j = w_offset; j < w_offset + w; j++) {
+            const int m = mask[j];   assert(m >= 0 && m <= 64);
+            if (m == 64)
+              continue;
+
+            if (m == 0)
+              dst_tmp[j] = dst2_tmp[j];
+            else
+              dst_tmp[j] = (dst_tmp[j] * m + dst2_tmp[j] * (64 - m) + 32) >> 6;
+          }
+          vpx_memcpy(dst_tmp + j, dst2_tmp + j,
+                     (top_w - w_offset - w) * sizeof(uint16_t));
+          dst_tmp += dst_stride;
+          dst2_tmp += dst2_stride;
+        }
+      } else {
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+        uint8_t *dst_tmp = dst;
+        uint8_t *dst2_tmp = dst2;
+        generate_1dmask(w, mask + w_offset,
+                        plane && xd->plane[plane].subsampling_x);
+
+        for (i = 0; i < top_h; i++) {
+          for (j = w_offset; j < w_offset + w; j++) {
+            const int m = mask[j];   assert(m >= 0 && m <= 64);
+            if (m == 64)
+              continue;
+
+            if (m == 0)
+              dst_tmp[j] = dst2_tmp[j];
+            else
+              dst_tmp[j] = (dst_tmp[j] * m + dst2_tmp[j] * (64 - m) + 32) >> 6;
+          }
+            vpx_memcpy(dst_tmp + j, dst2_tmp + j,
+                       (top_w - w_offset - w) * sizeof(uint8_t));
+          dst_tmp += dst_stride;
+          dst2_tmp += dst2_stride;
+        }
 #if CONFIG_VP9_HIGHBITDEPTH
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    uint16_t *dst16= CONVERT_TO_SHORTPTR(dst);
-    uint16_t *dst216 = CONVERT_TO_SHORTPTR(dst2);
-    for (i = 0; i < top_h; ++i) {
-      for (j = 0; j < top_w; ++j) {
-        const int m = (partition == PARTITION_HORZ ? mask[i] : mask[j]);
-        if (m == 64)
-          continue;
-        else if (m == 0)
-          dst16[i * dst_stride + j] = dst216[i * dst2_stride + j];
-        else
-          dst16[i * dst_stride + j] = (dst16[i * dst_stride + j] * m +
-                                       dst216[i * dst2_stride + j] * (64 - m) +
-                                       32) >> 6;
       }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
     }
-    return;
+      break;
+    default:
+      assert(0);
   }
-#endif  // CONFIG_VP9_HIGHBITDEPTH
   (void) xd;
-  for (i = 0; i < top_h; ++i) {
-    for (j = 0; j < top_w; ++j) {
-      const int m = (partition == PARTITION_HORZ ? mask[i] : mask[j]);
-      if (m == 64)
-        continue;
-      else if (m == 0)
-        dst[i * dst_stride + j] = dst2[i * dst2_stride + j];
-      else
-        dst[i * dst_stride + j] = (dst[i * dst_stride + j] * m +
-                                   dst2[i * dst2_stride + j] * (64 - m) +
-                                   32) >> 6;
-    }
+}
+
+void vp9_build_inter_predictors_sb_sub8x8(MACROBLOCKD *xd,
+                                          int mi_row, int mi_col,
+                                          BLOCK_SIZE bsize, int block) {
+  // Prediction function used in supertx:
+  // Use the mv at current block (which is less than 8x8)
+  // to get prediction of a block located at (mi_row, mi_col) at size of bsize
+  // bsize can be larger than 8x8.
+  // block (0-3): the sub8x8 location of current block
+  int plane;
+  const int mi_x = mi_col * MI_SIZE;
+  const int mi_y = mi_row * MI_SIZE;
+
+  // For sub8x8 uv:
+  // Skip uv prediction in supertx except the first block (block = 0)
+  int max_plane = block ? 1 : MAX_MB_PLANE;
+
+  for (plane = 0; plane < max_plane; plane++) {
+    const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize,
+                                                        &xd->plane[plane]);
+    const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
+    const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
+    const int bw = 4 * num_4x4_w;
+    const int bh = 4 * num_4x4_h;
+
+    build_inter_predictors(xd, plane, block, bw, bh,
+                           0, 0, bw, bh,
+#if CONFIG_SUPERTX && CONFIG_WEDGE_PARTITION
+                            0, 0,
+#endif
+                           mi_x, mi_y);
   }
+#if CONFIG_INTERINTRA
+  if (xd->mi[0].src_mi->mbmi.ref_frame[1] == INTRA_FRAME &&
+#if CONFIG_INTRABC
+      xd->mi[0].src_mi->mbmi.ref_frame[0] != INTRA_FRAME &&
+#endif  // CONFIG_INTRABC
+      is_interintra_allowed(xd->mi[0].src_mi->mbmi.sb_type))
+    vp9_build_interintra_predictors(xd, xd->plane[0].dst.buf,
+                                    xd->plane[1].dst.buf, xd->plane[2].dst.buf,
+                                    xd->plane[0].dst.stride,
+                                    xd->plane[1].dst.stride,
+                                    xd->plane[2].dst.stride, bsize);
+#endif  // CONFIG_INTERINTRA
 }
 
 #if CONFIG_WEDGE_PARTITION
@@ -1071,248 +1218,48 @@ void vp9_build_inter_predictors_sb_extend(MACROBLOCKD *xd,
       for (y = 0; y < num_4x4_h; ++y)
         for (x = 0; x < num_4x4_w; ++x)
            build_inter_predictors(xd, plane, i++, bw, bh, 4 * x, 4 * y, 4, 4,
-                                  wedge_offset_x, wedge_offset_y, mi_x, mi_y);
+                            wedge_offset_x >> (xd->plane[plane].subsampling_x),
+                            wedge_offset_y >> (xd->plane[plane].subsampling_y),
+                            mi_x, mi_y);
     } else {
       build_inter_predictors(xd, plane, 0, bw, bh, 0, 0, bw, bh,
-                             wedge_offset_x, wedge_offset_y, mi_x, mi_y);
+                            wedge_offset_x >> (xd->plane[plane].subsampling_x),
+                            wedge_offset_y >> (xd->plane[plane].subsampling_y),
+                            mi_x, mi_y);
     }
   }
 }
-#endif  // CONFIG_WEDGE_PARTITION
-
-void vp9_build_inter_predictors_sby_sub8x8_extend(
+void vp9_build_inter_predictors_sb_sub8x8_extend(
     MACROBLOCKD *xd,
     int mi_row, int mi_col,
-    int mi_row_ori,
-    int mi_col_ori,
-    BLOCK_SIZE top_bsize,
-    PARTITION_TYPE partition) {
+    int mi_row_ori, int mi_col_ori,
+    BLOCK_SIZE bsize, int block) {
+  // Sub8x8 prediction for wedge partition in supertx
+  int plane;
   const int mi_x = mi_col_ori * MI_SIZE;
   const int mi_y = mi_row_ori * MI_SIZE;
-#if CONFIG_WEDGE_PARTITION
   const int wedge_offset_x = (mi_col - mi_col_ori) * MI_SIZE;
   const int wedge_offset_y = (mi_row - mi_row_ori) * MI_SIZE;
-#endif  // CONFIG_WEDGE_PARTITION
-  uint8_t *orig_dst;
-  int orig_dst_stride;
-  int bw = 4 << b_width_log2_lookup[top_bsize];
-  int bh = 4 << b_height_log2_lookup[top_bsize];
-#if CONFIG_VP9_HIGHBITDEPTH
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp_buf, 2 * MAXTXLEN * MAXTXLEN);
-    DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp_buf1, 2 * MAXTXLEN * MAXTXLEN);
-    DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp_buf2, 2 * MAXTXLEN * MAXTXLEN);
-
-    orig_dst = xd->plane[0].dst.buf;
-    orig_dst_stride = xd->plane[0].dst.stride;
-    build_inter_predictors(xd, 0, 0, bw, bh, 0, 0, bw, bh,
-#if CONFIG_WEDGE_PARTITION
-                           wedge_offset_x, wedge_offset_y,
-#endif
-                           mi_x, mi_y);
-
-    xd->plane[0].dst.buf = CONVERT_TO_BYTEPTR(tmp_buf);
-    xd->plane[0].dst.stride = MAXTXLEN;
-    switch (partition) {
-      case PARTITION_HORZ:
-        build_inter_predictors(xd, 0, 2, bw, bh, 0, 0, bw, bh,
-#if CONFIG_WEDGE_PARTITION
-                               wedge_offset_x, wedge_offset_y,
-#endif
-                               mi_x, mi_y);
-        break;
-      case PARTITION_VERT:
-        build_inter_predictors(xd, 0, 1, bw, bh, 0, 0, bw, bh,
-#if CONFIG_WEDGE_PARTITION
-                               wedge_offset_x, wedge_offset_y,
-#endif
-                               mi_x, mi_y);
-        break;
-      case PARTITION_SPLIT:
-        build_inter_predictors(xd, 0, 1, bw, bh, 0, 0, bw, bh,
-#if CONFIG_WEDGE_PARTITION
-                               wedge_offset_x, wedge_offset_y,
-#endif
-                               mi_x, mi_y);
-        xd->plane[0].dst.buf = CONVERT_TO_BYTEPTR(tmp_buf1);
-        xd->plane[0].dst.stride = MAXTXLEN;
-        build_inter_predictors(xd, 0, 2, bw, bh, 0, 0, bw, bh,
-#if CONFIG_WEDGE_PARTITION
-                               wedge_offset_x, wedge_offset_y,
-#endif
-                               mi_x, mi_y);
-        xd->plane[0].dst.buf = CONVERT_TO_BYTEPTR(tmp_buf2);
-        xd->plane[0].dst.stride = MAXTXLEN;
-        build_inter_predictors(xd, 0, 3, bw, bh, 0, 0, bw, bh,
-#if CONFIG_WEDGE_PARTITION
-                               wedge_offset_x, wedge_offset_y,
-#endif
-                               mi_x, mi_y);
-        break;
-      default:
-        assert(0);
-    }
-    if (partition != PARTITION_SPLIT) {
-      vp9_build_masked_inter_predictor_complex(
-          xd,
-          orig_dst, orig_dst_stride,
-          CONVERT_TO_BYTEPTR(tmp_buf), MAXTXLEN,
-          &xd->plane[0], mi_row, mi_col,
-          mi_row_ori, mi_col_ori,
-          BLOCK_8X8, top_bsize,
-          partition);
-    } else {
-      vp9_build_masked_inter_predictor_complex(
-          xd,
-          orig_dst, orig_dst_stride,
-          CONVERT_TO_BYTEPTR(tmp_buf), MAXTXLEN,
-          &xd->plane[0], mi_row, mi_col,
-          mi_row_ori, mi_col_ori,
-          BLOCK_8X8, top_bsize,
-          PARTITION_VERT);
-      vp9_build_masked_inter_predictor_complex(
-          xd,
-          CONVERT_TO_BYTEPTR(tmp_buf1), MAXTXLEN,
-          CONVERT_TO_BYTEPTR(tmp_buf2), MAXTXLEN,
-          &xd->plane[0], mi_row, mi_col,
-          mi_row_ori, mi_col_ori,
-          BLOCK_8X8, top_bsize,
-          PARTITION_VERT);
-      vp9_build_masked_inter_predictor_complex(
-          xd,
-          orig_dst, orig_dst_stride,
-          CONVERT_TO_BYTEPTR(tmp_buf1), MAXTXLEN,
-          &xd->plane[0], mi_row, mi_col,
-          mi_row_ori, mi_col_ori,
-          BLOCK_8X8, top_bsize,
-          PARTITION_HORZ);
-    }
-    xd->plane[0].dst.buf = orig_dst;
-    xd->plane[0].dst.stride = orig_dst_stride;
-    return;
-  }
-#endif  // CONFIG_VP9_HIGHBITDEPTH
 
-  {
-    DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp_buf, MAXTXLEN * MAXTXLEN);
-    DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp_buf1, MAXTXLEN * MAXTXLEN);
-    DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp_buf2, MAXTXLEN * MAXTXLEN);
-
-    orig_dst = xd->plane[0].dst.buf;
-    orig_dst_stride = xd->plane[0].dst.stride;
-    build_inter_predictors(xd, 0, 0, bw, bh, 0, 0, bw, bh,
-#if CONFIG_WEDGE_PARTITION
-                           wedge_offset_x, wedge_offset_y,
-#endif
-                           mi_x, mi_y);
+  // For sub8x8 uv:
+  // Skip uv prediction in supertx except the first block (block = 0)
+  int max_plane = block ? 1 : MAX_MB_PLANE;
 
-    xd->plane[0].dst.buf = tmp_buf;
-    xd->plane[0].dst.stride = MAXTXLEN;
-    switch (partition) {
-      case PARTITION_HORZ:
-        build_inter_predictors(xd, 0, 2, bw, bh, 0, 0, bw, bh,
-#if CONFIG_WEDGE_PARTITION
-                               wedge_offset_x, wedge_offset_y,
-#endif
-                               mi_x, mi_y);
-        break;
-      case PARTITION_VERT:
-        build_inter_predictors(xd, 0, 1, bw, bh, 0, 0, bw, bh,
-#if CONFIG_WEDGE_PARTITION
-                               wedge_offset_x, wedge_offset_y,
-#endif
-                               mi_x, mi_y);
-        break;
-      case PARTITION_SPLIT:
-        build_inter_predictors(xd, 0, 1, bw, bh, 0, 0, bw, bh,
-#if CONFIG_WEDGE_PARTITION
-                               wedge_offset_x, wedge_offset_y,
-#endif
-                               mi_x, mi_y);
-        xd->plane[0].dst.buf = tmp_buf1;
-        xd->plane[0].dst.stride = MAXTXLEN;
-        build_inter_predictors(xd, 0, 2, bw, bh, 0, 0, bw, bh,
-#if CONFIG_WEDGE_PARTITION
-                               wedge_offset_x, wedge_offset_y,
-#endif
-                               mi_x, mi_y);
-        xd->plane[0].dst.buf = tmp_buf2;
-        xd->plane[0].dst.stride = MAXTXLEN;
-        build_inter_predictors(xd, 0, 3, bw, bh, 0, 0, bw, bh,
-#if CONFIG_WEDGE_PARTITION
-                               wedge_offset_x, wedge_offset_y,
-#endif
-                               mi_x, mi_y);
-        break;
-      default:
-        assert(0);
-    }
-
-    if (partition != PARTITION_SPLIT) {
-      vp9_build_masked_inter_predictor_complex(xd,
-                                               orig_dst, orig_dst_stride,
-                                               tmp_buf, MAXTXLEN,
-                                               &xd->plane[0], mi_row, mi_col,
-                                               mi_row_ori, mi_col_ori,
-                                               BLOCK_8X8, top_bsize,
-                                               partition);
-    } else {
-      vp9_build_masked_inter_predictor_complex(xd,
-                                               orig_dst, orig_dst_stride,
-                                               tmp_buf, MAXTXLEN,
-                                               &xd->plane[0], mi_row, mi_col,
-                                               mi_row_ori, mi_col_ori,
-                                               BLOCK_8X8, top_bsize,
-                                               PARTITION_VERT);
-      vp9_build_masked_inter_predictor_complex(xd,
-                                               tmp_buf1, MAXTXLEN,
-                                               tmp_buf2, MAXTXLEN,
-                                               &xd->plane[0], mi_row, mi_col,
-                                               mi_row_ori, mi_col_ori,
-                                               BLOCK_8X8, top_bsize,
-                                               PARTITION_VERT);
-      vp9_build_masked_inter_predictor_complex(xd,
-                                               orig_dst, orig_dst_stride,
-                                               tmp_buf1, MAXTXLEN,
-                                               &xd->plane[0], mi_row, mi_col,
-                                               mi_row_ori, mi_col_ori,
-                                               BLOCK_8X8, top_bsize,
-                                               PARTITION_HORZ);
-    }
-    xd->plane[0].dst.buf = orig_dst;
-    xd->plane[0].dst.stride = orig_dst_stride;
-  }
-}
-
-void vp9_build_inter_predictors_sbuv_sub8x8_extend(MACROBLOCKD *xd,
-#if CONFIG_WEDGE_PARTITION
-                                                   int mi_row, int mi_col,
-#endif
-                                                   int mi_row_ori,
-                                                   int mi_col_ori,
-                                                   BLOCK_SIZE top_bsize) {
-  int plane;
-  const int mi_x = mi_col_ori * MI_SIZE;
-  const int mi_y = mi_row_ori * MI_SIZE;
-#if CONFIG_WEDGE_PARTITION
-  const int wedge_offset_x = (mi_col - mi_col_ori) * MI_SIZE;
-  const int wedge_offset_y = (mi_row - mi_row_ori) * MI_SIZE;
-#endif
-  for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
-    const BLOCK_SIZE plane_bsize = get_plane_block_size(top_bsize,
+  for (plane = 0; plane < max_plane; ++plane) {
+    const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize,
                                                         &xd->plane[plane]);
     const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
     const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
     const int bw = 4 * num_4x4_w;
     const int bh = 4 * num_4x4_h;
 
-    build_inter_predictors(xd, plane, 0, bw, bh, 0, 0, bw, bh,
-#if CONFIG_WEDGE_PARTITION
-                           wedge_offset_x, wedge_offset_y,
-#endif
+    build_inter_predictors(xd, plane, block, bw, bh, 0, 0, bw, bh,
+                           wedge_offset_x >> (xd->plane[plane].subsampling_x),
+                           wedge_offset_y >> (xd->plane[plane].subsampling_y),
                            mi_x, mi_y);
   }
 }
+#endif  // CONFIG_WEDGE_PARTITION
 #endif  // CONFIG_SUPERTX
 
 // TODO(jingning): This function serves as a placeholder for decoder prediction
@@ -1698,6 +1645,48 @@ void vp9_dec_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col,
 }
 
 #if CONFIG_SUPERTX
+void vp9_dec_build_inter_predictors_sb_sub8x8(MACROBLOCKD *xd,
+                                       int mi_row, int mi_col,
+                                       BLOCK_SIZE bsize, int block) {
+  // Prediction function used in supertx:
+  // Use the mv at current block (which is less than 8x8)
+  // to get prediction of a block located at (mi_row, mi_col) at size of bsize
+  // bsize can be larger than 8x8.
+  // block (0-3): the sub8x8 location of current block
+  int plane;
+  const int mi_x = mi_col * MI_SIZE;
+  const int mi_y = mi_row * MI_SIZE;
+
+  // For sub8x8 uv:
+  // Skip uv prediction in supertx except the first block (block = 0)
+  int max_plane = (block) ? 1 : MAX_MB_PLANE;
+
+  for (plane = 0; plane < max_plane; plane++) {
+    const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize,
+                                                        &xd->plane[plane]);
+    const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
+    const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
+    const int bw = 4 * num_4x4_w;
+    const int bh = 4 * num_4x4_h;
+
+    dec_build_inter_predictors(xd, plane, block, bw, bh,
+                                     0, 0, bw, bh,
+#if CONFIG_SUPERTX && CONFIG_WEDGE_PARTITION
+                                     0, 0,
+#endif
+                                     mi_x, mi_y);
+  }
+#if CONFIG_INTERINTRA  // not sure
+  if (xd->mi[0].src_mi->mbmi.ref_frame[1] == INTRA_FRAME &&
+      is_interintra_allowed(xd->mi[0].src_mi->mbmi.sb_type))
+    vp9_build_interintra_predictors(xd, xd->plane[0].dst.buf,
+                                    xd->plane[1].dst.buf, xd->plane[2].dst.buf,
+                                    xd->plane[0].dst.stride,
+                                    xd->plane[1].dst.stride,
+                                    xd->plane[2].dst.stride, bsize);
+#endif  // CONFIG_INTERINTRA
+}
+
 #if CONFIG_WEDGE_PARTITION
 void vp9_dec_build_inter_predictors_sb_extend(MACROBLOCKD *xd,
                                               int mi_row, int mi_col,
@@ -1721,252 +1710,55 @@ void vp9_dec_build_inter_predictors_sb_extend(MACROBLOCKD *xd,
       assert(bsize == BLOCK_8X8);
       for (y = 0; y < num_4x4_h; ++y)
         for (x = 0; x < num_4x4_w; ++x)
-          dec_build_inter_predictors(xd, plane, i++, bw, bh, 4 * x, 4 * y, 4, 4,
-                                     wedge_offset_x, wedge_offset_y,
-                                     mi_x, mi_y);
+          dec_build_inter_predictors(
+              xd, plane, i++, bw, bh, 4 * x, 4 * y, 4, 4,
+              wedge_offset_x >> (xd->plane[plane].subsampling_x),
+              wedge_offset_y >> (xd->plane[plane].subsampling_y),
+              mi_x, mi_y);
     } else {
-      dec_build_inter_predictors(xd, plane, 0, bw, bh, 0, 0, bw, bh,
-                                 wedge_offset_x, wedge_offset_y,
-                                 mi_x, mi_y);
+      dec_build_inter_predictors(
+          xd, plane, 0, bw, bh, 0, 0, bw, bh,
+          wedge_offset_x >> (xd->plane[plane].subsampling_x),
+          wedge_offset_y >> (xd->plane[plane].subsampling_y),
+          mi_x, mi_y);
     }
   }
 }
-#endif  // CONFIG_WEDGE_PARTITION
 
-void vp9_dec_build_inter_predictors_sby_sub8x8_extend(
+void vp9_dec_build_inter_predictors_sb_sub8x8_extend(
     MACROBLOCKD *xd,
     int mi_row, int mi_col,
-    int mi_row_ori,
-    int mi_col_ori,
-    BLOCK_SIZE top_bsize,
-    PARTITION_TYPE partition) {
+    int mi_row_ori, int mi_col_ori,
+    BLOCK_SIZE bsize, int block) {
+  // Sub8x8 prediction for wedge partition in supertx
+  int plane;
   const int mi_x = mi_col_ori * MI_SIZE;
   const int mi_y = mi_row_ori * MI_SIZE;
-#if CONFIG_WEDGE_PARTITION
   const int wedge_offset_x = (mi_col - mi_col_ori) * MI_SIZE;
   const int wedge_offset_y = (mi_row - mi_row_ori) * MI_SIZE;
-#endif
-  uint8_t *orig_dst;
-  int orig_dst_stride;
-  int bw = 4 << b_width_log2_lookup[top_bsize];
-  int bh = 4 << b_height_log2_lookup[top_bsize];
-#if CONFIG_VP9_HIGHBITDEPTH
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp_buf, 2 * MAXTXLEN * MAXTXLEN);
-    DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp_buf1, 2 * MAXTXLEN * MAXTXLEN);
-    DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp_buf2, 2 * MAXTXLEN * MAXTXLEN);
-
-    orig_dst = xd->plane[0].dst.buf;
-    orig_dst_stride = xd->plane[0].dst.stride;
-    dec_build_inter_predictors(xd, 0, 0, bw, bh, 0, 0, bw, bh,
-#if CONFIG_WEDGE_PARTITION
-                               wedge_offset_x, wedge_offset_y,
-#endif
-                               mi_x, mi_y);
-
-    xd->plane[0].dst.buf = CONVERT_TO_BYTEPTR(tmp_buf);
-    xd->plane[0].dst.stride = MAXTXLEN;
-    switch (partition) {
-      case PARTITION_HORZ:
-        dec_build_inter_predictors(xd, 0, 2, bw, bh, 0, 0, bw, bh,
-#if CONFIG_WEDGE_PARTITION
-                                   wedge_offset_x, wedge_offset_y,
-#endif
-                                   mi_x, mi_y);
-        break;
-      case PARTITION_VERT:
-        dec_build_inter_predictors(xd, 0, 1, bw, bh, 0, 0, bw, bh,
-#if CONFIG_WEDGE_PARTITION
-                                   wedge_offset_x, wedge_offset_y,
-#endif
-                                   mi_x, mi_y);
-        break;
-      case PARTITION_SPLIT:
-        dec_build_inter_predictors(xd, 0, 1, bw, bh, 0, 0, bw, bh,
-#if CONFIG_WEDGE_PARTITION
-                                   wedge_offset_x, wedge_offset_y,
-#endif
-                                   mi_x, mi_y);
-        xd->plane[0].dst.buf = CONVERT_TO_BYTEPTR(tmp_buf1);
-        xd->plane[0].dst.stride = MAXTXLEN;
-        dec_build_inter_predictors(xd, 0, 2, bw, bh, 0, 0, bw, bh,
-#if CONFIG_WEDGE_PARTITION
-                                   wedge_offset_x, wedge_offset_y,
-#endif
-                                   mi_x, mi_y);
-        xd->plane[0].dst.buf = CONVERT_TO_BYTEPTR(tmp_buf2);
-        xd->plane[0].dst.stride = MAXTXLEN;
-        dec_build_inter_predictors(xd, 0, 3, bw, bh, 0, 0, bw, bh,
-#if CONFIG_WEDGE_PARTITION
-                                   wedge_offset_x, wedge_offset_y,
-#endif
-                                   mi_x, mi_y);
-        break;
-      default:
-        assert(0);
-    }
-
-    if (partition != PARTITION_SPLIT) {
-      vp9_build_masked_inter_predictor_complex(
-          xd,
-          orig_dst, orig_dst_stride,
-          CONVERT_TO_BYTEPTR(tmp_buf), MAXTXLEN,
-          &xd->plane[0], mi_row, mi_col,
-          mi_row_ori, mi_col_ori,
-          BLOCK_8X8, top_bsize,
-          partition);
-    } else {
-      vp9_build_masked_inter_predictor_complex(
-          xd,
-          orig_dst, orig_dst_stride,
-          CONVERT_TO_BYTEPTR(tmp_buf), MAXTXLEN,
-          &xd->plane[0], mi_row, mi_col,
-          mi_row_ori, mi_col_ori,
-          BLOCK_8X8, top_bsize,
-          PARTITION_VERT);
-      vp9_build_masked_inter_predictor_complex(
-          xd,
-          CONVERT_TO_BYTEPTR(tmp_buf1), MAXTXLEN,
-          CONVERT_TO_BYTEPTR(tmp_buf2), MAXTXLEN,
-          &xd->plane[0], mi_row, mi_col,
-          mi_row_ori, mi_col_ori,
-          BLOCK_8X8, top_bsize,
-          PARTITION_VERT);
-      vp9_build_masked_inter_predictor_complex(
-          xd,
-          orig_dst, orig_dst_stride,
-          CONVERT_TO_BYTEPTR(tmp_buf1), MAXTXLEN,
-          &xd->plane[0], mi_row, mi_col,
-          mi_row_ori, mi_col_ori,
-          BLOCK_8X8, top_bsize,
-          PARTITION_HORZ);
-    }
-    xd->plane[0].dst.buf = orig_dst;
-    xd->plane[0].dst.stride = orig_dst_stride;
-    return;
-  }
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-
-  {
-    DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp_buf, MAXTXLEN * MAXTXLEN);
-    DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp_buf1, MAXTXLEN * MAXTXLEN);
-    DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp_buf2, MAXTXLEN * MAXTXLEN);
-
-    orig_dst = xd->plane[0].dst.buf;
-    orig_dst_stride = xd->plane[0].dst.stride;
-    dec_build_inter_predictors(xd, 0, 0, bw, bh, 0, 0, bw, bh,
-#if CONFIG_WEDGE_PARTITION
-                               wedge_offset_x, wedge_offset_y,
-#endif
-                               mi_x, mi_y);
-
-    xd->plane[0].dst.buf = tmp_buf;
-    xd->plane[0].dst.stride = MAXTXLEN;
-    switch (partition) {
-      case PARTITION_HORZ:
-        dec_build_inter_predictors(xd, 0, 2, bw, bh, 0, 0, bw, bh,
-#if CONFIG_WEDGE_PARTITION
-                                   wedge_offset_x, wedge_offset_y,
-#endif
-                                   mi_x, mi_y);
-        break;
-      case PARTITION_VERT:
-        dec_build_inter_predictors(xd, 0, 1, bw, bh, 0, 0, bw, bh,
-#if CONFIG_WEDGE_PARTITION
-                                   wedge_offset_x, wedge_offset_y,
-#endif
-                                   mi_x, mi_y);
-        break;
-      case PARTITION_SPLIT:
-        dec_build_inter_predictors(xd, 0, 1, bw, bh, 0, 0, bw, bh,
-#if CONFIG_WEDGE_PARTITION
-                                   wedge_offset_x, wedge_offset_y,
-#endif
-                                   mi_x, mi_y);
-        xd->plane[0].dst.buf = tmp_buf1;
-        xd->plane[0].dst.stride = MAXTXLEN;
-        dec_build_inter_predictors(xd, 0, 2, bw, bh, 0, 0, bw, bh,
-#if CONFIG_WEDGE_PARTITION
-                                   wedge_offset_x, wedge_offset_y,
-#endif
-                                   mi_x, mi_y);
-        xd->plane[0].dst.buf = tmp_buf2;
-        xd->plane[0].dst.stride = MAXTXLEN;
-        dec_build_inter_predictors(xd, 0, 3, bw, bh, 0, 0, bw, bh,
-#if CONFIG_WEDGE_PARTITION
-                                   wedge_offset_x, wedge_offset_y,
-#endif
-                                   mi_x, mi_y);
-        break;
-      default:
-        assert(0);
-    }
 
-    if (partition != PARTITION_SPLIT) {
-      vp9_build_masked_inter_predictor_complex(xd,
-                                               orig_dst, orig_dst_stride,
-                                               tmp_buf, MAXTXLEN,
-                                               &xd->plane[0], mi_row, mi_col,
-                                               mi_row_ori, mi_col_ori,
-                                               BLOCK_8X8, top_bsize,
-                                               partition);
-    } else {
-      vp9_build_masked_inter_predictor_complex(xd,
-                                               orig_dst, orig_dst_stride,
-                                               tmp_buf, MAXTXLEN,
-                                               &xd->plane[0], mi_row, mi_col,
-                                               mi_row_ori, mi_col_ori,
-                                               BLOCK_8X8, top_bsize,
-                                               PARTITION_VERT);
-      vp9_build_masked_inter_predictor_complex(xd,
-                                               tmp_buf1, MAXTXLEN,
-                                               tmp_buf2, MAXTXLEN,
-                                               &xd->plane[0], mi_row, mi_col,
-                                               mi_row_ori, mi_col_ori,
-                                               BLOCK_8X8, top_bsize,
-                                               PARTITION_VERT);
-      vp9_build_masked_inter_predictor_complex(xd,
-                                               orig_dst, orig_dst_stride,
-                                               tmp_buf1, MAXTXLEN,
-                                               &xd->plane[0], mi_row, mi_col,
-                                               mi_row_ori, mi_col_ori,
-                                               BLOCK_8X8, top_bsize,
-                                               PARTITION_HORZ);
-    }
-    xd->plane[0].dst.buf = orig_dst;
-    xd->plane[0].dst.stride = orig_dst_stride;
-  }
-}
+  // For sub8x8 uv:
+  // Skip uv prediction in supertx except the first block (block = 0)
+  int max_plane = block ? 1 : MAX_MB_PLANE;
 
-void vp9_dec_build_inter_predictors_sbuv_sub8x8_extend(MACROBLOCKD *xd,
-#if CONFIG_WEDGE_PARTITION
-                                                       int mi_row, int mi_col,
-#endif
-                                                       int mi_row_ori,
-                                                       int mi_col_ori,
-                                                       BLOCK_SIZE top_bsize) {
-  int plane;
-  const int mi_x = mi_col_ori * MI_SIZE;
-  const int mi_y = mi_row_ori * MI_SIZE;
-#if CONFIG_WEDGE_PARTITION
-  const int wedge_offset_x = (mi_col - mi_col_ori) * MI_SIZE;
-  const int wedge_offset_y = (mi_row - mi_row_ori) * MI_SIZE;
-#endif
-  for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
-    const BLOCK_SIZE plane_bsize = get_plane_block_size(top_bsize,
+  for (plane = 0; plane < max_plane; ++plane) {
+    const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize,
                                                         &xd->plane[plane]);
     const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
     const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
     const int bw = 4 * num_4x4_w;
     const int bh = 4 * num_4x4_h;
 
-    dec_build_inter_predictors(xd, plane, 0, bw, bh, 0, 0, bw, bh,
-#if CONFIG_WEDGE_PARTITION
-                               wedge_offset_x, wedge_offset_y,
-#endif
-                               mi_x, mi_y);
+    dec_build_inter_predictors(
+        xd, plane, block, bw, bh, 0, 0, bw, bh,
+        wedge_offset_x >> (xd->plane[plane].subsampling_x),
+        wedge_offset_y >> (xd->plane[plane].subsampling_y),
+        mi_x, mi_y);
   }
 }
+
+#endif  // CONFIG_WEDGE_PARTITION
+
 #endif  // CONFIG_SUPERTX
 
 void vp9_setup_dst_planes(struct macroblockd_plane planes[MAX_MB_PLANE],
index 10b18cc2e52ed7db84cf28c157b2d45fb8ab2f87..dcae0126fcb26e734b7ef3ebd3ddf2148db7a08b 100644 (file)
@@ -30,6 +30,15 @@ void vp9_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col,
 void vp9_dec_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col,
                                        BLOCK_SIZE bsize);
 
+#if CONFIG_SUPERTX
+void vp9_build_inter_predictors_sb_sub8x8(MACROBLOCKD *xd,
+                                          int mi_row, int mi_col,
+                                          BLOCK_SIZE bsize, int block);
+void vp9_dec_build_inter_predictors_sb_sub8x8(MACROBLOCKD *xd,
+                                       int mi_row, int mi_col,
+                                       BLOCK_SIZE bsize, int block);
+#endif
+
 void vp9_build_inter_predictor(const uint8_t *src, int src_stride,
                                uint8_t *dst, int dst_stride,
                                const MV *mv_q3,
@@ -88,39 +97,15 @@ void vp9_generate_hard_mask(int wedge_index, BLOCK_SIZE sb_type,
 #endif  // CONFIG_WEDGE_PARTITION
 
 #if CONFIG_SUPERTX
+
 struct macroblockd_plane;
-void vp9_build_inter_predictors_sby_sub8x8_extend(MACROBLOCKD *xd,
-                                                  int mi_row, int mi_col,
-                                                  int mi_row_ori,
-                                                  int mi_col_ori,
-                                                  BLOCK_SIZE top_bsize,
-                                                  PARTITION_TYPE partition);
-void vp9_build_inter_predictors_sbuv_sub8x8_extend(MACROBLOCKD *xd,
-#if CONFIG_WEDGE_PARTITION
-                                                   int mi_row, int mi_col,
-#endif
-                                                   int mi_row_ori,
-                                                   int mi_col_ori,
-                                                   BLOCK_SIZE top_bsize);
+
 void vp9_build_masked_inter_predictor_complex(
     MACROBLOCKD *xd,
     uint8_t *dst, int dst_stride, uint8_t *dst2, int dst2_stride,
     const struct macroblockd_plane *pd, int mi_row, int mi_col,
     int mi_row_ori, int mi_col_ori, BLOCK_SIZE bsize, BLOCK_SIZE top_bsize,
-    PARTITION_TYPE partition);
-void vp9_dec_build_inter_predictors_sby_sub8x8_extend(MACROBLOCKD *xd,
-                                                      int mi_row, int mi_col,
-                                                      int mi_row_ori,
-                                                      int mi_col_ori,
-                                                      BLOCK_SIZE top_bsize,
-                                                      PARTITION_TYPE p);
-void vp9_dec_build_inter_predictors_sbuv_sub8x8_extend(MACROBLOCKD *xd,
-#if CONFIG_WEDGE_PARTITION
-                                                       int mi_row, int mi_col,
-#endif
-                                                       int mi_row_ori,
-                                                       int mi_col_ori,
-                                                       BLOCK_SIZE top_bsize);
+    PARTITION_TYPE partition, int plane);
 
 #if CONFIG_WEDGE_PARTITION
 void vp9_build_inter_predictors_sb_extend(MACROBLOCKD *xd,
@@ -131,6 +116,18 @@ void vp9_dec_build_inter_predictors_sb_extend(MACROBLOCKD *xd,
                                               int mi_row, int mi_col,
                                               int mi_row_ori, int mi_col_ori,
                                               BLOCK_SIZE bsize);
+
+void vp9_build_inter_predictors_sb_sub8x8_extend(
+    MACROBLOCKD *xd,
+    int mi_row, int mi_col,
+    int mi_row_ori, int mi_col_ori,
+    BLOCK_SIZE bsize, int block);
+void vp9_dec_build_inter_predictors_sb_sub8x8_extend(
+    MACROBLOCKD *xd,
+    int mi_row, int mi_col,
+    int mi_row_ori, int mi_col_ori,
+    BLOCK_SIZE bsize, int block);
+
 #endif  // CONFIG_WEDGE_PARTITION
 #endif  // CONFIG_SUPERTX
 
index 7f5befe58e3f480dc301aa13a32083051029040f..3891204744397230316baea5ea9db91899da9e6f 100644 (file)
@@ -746,17 +746,23 @@ static MB_MODE_INFO *set_offsets(VP9_COMMON *const cm, MACROBLOCKD *const xd,
 static MB_MODE_INFO *set_offsets_extend(VP9_COMMON *const cm,
                                         MACROBLOCKD *const xd,
                                         const TileInfo *const tile,
-                                        BLOCK_SIZE top_bsize,
-                                        int mi_row, int mi_col,
+                                        BLOCK_SIZE bsize_pred,
+                                        int mi_row_pred, int mi_col_pred,
                                         int mi_row_ori, int mi_col_ori) {
-  const int bw = num_8x8_blocks_wide_lookup[top_bsize];
-  const int bh = num_8x8_blocks_high_lookup[top_bsize];
-  const int offset = mi_row * cm->mi_stride + mi_col;
-
+  // Used in supertx: xd->mi is taken at ori, position and size from pred.
+  // (mi_row_ori, mi_col_ori): location for mv (indexes cm->mi)
+  // (mi_row_pred, mi_col_pred, bsize_pred): region to predict
+  const int bw = num_8x8_blocks_wide_lookup[bsize_pred];
+  const int bh = num_8x8_blocks_high_lookup[bsize_pred];
+  const int offset = mi_row_ori * cm->mi_stride + mi_col_ori;
   xd->mi = cm->mi + offset;
   xd->mi[0].src_mi = &xd->mi[0];
-  set_mi_row_col(xd, tile, mi_row_ori, bh, mi_col_ori, bw,
+  set_mi_row_col(xd, tile, mi_row_pred, bh, mi_col_pred, bw,
                  cm->mi_rows, cm->mi_cols);
+  // Neighbour availability is judged at ori, not at the predicted region.
+  xd->up_available    = (mi_row_ori != 0);
+  xd->left_available  = (mi_col_ori > tile->mi_col_start);
+
   return &xd->mi[0].mbmi;
 }
 
@@ -836,55 +842,184 @@ static void set_ref(VP9_COMMON *const cm, MACROBLOCKD *const xd,
   xd->corrupted |= ref_buffer->buf->corrupted;
 }
 
-static void dec_predict_b_extend(VP9_COMMON *const cm, MACROBLOCKD *const xd,
-                                 const TileInfo *const tile,
-                                 int mi_row, int mi_col,
-                                 int mi_row_ori, int mi_col_ori,
-                                 BLOCK_SIZE top_bsize) {
-  MB_MODE_INFO *mbmi = set_offsets_extend(cm, xd, tile, top_bsize,
-                                          mi_row, mi_col,
-                                          mi_row_ori, mi_col_ori);
-  set_ref(cm, xd, 0, mi_row_ori, mi_col_ori);
+static void dec_predict_b_extend(
+    VP9_COMMON *const cm, MACROBLOCKD *const xd,
+    const TileInfo *const tile, int block,
+    int mi_row_ori, int mi_col_ori,
+    int mi_row_pred, int mi_col_pred,
+    int mi_row_top, int mi_col_top,
+    uint8_t * dst_buf[3], int dst_stride[3],
+    BLOCK_SIZE bsize_top,
+    BLOCK_SIZE bsize_pred,
+    int b_sub8x8, int bextend) {
+  // Used in supertx: predicts one region into the caller's dst_buf planes.
+  // (mi_row_ori, mi_col_ori): location for mv
+  // (mi_row_pred, mi_col_pred, bsize_pred): region to predict
+  // (mi_row_top, mi_col_top, bsize_top): region of the top partition size
+  // block: sub location of sub8x8 blocks
+  // b_sub8x8: 1: ori is sub8x8; 0: ori is not sub8x8
+  // bextend: 1: region to predict is an extension of ori; 0: not
+  int r = (mi_row_pred - mi_row_top) * MI_SIZE;  // row offset of pred in top
+  int c = (mi_col_pred - mi_col_top) * MI_SIZE;  // col offset of pred in top
+  const int mi_width_top = num_8x8_blocks_wide_lookup[bsize_top];
+  const int mi_height_top = num_8x8_blocks_high_lookup[bsize_top];
+  MB_MODE_INFO *mbmi;
+  // Skip regions that start outside the top block or outside the frame.
+  if (mi_row_pred < mi_row_top || mi_col_pred < mi_col_top ||
+      mi_row_pred >= mi_row_top + mi_height_top ||
+      mi_col_pred >= mi_col_top + mi_width_top ||
+      mi_row_pred >= cm->mi_rows || mi_col_pred >= cm->mi_cols)
+    return;
+
+  mbmi = set_offsets_extend(cm, xd, tile, bsize_pred,
+                            mi_row_pred, mi_col_pred,
+                            mi_row_ori, mi_col_ori);
+  set_ref(cm, xd, 0, mi_row_pred, mi_col_pred);
   if (has_second_ref(&xd->mi[0].mbmi))
-    set_ref(cm, xd, 1, mi_row_ori, mi_col_ori);
-  mbmi->tx_size = b_width_log2_lookup[top_bsize];
+    set_ref(cm, xd, 1, mi_row_pred, mi_col_pred);
+  // An extension (bextend) leaves the stored tx_size untouched.
+  if (!bextend) {
+    mbmi->tx_size = b_width_log2_lookup[bsize_top];
+  }
+  // Aim every plane's dst at (r, c) inside dst_buf, scaled by subsampling.
+  xd->plane[0].dst.stride = dst_stride[0];
+  xd->plane[1].dst.stride = dst_stride[1];
+  xd->plane[2].dst.stride = dst_stride[2];
+  xd->plane[0].dst.buf = dst_buf[0] +
+                         (r >> xd->plane[0].subsampling_y) * dst_stride[0] +
+                         (c >> xd->plane[0].subsampling_x);
+  xd->plane[1].dst.buf = dst_buf[1] +
+                         (r >> xd->plane[1].subsampling_y) * dst_stride[1] +
+                         (c >> xd->plane[1].subsampling_x);
+  xd->plane[2].dst.buf = dst_buf[2] +
+                         (r >> xd->plane[2].subsampling_y) * dst_stride[2] +
+                         (c >> xd->plane[2].subsampling_x);
+  // Wedge path: callees take ori as (mi_row, mi_col), pred as (mi_*_ori).
 #if CONFIG_WEDGE_PARTITION
-  vp9_dec_build_inter_predictors_sb_extend(xd, mi_row, mi_col,
-                                           mi_row_ori, mi_col_ori, top_bsize);
+  if (!b_sub8x8)
+    vp9_dec_build_inter_predictors_sb_extend(xd, mi_row_ori, mi_col_ori,
+                                             mi_row_pred, mi_col_pred,
+                                             bsize_pred);
+  else
+    vp9_dec_build_inter_predictors_sb_sub8x8_extend(
+        xd, mi_row_ori, mi_col_ori,
+        mi_row_pred, mi_col_pred, bsize_pred, block);
 #else
-  vp9_dec_build_inter_predictors_sb(xd, mi_row_ori, mi_col_ori, top_bsize);
+
+  if (!b_sub8x8)
+    vp9_dec_build_inter_predictors_sb(xd, mi_row_pred, mi_col_pred, bsize_pred);
+  else
+    vp9_dec_build_inter_predictors_sb_sub8x8(xd, mi_row_pred, mi_col_pred,
+                                             bsize_pred, block);
+
 #endif  // CONFIG_WEDGE_PARTITION
 }
 
-static void dec_predict_b_sub8x8_extend(VP9_COMMON *const cm,
-                                        MACROBLOCKD *const xd,
-                                        const TileInfo *const tile,
-                                        int mi_row, int mi_col,
-                                        int mi_row_ori, int mi_col_ori,
-                                        BLOCK_SIZE top_bsize,
-                                        PARTITION_TYPE partition) {
-  MB_MODE_INFO *mbmi = set_offsets_extend(cm, xd, tile, top_bsize,
-                                          mi_row, mi_col,
-                                          mi_row_ori, mi_col_ori);
-  set_ref(cm, xd, 0, mi_row_ori, mi_col_ori);
-  if (has_second_ref(&xd->mi[0].mbmi))
-    set_ref(cm, xd, 1, mi_row_ori, mi_col_ori);
-  mbmi->tx_size = b_width_log2_lookup[top_bsize];
-  vp9_dec_build_inter_predictors_sby_sub8x8_extend(xd, mi_row, mi_col,
-                                                   mi_row_ori, mi_col_ori,
-                                                   top_bsize, partition);
-  vp9_dec_build_inter_predictors_sbuv_sub8x8_extend(xd,
-#if CONFIG_WEDGE_PARTITION
-                                                    mi_row, mi_col,
-#endif
-                                                    mi_row_ori, mi_col_ori,
-                                                    top_bsize);
+static void dec_extend_dir(VP9_COMMON *const cm, MACROBLOCKD *const xd,
+                           const TileInfo *const tile, int block,
+                           BLOCK_SIZE bsize, BLOCK_SIZE top_bsize,
+                           int mi_row, int mi_col,
+                           int mi_row_top, int mi_col_top,
+                           uint8_t * dst_buf[3], int dst_stride[3], int dir) {
+  // dir, the side of the block to extend: 0-lower, 1-upper, 2-left, 3-right
+  //      4-lowerleft, 5-upperleft, 6-lowerright, 7-upperright
+  const int mi_width = num_8x8_blocks_wide_lookup[bsize];
+  const int mi_height = num_8x8_blocks_high_lookup[bsize];
+  int xss = xd->plane[1].subsampling_x;
+  int yss = xd->plane[1].subsampling_y;
+  int b_sub8x8 = (bsize < BLOCK_8X8) ? 1 : 0;
+  BLOCK_SIZE extend_bsize;
+  int unit, mi_row_pred, mi_col_pred;
+  // Lower/upper: predict one row of units (8x8 or 16x8) along the width.
+  if (dir == 0 || dir == 1) {
+    extend_bsize = (mi_width == 1 || bsize < BLOCK_8X8 || xss < yss) ?
+                    BLOCK_8X8 : BLOCK_16X8;
+    unit = num_8x8_blocks_wide_lookup[extend_bsize];
+    mi_row_pred = mi_row + ((dir == 0) ? mi_height : -1);
+    mi_col_pred = mi_col;
+    // First unit, starting at the block's own column.
+    dec_predict_b_extend(cm, xd, tile, block, mi_row, mi_col,
+                         mi_row_pred, mi_col_pred,
+                         mi_row_top, mi_col_top,
+                         dst_buf, dst_stride,
+                         top_bsize, extend_bsize, b_sub8x8, 1);
+    // Remaining units, stepping right, when the block is wider than one unit.
+    if (mi_width > unit) {
+      int i;
+      assert(!b_sub8x8);
+      for (i = 0; i < mi_width/unit - 1; i++) {
+        mi_col_pred += unit;
+        dec_predict_b_extend(cm, xd, tile, block, mi_row, mi_col,
+                             mi_row_pred, mi_col_pred,
+                             mi_row_top, mi_col_top,
+                             dst_buf, dst_stride,
+                             top_bsize, extend_bsize, b_sub8x8, 1);
+      }
+    }
+  } else if (dir == 2 || dir == 3) {  // left/right: one column of units
+    extend_bsize = (mi_height == 1 || bsize < BLOCK_8X8 || yss < xss) ?
+                    BLOCK_8X8 : BLOCK_8X16;
+    unit = num_8x8_blocks_high_lookup[extend_bsize];
+    mi_row_pred = mi_row;
+    mi_col_pred = mi_col + ((dir == 3) ? mi_width : -1);
+    // First unit, starting at the block's own row.
+    dec_predict_b_extend(cm, xd, tile, block, mi_row, mi_col,
+                         mi_row_pred, mi_col_pred,
+                         mi_row_top, mi_col_top,
+                         dst_buf, dst_stride,
+                         top_bsize, extend_bsize, b_sub8x8, 1);
+    // Remaining units, stepping down, when the block is taller than one unit.
+    if (mi_height > unit) {
+      int i;
+      for (i = 0; i < mi_height/unit - 1; i++) {
+        mi_row_pred += unit;
+        dec_predict_b_extend(cm, xd, tile, block, mi_row, mi_col,
+                             mi_row_pred, mi_col_pred,
+                             mi_row_top, mi_col_top,
+                             dst_buf, dst_stride,
+                             top_bsize, extend_bsize, b_sub8x8, 1);
+      }
+    }
+  } else {  // corners (dir 4-7): a single 8x8 unit diagonal to the block
+    extend_bsize = BLOCK_8X8;
+    mi_row_pred = mi_row + ((dir == 4 || dir == 6) ? mi_height : -1);
+    mi_col_pred = mi_col + ((dir == 6 || dir == 7) ? mi_width : -1);
+    dec_predict_b_extend(cm, xd, tile, block, mi_row, mi_col,
+                         mi_row_pred, mi_col_pred,
+                         mi_row_top, mi_col_top,
+                         dst_buf, dst_stride,
+                         top_bsize, extend_bsize, b_sub8x8, 1);
+  }
+}
+
+static void dec_extend_all(VP9_COMMON *const cm, MACROBLOCKD *const xd,
+                           const TileInfo *const tile, int block,
+                           BLOCK_SIZE bsize, BLOCK_SIZE top_bsize,
+                           int mi_row, int mi_col,
+                           int mi_row_top, int mi_col_top,
+                           uint8_t * dst_buf[3], int dst_stride[3]) {
+  dec_extend_dir(cm, xd, tile, block, bsize, top_bsize, mi_row, mi_col,
+                 mi_row_top, mi_col_top, dst_buf, dst_stride, 0);  // lower
+  dec_extend_dir(cm, xd, tile, block, bsize, top_bsize, mi_row, mi_col,
+                 mi_row_top, mi_col_top, dst_buf, dst_stride, 1);  // upper
+  dec_extend_dir(cm, xd, tile, block, bsize, top_bsize, mi_row, mi_col,
+                 mi_row_top, mi_col_top, dst_buf, dst_stride, 2);  // left
+  dec_extend_dir(cm, xd, tile, block, bsize, top_bsize, mi_row, mi_col,
+                 mi_row_top, mi_col_top, dst_buf, dst_stride, 3);  // right
+  dec_extend_dir(cm, xd, tile, block, bsize, top_bsize, mi_row, mi_col,
+                 mi_row_top, mi_col_top, dst_buf, dst_stride, 4);  // lowerleft
+  dec_extend_dir(cm, xd, tile, block, bsize, top_bsize, mi_row, mi_col,
+                 mi_row_top, mi_col_top, dst_buf, dst_stride, 5);  // upperleft
+  dec_extend_dir(cm, xd, tile, block, bsize, top_bsize, mi_row, mi_col,
+                 mi_row_top, mi_col_top, dst_buf, dst_stride, 6);  // lowerright
+  dec_extend_dir(cm, xd, tile, block, bsize, top_bsize, mi_row, mi_col,
+                 mi_row_top, mi_col_top, dst_buf, dst_stride, 7);  // upperright
 }
 
 static void dec_predict_sb_complex(VP9_COMMON *const cm, MACROBLOCKD *const xd,
                                    const TileInfo *const tile,
                                    int mi_row, int mi_col,
-                                   int mi_row_ori, int mi_col_ori,
+                                   int mi_row_top, int mi_col_top,
                                    BLOCK_SIZE bsize, BLOCK_SIZE top_bsize,
                                    uint8_t *dst_buf[3], int dst_stride[3]) {
   const int bsl = b_width_log2_lookup[bsize], hbs = (1 << bsl) / 4;
@@ -894,29 +1029,48 @@ static void dec_predict_sb_complex(VP9_COMMON *const cm, MACROBLOCKD *const xd,
   MB_MODE_INFO *mbmi;
 #endif
   int i, offset = mi_row * cm->mi_stride + mi_col;
+#if CONFIG_EXT_PARTITION
+  BLOCK_SIZE bsize2 = get_subsize(bsize, PARTITION_SPLIT);
+#endif
+  uint8_t *dst_buf1[3], *dst_buf2[3], *dst_buf3[3];
 
   DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp_buf1,
-                        MAX_MB_PLANE * MAXTXLEN * MAXTXLEN);
+                        MAX_MB_PLANE * MAXTXLEN * MAXTXLEN * sizeof(uint16_t));
   DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp_buf2,
-                        MAX_MB_PLANE * MAXTXLEN * MAXTXLEN);
+                        MAX_MB_PLANE * MAXTXLEN * MAXTXLEN * sizeof(uint16_t));
   DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp_buf3,
-                        MAX_MB_PLANE * MAXTXLEN * MAXTXLEN);
-  uint8_t *dst_buf1[3] = {
-    tmp_buf1,
-    tmp_buf1 + MAXTXLEN * MAXTXLEN,
-    tmp_buf1 + 2 * MAXTXLEN * MAXTXLEN};
-  uint8_t *dst_buf2[3] = {
-    tmp_buf2,
-    tmp_buf2 + MAXTXLEN * MAXTXLEN,
-    tmp_buf2 + 2 * MAXTXLEN * MAXTXLEN};
-  uint8_t *dst_buf3[3] = {
-    tmp_buf3,
-    tmp_buf3 + MAXTXLEN * MAXTXLEN,
-    tmp_buf3 + 2 * MAXTXLEN * MAXTXLEN};
+                        MAX_MB_PLANE * MAXTXLEN * MAXTXLEN * sizeof(uint16_t));
   int dst_stride1[3] = {MAXTXLEN, MAXTXLEN, MAXTXLEN};
   int dst_stride2[3] = {MAXTXLEN, MAXTXLEN, MAXTXLEN};
   int dst_stride3[3] = {MAXTXLEN, MAXTXLEN, MAXTXLEN};
 
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    int len = sizeof(uint16_t);
+    dst_buf1[0] = CONVERT_TO_BYTEPTR(tmp_buf1);
+    dst_buf1[1] = CONVERT_TO_BYTEPTR(tmp_buf1 + MAXTXLEN * MAXTXLEN * len);
+    dst_buf1[2] = CONVERT_TO_BYTEPTR(tmp_buf1 + 2 * MAXTXLEN * MAXTXLEN * len);
+    dst_buf2[0] = CONVERT_TO_BYTEPTR(tmp_buf2);
+    dst_buf2[1] = CONVERT_TO_BYTEPTR(tmp_buf2 + MAXTXLEN * MAXTXLEN * len);
+    dst_buf2[2] = CONVERT_TO_BYTEPTR(tmp_buf2 + 2 * MAXTXLEN * MAXTXLEN * len);
+    dst_buf3[0] = CONVERT_TO_BYTEPTR(tmp_buf3);
+    dst_buf3[1] = CONVERT_TO_BYTEPTR(tmp_buf3 + MAXTXLEN * MAXTXLEN * len);
+    dst_buf3[2] = CONVERT_TO_BYTEPTR(tmp_buf3 + 2 * MAXTXLEN * MAXTXLEN * len);
+  } else {
+#endif
+    dst_buf1[0] = tmp_buf1;
+    dst_buf1[1] = tmp_buf1 + MAXTXLEN * MAXTXLEN;
+    dst_buf1[2] = tmp_buf1 + 2 * MAXTXLEN * MAXTXLEN;
+    dst_buf2[0] = tmp_buf2;
+    dst_buf2[1] = tmp_buf2 + MAXTXLEN * MAXTXLEN;
+    dst_buf2[2] = tmp_buf2 + 2 * MAXTXLEN * MAXTXLEN;
+    dst_buf3[0] = tmp_buf3;
+    dst_buf3[1] = tmp_buf3 + MAXTXLEN * MAXTXLEN;
+    dst_buf3[2] = tmp_buf3 + 2 * MAXTXLEN * MAXTXLEN;
+#if CONFIG_VP9_HIGHBITDEPTH
+  }
+#endif
+
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
     return;
 
@@ -939,91 +1093,197 @@ static void dec_predict_sb_complex(VP9_COMMON *const cm, MACROBLOCKD *const xd,
   switch (partition) {
     case PARTITION_NONE:
       assert(bsize < top_bsize);
-      dec_predict_b_extend(cm, xd, tile, mi_row, mi_col, mi_row_ori, mi_col_ori,
-                           top_bsize);
+      dec_predict_b_extend(cm, xd, tile, 0, mi_row, mi_col, mi_row, mi_col,
+                           mi_row_top, mi_col_top, dst_buf, dst_stride,
+                           top_bsize, bsize, 0, 0);
+      dec_extend_all(cm, xd, tile, 0, bsize, top_bsize, mi_row, mi_col,
+                     mi_row_top, mi_col_top, dst_buf, dst_stride);
       break;
     case PARTITION_HORZ:
-      if (bsize > BLOCK_8X8) {
-        dec_predict_b_extend(cm, xd, tile, mi_row, mi_col, mi_row_ori,
-                             mi_col_ori, top_bsize);
+      if (bsize == BLOCK_8X8) {
+        // For sub8x8, predict in 8x8 unit
+        // First half
+        dec_predict_b_extend(cm, xd, tile, 0, mi_row, mi_col, mi_row, mi_col,
+                             mi_row_top, mi_col_top, dst_buf, dst_stride,
+                             top_bsize, BLOCK_8X8, 1, 0);
+        if (bsize < top_bsize)
+          dec_extend_all(cm, xd, tile, 0, subsize, top_bsize, mi_row, mi_col,
+                         mi_row_top, mi_col_top, dst_buf, dst_stride);
+
+        // Second half
+        dec_predict_b_extend(cm, xd, tile, 2, mi_row, mi_col, mi_row, mi_col,
+                             mi_row_top, mi_col_top, dst_buf1, dst_stride1,
+                             top_bsize, BLOCK_8X8, 1, 1);
+        if (bsize < top_bsize)
+          dec_extend_all(cm, xd, tile, 2, subsize, top_bsize, mi_row, mi_col,
+                         mi_row_top, mi_col_top, dst_buf1, dst_stride1);
+
+        // weighted average to smooth the boundary
+        xd->plane[0].dst.buf = dst_buf[0];
+        xd->plane[0].dst.stride = dst_stride[0];
+        vp9_build_masked_inter_predictor_complex(xd,
+                                                 dst_buf[0], dst_stride[0],
+                                                 dst_buf1[0], dst_stride1[0],
+                                                 &xd->plane[0],
+                                                 mi_row, mi_col,
+                                                 mi_row_top, mi_col_top,
+                                                 bsize, top_bsize,
+                                                 PARTITION_HORZ, 0);
       } else {
-        dec_predict_b_sub8x8_extend(cm, xd, tile, mi_row, mi_col,
-                                    mi_row_ori, mi_col_ori,
-                                    top_bsize, partition);
-      }
-      if (mi_row + hbs < cm->mi_rows && bsize > BLOCK_8X8) {
-        for (i = 0; i < MAX_MB_PLANE; i++) {
-          xd->plane[i].dst.buf = dst_buf1[i];
-          xd->plane[i].dst.stride = dst_stride1[i];
-        }
-        dec_predict_b_extend(cm, xd, tile, mi_row + hbs, mi_col,
-                             mi_row_ori, mi_col_ori, top_bsize);
-        for (i = 0; i < MAX_MB_PLANE; i++) {
-          xd->plane[i].dst.buf = dst_buf[i];
-          xd->plane[i].dst.stride = dst_stride[i];
-          vp9_build_masked_inter_predictor_complex(xd,
-                                                   dst_buf[i], dst_stride[i],
-                                                   dst_buf1[i], dst_stride1[i],
-                                                   &xd->plane[i],
-                                                   mi_row, mi_col,
-                                                   mi_row_ori, mi_col_ori,
-                                                   bsize, top_bsize,
-                                                   PARTITION_HORZ);
+        // First half
+        dec_predict_b_extend(cm, xd, tile, 0, mi_row, mi_col, mi_row, mi_col,
+                             mi_row_top, mi_col_top, dst_buf, dst_stride,
+                             top_bsize, subsize, 0, 0);
+        if (bsize < top_bsize)
+          dec_extend_all(cm, xd, tile, 0, subsize, top_bsize, mi_row, mi_col,
+                         mi_row_top, mi_col_top, dst_buf, dst_stride);
+        else
+          dec_extend_dir(cm, xd, tile, 0, subsize, top_bsize, mi_row, mi_col,
+                         mi_row_top, mi_col_top, dst_buf, dst_stride, 0);
+
+        if (mi_row + hbs < cm->mi_rows) {
+          // Second half
+          dec_predict_b_extend(cm, xd, tile, 0, mi_row + hbs, mi_col,
+                               mi_row + hbs, mi_col,
+                               mi_row_top, mi_col_top,
+                               dst_buf1, dst_stride1,
+                               top_bsize, subsize, 0, 0);
+          if (bsize < top_bsize)
+            dec_extend_all(cm, xd, tile, 0, subsize, top_bsize,
+                           mi_row + hbs, mi_col,
+                           mi_row_top, mi_col_top,
+                           dst_buf1, dst_stride1);
+          else
+            dec_extend_dir(cm, xd, tile, 0, subsize, top_bsize,
+                           mi_row + hbs, mi_col,
+                           mi_row_top, mi_col_top,
+                           dst_buf1, dst_stride1, 1);
+
+          // weighted average to smooth the boundary
+          for (i = 0; i < MAX_MB_PLANE; i++) {
+            xd->plane[i].dst.buf = dst_buf[i];
+            xd->plane[i].dst.stride = dst_stride[i];
+            vp9_build_masked_inter_predictor_complex(
+                xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i],
+                &xd->plane[i], mi_row, mi_col, mi_row_top, mi_col_top,
+                bsize, top_bsize, PARTITION_HORZ, i);
+          }
         }
       }
       break;
     case PARTITION_VERT:
-      if (bsize > BLOCK_8X8) {
-        dec_predict_b_extend(cm, xd, tile, mi_row, mi_col, mi_row_ori,
-                             mi_col_ori, top_bsize);
+      if (bsize == BLOCK_8X8) {
+        // First half
+        dec_predict_b_extend(cm, xd, tile, 0, mi_row, mi_col, mi_row, mi_col,
+                             mi_row_top, mi_col_top, dst_buf, dst_stride,
+                             top_bsize, BLOCK_8X8, 1, 0);
+        if (bsize < top_bsize)
+          dec_extend_all(cm, xd, tile, 0, subsize, top_bsize, mi_row, mi_col,
+                         mi_row_top, mi_col_top, dst_buf, dst_stride);
+
+        // Second half
+        dec_predict_b_extend(cm, xd, tile, 1, mi_row, mi_col, mi_row, mi_col,
+                             mi_row_top, mi_col_top, dst_buf1, dst_stride1,
+                             top_bsize, BLOCK_8X8, 1, 1);
+        if (bsize < top_bsize)
+          dec_extend_all(cm, xd, tile, 1, subsize, top_bsize, mi_row, mi_col,
+                         mi_row_top, mi_col_top, dst_buf1, dst_stride1);
+
+        // Smooth
+        xd->plane[0].dst.buf = dst_buf[0];
+        xd->plane[0].dst.stride = dst_stride[0];
+        vp9_build_masked_inter_predictor_complex(xd,
+                                                 dst_buf[0], dst_stride[0],
+                                                 dst_buf1[0], dst_stride1[0],
+                                                 &xd->plane[0],
+                                                 mi_row, mi_col,
+                                                 mi_row_top, mi_col_top,
+                                                 bsize, top_bsize,
+                                                 PARTITION_VERT, 0);
       } else {
-        dec_predict_b_sub8x8_extend(cm, xd, tile, mi_row, mi_col,
-                                    mi_row_ori, mi_col_ori,
-                                    top_bsize, partition);
-      }
-      if (mi_col + hbs < cm->mi_cols && bsize > BLOCK_8X8) {
-        for (i = 0; i < MAX_MB_PLANE; i++) {
-          xd->plane[i].dst.buf = dst_buf1[i];
-          xd->plane[i].dst.stride = dst_stride1[i];
-        }
-        dec_predict_b_extend(cm, xd, tile, mi_row, mi_col + hbs, mi_row_ori,
-                             mi_col_ori, top_bsize);
-        for (i = 0; i < MAX_MB_PLANE; i++) {
-          xd->plane[i].dst.buf = dst_buf[i];
-          xd->plane[i].dst.stride = dst_stride[i];
-          vp9_build_masked_inter_predictor_complex(xd,
-                                                   dst_buf[i], dst_stride[i],
-                                                   dst_buf1[i], dst_stride1[i],
-                                                   &xd->plane[i],
-                                                   mi_row, mi_col,
-                                                   mi_row_ori, mi_col_ori,
-                                                   bsize, top_bsize,
-                                                   PARTITION_VERT);
+        // First half
+        dec_predict_b_extend(cm, xd, tile, 0, mi_row, mi_col, mi_row, mi_col,
+                             mi_row_top, mi_col_top, dst_buf, dst_stride,
+                             top_bsize, subsize, 0, 0);
+        if (bsize < top_bsize)
+          dec_extend_all(cm, xd, tile, 0, subsize, top_bsize, mi_row, mi_col,
+                         mi_row_top, mi_col_top, dst_buf, dst_stride);
+        else
+          dec_extend_dir(cm, xd, tile, 0, subsize, top_bsize, mi_row, mi_col,
+                         mi_row_top, mi_col_top, dst_buf, dst_stride, 3);
+
+        // Second half
+        if (mi_col + hbs < cm->mi_cols) {
+          dec_predict_b_extend(cm, xd, tile, 0, mi_row, mi_col + hbs,
+                               mi_row, mi_col + hbs,
+                               mi_row_top, mi_col_top,
+                               dst_buf1, dst_stride1,
+                               top_bsize, subsize, 0, 0);
+          if (bsize < top_bsize)
+            dec_extend_all(cm, xd, tile, 0, subsize, top_bsize,
+                           mi_row, mi_col + hbs, mi_row_top, mi_col_top,
+                           dst_buf1, dst_stride1);
+          else
+            dec_extend_dir(cm, xd, tile, 0, subsize, top_bsize,
+                           mi_row, mi_col + hbs, mi_row_top, mi_col_top,
+                           dst_buf1, dst_stride1, 2);
+
+          // Smooth
+          for (i = 0; i < MAX_MB_PLANE; i++) {
+            xd->plane[i].dst.buf = dst_buf[i];
+            xd->plane[i].dst.stride = dst_stride[i];
+            vp9_build_masked_inter_predictor_complex(
+                xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i],
+                &xd->plane[i], mi_row, mi_col, mi_row_top, mi_col_top,
+                bsize, top_bsize, PARTITION_VERT, i);
+          }
         }
       }
       break;
     case PARTITION_SPLIT:
       if (bsize == BLOCK_8X8) {
-        dec_predict_b_sub8x8_extend(cm, xd, tile, mi_row, mi_col,
-                                    mi_row_ori, mi_col_ori,
-                                    top_bsize, partition);
+        dec_predict_b_extend(cm, xd, tile, 0, mi_row, mi_col, mi_row, mi_col,
+                             mi_row_top, mi_col_top, dst_buf, dst_stride,
+                             top_bsize, BLOCK_8X8, 1, 0);
+        dec_predict_b_extend(cm, xd, tile, 1, mi_row, mi_col, mi_row, mi_col,
+                             mi_row_top, mi_col_top, dst_buf1, dst_stride1,
+                             top_bsize, BLOCK_8X8, 1, 1);
+        dec_predict_b_extend(cm, xd, tile, 2, mi_row, mi_col, mi_row, mi_col,
+                             mi_row_top, mi_col_top, dst_buf2, dst_stride2,
+                             top_bsize, BLOCK_8X8, 1, 1);
+        dec_predict_b_extend(cm, xd, tile, 3, mi_row, mi_col, mi_row, mi_col,
+                             mi_row_top, mi_col_top, dst_buf3, dst_stride3,
+                             top_bsize, BLOCK_8X8, 1, 1);
+        if (bsize < top_bsize) {
+          dec_extend_all(cm, xd, tile, 0, subsize, top_bsize, mi_row, mi_col,
+                         mi_row_top, mi_col_top, dst_buf, dst_stride);
+          dec_extend_all(cm, xd, tile, 1, subsize, top_bsize, mi_row, mi_col,
+                         mi_row_top, mi_col_top, dst_buf1, dst_stride1);
+          dec_extend_all(cm, xd, tile, 2, subsize, top_bsize, mi_row, mi_col,
+                         mi_row_top, mi_col_top, dst_buf2, dst_stride2);
+          dec_extend_all(cm, xd, tile, 3, subsize, top_bsize, mi_row, mi_col,
+                         mi_row_top, mi_col_top, dst_buf3, dst_stride3);
+        }
       } else {
         dec_predict_sb_complex(cm, xd, tile, mi_row, mi_col,
-                               mi_row_ori, mi_col_ori, subsize, top_bsize,
+                               mi_row_top, mi_col_top, subsize, top_bsize,
                                dst_buf, dst_stride);
         if (mi_row < cm->mi_rows && mi_col + hbs < cm->mi_cols)
           dec_predict_sb_complex(cm, xd, tile, mi_row, mi_col + hbs,
-                                 mi_row_ori, mi_col_ori, subsize, top_bsize,
+                                 mi_row_top, mi_col_top, subsize, top_bsize,
                                  dst_buf1, dst_stride1);
         if (mi_row + hbs < cm->mi_rows && mi_col < cm->mi_cols)
           dec_predict_sb_complex(cm, xd, tile, mi_row + hbs, mi_col,
-                                 mi_row_ori, mi_col_ori, subsize, top_bsize,
+                                 mi_row_top, mi_col_top, subsize, top_bsize,
                                  dst_buf2, dst_stride2);
         if (mi_row + hbs < cm->mi_rows && mi_col + hbs < cm->mi_cols)
           dec_predict_sb_complex(cm, xd, tile, mi_row + hbs, mi_col + hbs,
-                                 mi_row_ori, mi_col_ori, subsize, top_bsize,
+                                 mi_row_top, mi_col_top, subsize, top_bsize,
                                  dst_buf3, dst_stride3);
+      }
         for (i = 0; i < MAX_MB_PLANE; i++) {
+          if (bsize == BLOCK_8X8 && i != 0)
+            continue;  // Skip <4x4 chroma smoothing
           if (mi_row < cm->mi_rows && mi_col + hbs < cm->mi_cols) {
             vp9_build_masked_inter_predictor_complex(xd,
                                                      dst_buf[i], dst_stride[i],
@@ -1031,9 +1291,9 @@ static void dec_predict_sb_complex(VP9_COMMON *const cm, MACROBLOCKD *const xd,
                                                      dst_stride1[i],
                                                      &xd->plane[i],
                                                      mi_row, mi_col,
-                                                     mi_row_ori, mi_col_ori,
+                                                     mi_row_top, mi_col_top,
                                                      bsize, top_bsize,
-                                                     PARTITION_VERT);
+                                                     PARTITION_VERT, i);
             if (mi_row + hbs < cm->mi_rows) {
               vp9_build_masked_inter_predictor_complex(xd,
                                                        dst_buf2[i],
@@ -1042,9 +1302,9 @@ static void dec_predict_sb_complex(VP9_COMMON *const cm, MACROBLOCKD *const xd,
                                                        dst_stride3[i],
                                                        &xd->plane[i],
                                                        mi_row, mi_col,
-                                                       mi_row_ori, mi_col_ori,
+                                                       mi_row_top, mi_col_top,
                                                        bsize, top_bsize,
-                                                       PARTITION_VERT);
+                                                       PARTITION_VERT, i);
               vp9_build_masked_inter_predictor_complex(xd,
                                                        dst_buf[i],
                                                        dst_stride[i],
@@ -1052,9 +1312,9 @@ static void dec_predict_sb_complex(VP9_COMMON *const cm, MACROBLOCKD *const xd,
                                                        dst_stride2[i],
                                                        &xd->plane[i],
                                                        mi_row, mi_col,
-                                                       mi_row_ori, mi_col_ori,
+                                                       mi_row_top, mi_col_top,
                                                        bsize, top_bsize,
-                                                       PARTITION_HORZ);
+                                                       PARTITION_HORZ, i);
             }
           } else if (mi_row + hbs < cm->mi_rows && mi_col < cm->mi_cols) {
             vp9_build_masked_inter_predictor_complex(xd,
@@ -1064,29 +1324,38 @@ static void dec_predict_sb_complex(VP9_COMMON *const cm, MACROBLOCKD *const xd,
                                                      dst_stride2[i],
                                                      &xd->plane[i],
                                                      mi_row, mi_col,
-                                                     mi_row_ori, mi_col_ori,
+                                                     mi_row_top, mi_col_top,
                                                      bsize, top_bsize,
-                                                     PARTITION_HORZ);
+                                                     PARTITION_HORZ, i);
           }
         }
-      }
       break;
 #if CONFIG_EXT_PARTITION
     case PARTITION_HORZ_A:
-      dec_predict_b_extend(cm, xd, tile, mi_row, mi_col, mi_row_ori,
-                           mi_col_ori, top_bsize);
-      for (i = 0; i < MAX_MB_PLANE; i++) {
-        xd->plane[i].dst.buf = dst_buf1[i];
-        xd->plane[i].dst.stride = dst_stride1[i];
-      }
-      dec_predict_b_extend(cm, xd, tile, mi_row, mi_col + hbs,
-                           mi_row_ori, mi_col_ori, top_bsize);
-      for (i = 0; i < MAX_MB_PLANE; i++) {
-        xd->plane[i].dst.buf = dst_buf2[i];
-        xd->plane[i].dst.stride = dst_stride2[i];
-      }
-      dec_predict_b_extend(cm, xd, tile, mi_row + hbs, mi_col,
-                           mi_row_ori, mi_col_ori, top_bsize);
+      dec_predict_b_extend(cm, xd, tile, 0, mi_row, mi_col, mi_row, mi_col,
+                           mi_row_top, mi_col_top, dst_buf, dst_stride,
+                           top_bsize, bsize2, 0, 0);
+      dec_extend_all(cm, xd, tile, 0, bsize2, top_bsize, mi_row, mi_col,
+                     mi_row_top, mi_col_top, dst_buf, dst_stride);
+
+      dec_predict_b_extend(cm, xd, tile, 0, mi_row, mi_col + hbs, mi_row,
+                           mi_col + hbs, mi_row_top, mi_col_top,
+                           dst_buf1, dst_stride1, top_bsize, bsize2, 0, 0);
+      dec_extend_all(cm, xd, tile, 0, bsize2, top_bsize, mi_row, mi_col + hbs,
+                     mi_row_top, mi_col_top, dst_buf1, dst_stride1);
+
+      dec_predict_b_extend(cm, xd, tile, 0, mi_row + hbs, mi_col,
+                           mi_row + hbs, mi_col, mi_row_top, mi_col_top,
+                           dst_buf2, dst_stride2, top_bsize, subsize, 0, 0);
+      if (bsize < top_bsize)
+        dec_extend_all(cm, xd, tile, 0, subsize, top_bsize,
+                       mi_row + hbs, mi_col, mi_row_top, mi_col_top,
+                       dst_buf2, dst_stride2);
+      else
+        dec_extend_dir(cm, xd, tile, 0, subsize, top_bsize,
+                       mi_row + hbs, mi_col, mi_row_top, mi_col_top,
+                       dst_buf2, dst_stride2, 1);
+
       for (i = 0; i < MAX_MB_PLANE; i++) {
         xd->plane[i].dst.buf = dst_buf[i];
         xd->plane[i].dst.stride = dst_stride[i];
@@ -1095,9 +1364,9 @@ static void dec_predict_sb_complex(VP9_COMMON *const cm, MACROBLOCKD *const xd,
                                                  dst_buf1[i], dst_stride1[i],
                                                  &xd->plane[i],
                                                  mi_row, mi_col,
-                                                 mi_row_ori, mi_col_ori,
+                                                 mi_row_top, mi_col_top,
                                                  bsize, top_bsize,
-                                                 PARTITION_VERT);
+                                                 PARTITION_VERT, i);
       }
       for (i = 0; i < MAX_MB_PLANE; i++) {
         vp9_build_masked_inter_predictor_complex(xd,
@@ -1105,350 +1374,38 @@ static void dec_predict_sb_complex(VP9_COMMON *const cm, MACROBLOCKD *const xd,
                                                  dst_buf2[i], dst_stride2[i],
                                                  &xd->plane[i],
                                                  mi_row, mi_col,
-                                                 mi_row_ori, mi_col_ori,
+                                                 mi_row_top, mi_col_top,
                                                  bsize, top_bsize,
-                                                 PARTITION_HORZ);
+                                                 PARTITION_HORZ, i);
       }
       break;
     case PARTITION_VERT_A:
-      dec_predict_b_extend(cm, xd, tile, mi_row, mi_col, mi_row_ori,
-                           mi_col_ori, top_bsize);
-      for (i = 0; i < MAX_MB_PLANE; i++) {
-        xd->plane[i].dst.buf = dst_buf1[i];
-        xd->plane[i].dst.stride = dst_stride1[i];
-      }
-      dec_predict_b_extend(cm, xd, tile, mi_row + hbs, mi_col,
-                           mi_row_ori, mi_col_ori, top_bsize);
-      for (i = 0; i < MAX_MB_PLANE; i++) {
-        xd->plane[i].dst.buf = dst_buf2[i];
-        xd->plane[i].dst.stride = dst_stride2[i];
-      }
-      dec_predict_b_extend(cm, xd, tile, mi_row, mi_col + hbs,
-                           mi_row_ori, mi_col_ori, top_bsize);
-      for (i = 0; i < MAX_MB_PLANE; i++) {
-        xd->plane[i].dst.buf = dst_buf[i];
-        xd->plane[i].dst.stride = dst_stride[i];
-        vp9_build_masked_inter_predictor_complex(xd,
-                                                 dst_buf[i], dst_stride[i],
-                                                 dst_buf1[i], dst_stride1[i],
-                                                 &xd->plane[i],
-                                                 mi_row, mi_col,
-                                                 mi_row_ori, mi_col_ori,
-                                                 bsize, top_bsize,
-                                                 PARTITION_HORZ);
-      }
-      for (i = 0; i < MAX_MB_PLANE; i++) {
-        vp9_build_masked_inter_predictor_complex(xd,
-                                                 dst_buf[i], dst_stride[i],
-                                                 dst_buf2[i], dst_stride2[i],
-                                                 &xd->plane[i],
-                                                 mi_row, mi_col,
-                                                 mi_row_ori, mi_col_ori,
-                                                 bsize, top_bsize,
-                                                 PARTITION_VERT);
-      }
-      break;
-    case PARTITION_HORZ_B:
-      dec_predict_b_extend(cm, xd, tile, mi_row, mi_col, mi_row_ori,
-                           mi_col_ori, top_bsize);
-      for (i = 0; i < MAX_MB_PLANE; i++) {
-        xd->plane[i].dst.buf = dst_buf1[i];
-        xd->plane[i].dst.stride = dst_stride1[i];
-      }
-      dec_predict_b_extend(cm, xd, tile, mi_row + hbs, mi_col,
-                           mi_row_ori, mi_col_ori, top_bsize);
-      for (i = 0; i < MAX_MB_PLANE; i++) {
-        xd->plane[i].dst.buf = dst_buf2[i];
-        xd->plane[i].dst.stride = dst_stride2[i];
-      }
-      dec_predict_b_extend(cm, xd, tile, mi_row + hbs, mi_col + hbs,
-                           mi_row_ori, mi_col_ori, top_bsize);
-      for (i = 0; i < MAX_MB_PLANE; i++) {
-        xd->plane[i].dst.buf = dst_buf1[i];
-        xd->plane[i].dst.stride = dst_stride1[i];
-        vp9_build_masked_inter_predictor_complex(xd,
-                                                 dst_buf1[i], dst_stride1[i],
-                                                 dst_buf2[i], dst_stride2[i],
-                                                 &xd->plane[i],
-                                                 mi_row, mi_col,
-                                                 mi_row_ori, mi_col_ori,
-                                                 bsize, top_bsize,
-                                                 PARTITION_VERT);
-      }
-      for (i = 0; i < MAX_MB_PLANE; i++) {
-        xd->plane[i].dst.buf = dst_buf[i];
-        xd->plane[i].dst.stride = dst_stride[i];
-        vp9_build_masked_inter_predictor_complex(xd,
-                                                 dst_buf[i], dst_stride[i],
-                                                 dst_buf1[i], dst_stride1[i],
-                                                 &xd->plane[i],
-                                                 mi_row, mi_col,
-                                                 mi_row_ori, mi_col_ori,
-                                                 bsize, top_bsize,
-                                                 PARTITION_HORZ);
-      }
-      break;
-    case PARTITION_VERT_B:
-      dec_predict_b_extend(cm, xd, tile, mi_row, mi_col, mi_row_ori,
-                           mi_col_ori, top_bsize);
-      for (i = 0; i < MAX_MB_PLANE; i++) {
-        xd->plane[i].dst.buf = dst_buf1[i];
-        xd->plane[i].dst.stride = dst_stride1[i];
-      }
-      dec_predict_b_extend(cm, xd, tile, mi_row, mi_col + hbs,
-                           mi_row_ori, mi_col_ori, top_bsize);
-      for (i = 0; i < MAX_MB_PLANE; i++) {
-        xd->plane[i].dst.buf = dst_buf2[i];
-        xd->plane[i].dst.stride = dst_stride2[i];
-      }
-      dec_predict_b_extend(cm, xd, tile, mi_row + hbs, mi_col + hbs,
-                           mi_row_ori, mi_col_ori, top_bsize);
-      for (i = 0; i < MAX_MB_PLANE; i++) {
-        xd->plane[i].dst.buf = dst_buf1[i];
-        xd->plane[i].dst.stride = dst_stride1[i];
-        vp9_build_masked_inter_predictor_complex(xd,
-                                                 dst_buf1[i], dst_stride1[i],
-                                                 dst_buf2[i], dst_stride2[i],
-                                                 &xd->plane[i],
-                                                 mi_row, mi_col,
-                                                 mi_row_ori, mi_col_ori,
-                                                 bsize, top_bsize,
-                                                 PARTITION_HORZ);
-      }
-      for (i = 0; i < MAX_MB_PLANE; i++) {
-        xd->plane[i].dst.buf = dst_buf[i];
-        xd->plane[i].dst.stride = dst_stride[i];
-        vp9_build_masked_inter_predictor_complex(xd,
-                                                 dst_buf[i], dst_stride[i],
-                                                 dst_buf1[i], dst_stride1[i],
-                                                 &xd->plane[i],
-                                                 mi_row, mi_col,
-                                                 mi_row_ori, mi_col_ori,
-                                                 bsize, top_bsize,
-                                                 PARTITION_VERT);
-      }
-      break;
-#endif
-    default:
-      assert(0);
-  }
-}
-
-#if CONFIG_VP9_HIGHBITDEPTH
-static void dec_predict_sb_complex_highbd(
-    VP9_COMMON *const cm, MACROBLOCKD *const xd,
-    const TileInfo *const tile,
-    int mi_row, int mi_col,
-    int mi_row_ori, int mi_col_ori,
-    BLOCK_SIZE bsize, BLOCK_SIZE top_bsize,
-    uint8_t *dst_buf[3], int dst_stride[3]) {
-  const int bsl = b_width_log2_lookup[bsize], hbs = (1 << bsl) / 4;
-  PARTITION_TYPE partition;
-  BLOCK_SIZE subsize;
-#if !CONFIG_EXT_PARTITION
-  MB_MODE_INFO *mbmi;
-#endif
-  int i, offset = mi_row * cm->mi_stride + mi_col;
 
-  DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp_buf1,
-                        MAX_MB_PLANE * MAXTXLEN * MAXTXLEN * sizeof(uint16_t));
-  DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp_buf2,
-                        MAX_MB_PLANE * MAXTXLEN * MAXTXLEN * sizeof(uint16_t));
-  DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp_buf3,
-                        MAX_MB_PLANE * MAXTXLEN * MAXTXLEN * sizeof(uint16_t));
-  uint8_t *dst_buf1[3] = {
-    CONVERT_TO_BYTEPTR(tmp_buf1),
-    CONVERT_TO_BYTEPTR(tmp_buf1 + MAXTXLEN * MAXTXLEN * sizeof(uint16_t)),
-    CONVERT_TO_BYTEPTR(tmp_buf1 + 2 * MAXTXLEN * MAXTXLEN * sizeof(uint16_t))};
-  uint8_t *dst_buf2[3] = {
-    CONVERT_TO_BYTEPTR(tmp_buf2),
-    CONVERT_TO_BYTEPTR(tmp_buf2 + MAXTXLEN * MAXTXLEN * sizeof(uint16_t)),
-    CONVERT_TO_BYTEPTR(tmp_buf2 + 2 * MAXTXLEN * MAXTXLEN * sizeof(uint16_t))};
-  uint8_t *dst_buf3[3] = {
-    CONVERT_TO_BYTEPTR(tmp_buf3),
-    CONVERT_TO_BYTEPTR(tmp_buf3 + MAXTXLEN * MAXTXLEN * sizeof(uint16_t)),
-    CONVERT_TO_BYTEPTR(tmp_buf3 + 2 * MAXTXLEN * MAXTXLEN * sizeof(uint16_t))};
-  int dst_stride1[3] = {MAXTXLEN, MAXTXLEN, MAXTXLEN};
-  int dst_stride2[3] = {MAXTXLEN, MAXTXLEN, MAXTXLEN};
-  int dst_stride3[3] = {MAXTXLEN, MAXTXLEN, MAXTXLEN};
-
-  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
-    return;
-
-  xd->mi = cm->mi + offset;
-  xd->mi[0].src_mi = &xd->mi[0];
-#if CONFIG_EXT_PARTITION
-  partition = get_partition(cm->mi, cm->mi_stride, cm->mi_rows, cm->mi_cols,
-                            mi_row, mi_col, bsize);
-#else
-  mbmi = &xd->mi[0].mbmi;
-  partition = partition_lookup[bsl][mbmi->sb_type];
-#endif
-  subsize = get_subsize(bsize, partition);
-
-  for (i = 0; i < MAX_MB_PLANE; i++) {
-    xd->plane[i].dst.buf = dst_buf[i];
-    xd->plane[i].dst.stride = dst_stride[i];
-  }
+      dec_predict_b_extend(cm, xd, tile, 0, mi_row, mi_col, mi_row, mi_col,
+                           mi_row_top, mi_col_top, dst_buf, dst_stride,
+                           top_bsize, bsize2, 0, 0);
+      dec_extend_all(cm, xd, tile, 0, bsize2, top_bsize, mi_row, mi_col,
+                     mi_row_top, mi_col_top, dst_buf, dst_stride);
+
+      dec_predict_b_extend(cm, xd, tile, 0, mi_row + hbs, mi_col,
+                           mi_row + hbs, mi_col, mi_row_top, mi_col_top,
+                           dst_buf1, dst_stride1, top_bsize, bsize2, 0, 0);
+      dec_extend_all(cm, xd, tile, 0, bsize2, top_bsize, mi_row + hbs, mi_col,
+                     mi_row_top, mi_col_top, dst_buf1, dst_stride1);
+
+      dec_predict_b_extend(cm, xd, tile, 0, mi_row, mi_col + hbs,
+                           mi_row, mi_col + hbs, mi_row_top, mi_col_top,
+                           dst_buf2, dst_stride2,
+                           top_bsize, subsize, 0, 0);
+      if (bsize < top_bsize)
+        dec_extend_all(cm, xd, tile, 0, subsize, top_bsize,
+                       mi_row, mi_col + hbs, mi_row_top, mi_col_top,
+                       dst_buf2, dst_stride2);
+      else
+        dec_extend_dir(cm, xd, tile, 0, subsize, top_bsize,
+                       mi_row, mi_col + hbs, mi_row_top, mi_col_top,
+                       dst_buf2, dst_stride2, 2);
 
-  switch (partition) {
-    case PARTITION_NONE:
-      assert(bsize < top_bsize);
-      dec_predict_b_extend(cm, xd, tile, mi_row, mi_col, mi_row_ori, mi_col_ori,
-                           top_bsize);
-      break;
-    case PARTITION_HORZ:
-      if (bsize > BLOCK_8X8) {
-        dec_predict_b_extend(cm, xd, tile, mi_row, mi_col, mi_row_ori,
-                             mi_col_ori, top_bsize);
-      } else {
-        dec_predict_b_sub8x8_extend(cm, xd, tile, mi_row, mi_col,
-                                    mi_row_ori, mi_col_ori,
-                                    top_bsize, partition);
-      }
-      if (mi_row + hbs < cm->mi_rows && bsize > BLOCK_8X8) {
-        for (i = 0; i < MAX_MB_PLANE; i++) {
-          xd->plane[i].dst.buf = dst_buf1[i];
-          xd->plane[i].dst.stride = dst_stride1[i];
-        }
-        dec_predict_b_extend(cm, xd, tile, mi_row + hbs, mi_col,
-                             mi_row_ori, mi_col_ori, top_bsize);
-        for (i = 0; i < MAX_MB_PLANE; i++) {
-          xd->plane[i].dst.buf = dst_buf[i];
-          xd->plane[i].dst.stride = dst_stride[i];
-          vp9_build_masked_inter_predictor_complex(
-              xd,
-              dst_buf[i], dst_stride[i],
-              dst_buf1[i], dst_stride1[i],
-              &xd->plane[i],
-              mi_row, mi_col,
-              mi_row_ori, mi_col_ori,
-              bsize, top_bsize,
-              PARTITION_HORZ);
-        }
-      }
-      break;
-    case PARTITION_VERT:
-      if (bsize > BLOCK_8X8) {
-        dec_predict_b_extend(cm, xd, tile, mi_row, mi_col, mi_row_ori,
-                             mi_col_ori, top_bsize);
-      } else {
-        dec_predict_b_sub8x8_extend(cm, xd, tile, mi_row, mi_col,
-                                    mi_row_ori, mi_col_ori,
-                                    top_bsize, partition);
-      }
-      if (mi_col + hbs < cm->mi_cols && bsize > BLOCK_8X8) {
-        for (i = 0; i < MAX_MB_PLANE; i++) {
-          xd->plane[i].dst.buf = dst_buf1[i];
-          xd->plane[i].dst.stride = dst_stride1[i];
-        }
-        dec_predict_b_extend(cm, xd, tile, mi_row, mi_col + hbs, mi_row_ori,
-                             mi_col_ori, top_bsize);
-        for (i = 0; i < MAX_MB_PLANE; i++) {
-          xd->plane[i].dst.buf = dst_buf[i];
-          xd->plane[i].dst.stride = dst_stride[i];
-          vp9_build_masked_inter_predictor_complex(
-              xd,
-              dst_buf[i], dst_stride[i],
-              dst_buf1[i], dst_stride1[i],
-              &xd->plane[i],
-              mi_row, mi_col,
-              mi_row_ori, mi_col_ori,
-              bsize, top_bsize,
-              PARTITION_VERT);
-        }
-      }
-      break;
-    case PARTITION_SPLIT:
-      if (bsize == BLOCK_8X8) {
-        dec_predict_b_sub8x8_extend(cm, xd, tile, mi_row, mi_col,
-                                    mi_row_ori, mi_col_ori,
-                                    top_bsize, partition);
-      } else {
-        dec_predict_sb_complex_highbd(cm, xd, tile, mi_row, mi_col,
-                                      mi_row_ori, mi_col_ori, subsize,
-                                      top_bsize, dst_buf, dst_stride);
-        if (mi_row < cm->mi_rows && mi_col + hbs < cm->mi_cols)
-          dec_predict_sb_complex_highbd(cm, xd, tile, mi_row, mi_col + hbs,
-                                        mi_row_ori, mi_col_ori, subsize,
-                                        top_bsize, dst_buf1, dst_stride1);
-        if (mi_row + hbs < cm->mi_rows && mi_col < cm->mi_cols)
-          dec_predict_sb_complex_highbd(cm, xd, tile, mi_row + hbs, mi_col,
-                                        mi_row_ori, mi_col_ori, subsize,
-                                        top_bsize, dst_buf2, dst_stride2);
-        if (mi_row + hbs < cm->mi_rows && mi_col + hbs < cm->mi_cols)
-          dec_predict_sb_complex_highbd(cm, xd, tile,
-                                        mi_row + hbs, mi_col + hbs,
-                                        mi_row_ori, mi_col_ori, subsize,
-                                        top_bsize, dst_buf3, dst_stride3);
-        for (i = 0; i < MAX_MB_PLANE; i++) {
-          if (mi_row < cm->mi_rows && mi_col + hbs < cm->mi_cols) {
-            vp9_build_masked_inter_predictor_complex(
-                xd,
-                dst_buf[i], dst_stride[i],
-                dst_buf1[i],
-                dst_stride1[i],
-                &xd->plane[i],
-                mi_row, mi_col,
-                mi_row_ori, mi_col_ori,
-                bsize, top_bsize,
-                PARTITION_VERT);
-            if (mi_row + hbs < cm->mi_rows) {
-              vp9_build_masked_inter_predictor_complex(
-                  xd,
-                  dst_buf2[i],
-                  dst_stride2[i],
-                  dst_buf3[i],
-                  dst_stride3[i],
-                  &xd->plane[i],
-                  mi_row, mi_col,
-                  mi_row_ori, mi_col_ori,
-                  bsize, top_bsize,
-                  PARTITION_VERT);
-              vp9_build_masked_inter_predictor_complex(
-                  xd,
-                  dst_buf[i],
-                  dst_stride[i],
-                  dst_buf2[i],
-                  dst_stride2[i],
-                  &xd->plane[i],
-                  mi_row, mi_col,
-                  mi_row_ori, mi_col_ori,
-                  bsize, top_bsize,
-                  PARTITION_HORZ);
-            }
-          } else if (mi_row + hbs < cm->mi_rows && mi_col < cm->mi_cols) {
-            vp9_build_masked_inter_predictor_complex(
-                xd,
-                dst_buf[i],
-                dst_stride[i],
-                dst_buf2[i],
-                dst_stride2[i],
-                &xd->plane[i],
-                mi_row, mi_col,
-                mi_row_ori, mi_col_ori,
-                bsize, top_bsize,
-                PARTITION_HORZ);
-          }
-        }
-      }
-      break;
-#if CONFIG_EXT_PARTITION
-    case PARTITION_HORZ_A:
-      dec_predict_b_extend(cm, xd, tile, mi_row, mi_col, mi_row_ori,
-                           mi_col_ori, top_bsize);
-      for (i = 0; i < MAX_MB_PLANE; i++) {
-        xd->plane[i].dst.buf = dst_buf1[i];
-        xd->plane[i].dst.stride = dst_stride1[i];
-      }
-      dec_predict_b_extend(cm, xd, tile, mi_row, mi_col + hbs,
-                           mi_row_ori, mi_col_ori, top_bsize);
-      for (i = 0; i < MAX_MB_PLANE; i++) {
-        xd->plane[i].dst.buf = dst_buf2[i];
-        xd->plane[i].dst.stride = dst_stride2[i];
-      }
-      dec_predict_b_extend(cm, xd, tile, mi_row + hbs, mi_col,
-                           mi_row_ori, mi_col_ori, top_bsize);
       for (i = 0; i < MAX_MB_PLANE; i++) {
         xd->plane[i].dst.buf = dst_buf[i];
         xd->plane[i].dst.stride = dst_stride[i];
@@ -1457,9 +1414,9 @@ static void dec_predict_sb_complex_highbd(
                                                  dst_buf1[i], dst_stride1[i],
                                                  &xd->plane[i],
                                                  mi_row, mi_col,
-                                                 mi_row_ori, mi_col_ori,
+                                                 mi_row_top, mi_col_top,
                                                  bsize, top_bsize,
-                                                 PARTITION_VERT);
+                                                 PARTITION_HORZ, i);
       }
       for (i = 0; i < MAX_MB_PLANE; i++) {
         vp9_build_masked_inter_predictor_complex(xd,
@@ -1467,64 +1424,35 @@ static void dec_predict_sb_complex_highbd(
                                                  dst_buf2[i], dst_stride2[i],
                                                  &xd->plane[i],
                                                  mi_row, mi_col,
-                                                 mi_row_ori, mi_col_ori,
+                                                 mi_row_top, mi_col_top,
                                                  bsize, top_bsize,
-                                                 PARTITION_HORZ);
-      }
-      break;
-    case PARTITION_VERT_A:
-      dec_predict_b_extend(cm, xd, tile, mi_row, mi_col, mi_row_ori,
-                           mi_col_ori, top_bsize);
-      for (i = 0; i < MAX_MB_PLANE; i++) {
-        xd->plane[i].dst.buf = dst_buf1[i];
-        xd->plane[i].dst.stride = dst_stride1[i];
-      }
-      dec_predict_b_extend(cm, xd, tile, mi_row + hbs, mi_col,
-                           mi_row_ori, mi_col_ori, top_bsize);
-      for (i = 0; i < MAX_MB_PLANE; i++) {
-        xd->plane[i].dst.buf = dst_buf2[i];
-        xd->plane[i].dst.stride = dst_stride2[i];
-      }
-      dec_predict_b_extend(cm, xd, tile, mi_row, mi_col + hbs,
-                           mi_row_ori, mi_col_ori, top_bsize);
-      for (i = 0; i < MAX_MB_PLANE; i++) {
-        xd->plane[i].dst.buf = dst_buf[i];
-        xd->plane[i].dst.stride = dst_stride[i];
-        vp9_build_masked_inter_predictor_complex(xd,
-                                                 dst_buf[i], dst_stride[i],
-                                                 dst_buf1[i], dst_stride1[i],
-                                                 &xd->plane[i],
-                                                 mi_row, mi_col,
-                                                 mi_row_ori, mi_col_ori,
-                                                 bsize, top_bsize,
-                                                 PARTITION_HORZ);
-      }
-      for (i = 0; i < MAX_MB_PLANE; i++) {
-        vp9_build_masked_inter_predictor_complex(xd,
-                                                 dst_buf[i], dst_stride[i],
-                                                 dst_buf2[i], dst_stride2[i],
-                                                 &xd->plane[i],
-                                                 mi_row, mi_col,
-                                                 mi_row_ori, mi_col_ori,
-                                                 bsize, top_bsize,
-                                                 PARTITION_VERT);
+                                                 PARTITION_VERT, i);
       }
       break;
     case PARTITION_HORZ_B:
-      dec_predict_b_extend(cm, xd, tile, mi_row, mi_col, mi_row_ori,
-                           mi_col_ori, top_bsize);
-      for (i = 0; i < MAX_MB_PLANE; i++) {
-        xd->plane[i].dst.buf = dst_buf1[i];
-        xd->plane[i].dst.stride = dst_stride1[i];
-      }
-      dec_predict_b_extend(cm, xd, tile, mi_row + hbs, mi_col,
-                           mi_row_ori, mi_col_ori, top_bsize);
-      for (i = 0; i < MAX_MB_PLANE; i++) {
-        xd->plane[i].dst.buf = dst_buf2[i];
-        xd->plane[i].dst.stride = dst_stride2[i];
-      }
-      dec_predict_b_extend(cm, xd, tile, mi_row + hbs, mi_col + hbs,
-                           mi_row_ori, mi_col_ori, top_bsize);
+      dec_predict_b_extend(cm, xd, tile, 0, mi_row, mi_col, mi_row, mi_col,
+                           mi_row_top, mi_col_top, dst_buf, dst_stride,
+                           top_bsize, subsize, 0, 0);
+      if (bsize < top_bsize)
+        dec_extend_all(cm, xd, tile, 0, subsize, top_bsize, mi_row, mi_col,
+                       mi_row_top, mi_col_top, dst_buf, dst_stride);
+      else
+        dec_extend_dir(cm, xd, tile, 0, subsize, top_bsize, mi_row, mi_col,
+                       mi_row_top, mi_col_top, dst_buf, dst_stride, 0);
+
+      dec_predict_b_extend(cm, xd, tile, 0, mi_row + hbs, mi_col, mi_row + hbs,
+                           mi_col, mi_row_top, mi_col_top,
+                           dst_buf1, dst_stride1, top_bsize, bsize2, 0, 0);
+      dec_extend_all(cm, xd, tile, 0, bsize2, top_bsize, mi_row + hbs, mi_col,
+                     mi_row_top, mi_col_top, dst_buf1, dst_stride1);
+
+      dec_predict_b_extend(cm, xd, tile, 0, mi_row + hbs, mi_col + hbs,
+                           mi_row + hbs, mi_col + hbs, mi_row_top, mi_col_top,
+                           dst_buf2, dst_stride2, top_bsize, bsize2, 0, 0);
+      dec_extend_all(cm, xd, tile, 0, bsize2, top_bsize,
+                     mi_row + hbs, mi_col + hbs,
+                     mi_row_top, mi_col_top, dst_buf2, dst_stride2);
+
       for (i = 0; i < MAX_MB_PLANE; i++) {
         xd->plane[i].dst.buf = dst_buf1[i];
         xd->plane[i].dst.stride = dst_stride1[i];
@@ -1533,9 +1461,9 @@ static void dec_predict_sb_complex_highbd(
                                                  dst_buf2[i], dst_stride2[i],
                                                  &xd->plane[i],
                                                  mi_row, mi_col,
-                                                 mi_row_ori, mi_col_ori,
+                                                 mi_row_top, mi_col_top,
                                                  bsize, top_bsize,
-                                                 PARTITION_VERT);
+                                                 PARTITION_VERT, i);
       }
       for (i = 0; i < MAX_MB_PLANE; i++) {
         xd->plane[i].dst.buf = dst_buf[i];
@@ -1545,26 +1473,35 @@ static void dec_predict_sb_complex_highbd(
                                                  dst_buf1[i], dst_stride1[i],
                                                  &xd->plane[i],
                                                  mi_row, mi_col,
-                                                 mi_row_ori, mi_col_ori,
+                                                 mi_row_top, mi_col_top,
                                                  bsize, top_bsize,
-                                                 PARTITION_HORZ);
+                                                 PARTITION_HORZ, i);
       }
       break;
     case PARTITION_VERT_B:
-      dec_predict_b_extend(cm, xd, tile, mi_row, mi_col, mi_row_ori,
-                           mi_col_ori, top_bsize);
-      for (i = 0; i < MAX_MB_PLANE; i++) {
-        xd->plane[i].dst.buf = dst_buf1[i];
-        xd->plane[i].dst.stride = dst_stride1[i];
-      }
-      dec_predict_b_extend(cm, xd, tile, mi_row, mi_col + hbs,
-                           mi_row_ori, mi_col_ori, top_bsize);
-      for (i = 0; i < MAX_MB_PLANE; i++) {
-        xd->plane[i].dst.buf = dst_buf2[i];
-        xd->plane[i].dst.stride = dst_stride2[i];
-      }
-      dec_predict_b_extend(cm, xd, tile, mi_row + hbs, mi_col + hbs,
-                           mi_row_ori, mi_col_ori, top_bsize);
+      dec_predict_b_extend(cm, xd, tile, 0, mi_row, mi_col, mi_row, mi_col,
+                           mi_row_top, mi_col_top, dst_buf, dst_stride,
+                           top_bsize, subsize, 0, 0);
+      if (bsize < top_bsize)
+        dec_extend_all(cm, xd, tile, 0, subsize, top_bsize, mi_row, mi_col,
+                       mi_row_top, mi_col_top, dst_buf, dst_stride);
+      else
+        dec_extend_dir(cm, xd, tile, 0, subsize, top_bsize, mi_row, mi_col,
+                       mi_row_top, mi_col_top, dst_buf, dst_stride, 3);
+
+      dec_predict_b_extend(cm, xd, tile, 0, mi_row, mi_col + hbs,
+                           mi_row, mi_col + hbs, mi_row_top, mi_col_top,
+                           dst_buf1, dst_stride1, top_bsize, bsize2, 0, 0);
+      dec_extend_all(cm, xd, tile, 0, bsize2, top_bsize, mi_row, mi_col + hbs,
+                     mi_row_top, mi_col_top, dst_buf1, dst_stride1);
+
+      dec_predict_b_extend(cm, xd, tile, 0, mi_row + hbs, mi_col + hbs,
+                           mi_row + hbs, mi_col + hbs, mi_row_top, mi_col_top,
+                           dst_buf2, dst_stride2, top_bsize, bsize2, 0, 0);
+      dec_extend_all(cm, xd, tile, 0, bsize2, top_bsize,
+                     mi_row + hbs, mi_col + hbs,
+                     mi_row_top, mi_col_top, dst_buf2, dst_stride2);
+
       for (i = 0; i < MAX_MB_PLANE; i++) {
         xd->plane[i].dst.buf = dst_buf1[i];
         xd->plane[i].dst.stride = dst_stride1[i];
@@ -1573,9 +1510,9 @@ static void dec_predict_sb_complex_highbd(
                                                  dst_buf2[i], dst_stride2[i],
                                                  &xd->plane[i],
                                                  mi_row, mi_col,
-                                                 mi_row_ori, mi_col_ori,
+                                                 mi_row_top, mi_col_top,
                                                  bsize, top_bsize,
-                                                 PARTITION_HORZ);
+                                                 PARTITION_HORZ, i);
       }
       for (i = 0; i < MAX_MB_PLANE; i++) {
         xd->plane[i].dst.buf = dst_buf[i];
@@ -1585,9 +1522,9 @@ static void dec_predict_sb_complex_highbd(
                                                  dst_buf1[i], dst_stride1[i],
                                                  &xd->plane[i],
                                                  mi_row, mi_col,
-                                                 mi_row_ori, mi_col_ori,
+                                                 mi_row_top, mi_col_top,
                                                  bsize, top_bsize,
-                                                 PARTITION_VERT);
+                                                 PARTITION_VERT, i);
       }
       break;
 #endif
@@ -1595,7 +1532,7 @@ static void dec_predict_sb_complex_highbd(
       assert(0);
   }
 }
-#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 #endif  // CONFIG_SUPERTX
 
 static void decode_block(VP9_COMMON *const cm, MACROBLOCKD *const xd,
@@ -2040,14 +1977,8 @@ static void decode_partition(VP9_COMMON *const cm, MACROBLOCKD *const xd,
       dst_buf[i] = xd->plane[i].dst.buf;
       dst_stride[i] = xd->plane[i].dst.stride;
     }
-#if CONFIG_VP9_HIGHBITDEPTH
-    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
-      dec_predict_sb_complex_highbd(cm, xd, tile, mi_row, mi_col, mi_row,
-                                    mi_col, bsize, bsize, dst_buf, dst_stride);
-    else
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-      dec_predict_sb_complex(cm, xd, tile, mi_row, mi_col, mi_row, mi_col,
-                             bsize, bsize, dst_buf, dst_stride);
+    dec_predict_sb_complex(cm, xd, tile, mi_row, mi_col, mi_row, mi_col,
+                           bsize, bsize, dst_buf, dst_stride);
 
     if (!skip) {
       int eobtotal = 0;
index 5e85a177c8194a388c20ea067979284bd410c647..ee4bbe4f5d205f8be83e92929cbf9dd8958df730 100644 (file)
@@ -69,10 +69,10 @@ static int check_intra_sb(VP9_COMP *cpi, const TileInfo *const tile,
                           PC_TREE *pc_tree);
 static void predict_superblock(VP9_COMP *cpi,
 #if CONFIG_WEDGE_PARTITION
-                               int mi_row, int mi_col,
-#endif  // CONFIG_WEDGE_PARTITION
                                int mi_row_ori, int mi_col_ori,
-                               BLOCK_SIZE bsize);
+#endif  // CONFIG_WEDGE_PARTITION
+                               int mi_row_pred, int mi_col_pred,
+                               BLOCK_SIZE bsize_pred, int b_sub8x8, int block);
 static int check_supertx_sb(BLOCK_SIZE bsize, TX_SIZE supertx_size,
                             PC_TREE *pc_tree);
 static void predict_sb_complex(VP9_COMP *cpi, const TileInfo *const tile,
@@ -82,15 +82,6 @@ static void predict_sb_complex(VP9_COMP *cpi, const TileInfo *const tile,
                                BLOCK_SIZE top_bsize,
                                uint8_t *dst_buf[3], int dst_stride[3],
                                PC_TREE *pc_tree);
-#if CONFIG_VP9_HIGHBITDEPTH
-static void predict_sb_complex_highbd(VP9_COMP *cpi, const TileInfo *const tile,
-                                      int mi_row, int mi_col,
-                                      int mi_row_ori, int mi_col_ori,
-                                      int output_enabled, BLOCK_SIZE bsize,
-                                      BLOCK_SIZE top_bsize,
-                                      uint8_t *dst_buf[3], int dst_stride[3],
-                                      PC_TREE *pc_tree);
-#endif  // CONFIG_VP9_HIGHBITDEPTH
 static void update_state_sb_supertx(VP9_COMP *cpi, const TileInfo *const tile,
                                     int mi_row, int mi_col,
                                     BLOCK_SIZE bsize,
@@ -317,34 +308,37 @@ static void set_offsets_supertx(VP9_COMP *cpi, const TileInfo *const tile,
 }
 
 static void set_offsets_extend(VP9_COMP *cpi, const TileInfo *const tile,
-                               int mi_row, int mi_col,
+                               int mi_row_pred, int mi_col_pred,
                                int mi_row_ori, int mi_col_ori,
-                               BLOCK_SIZE bsize, BLOCK_SIZE top_bsize) {
+                               BLOCK_SIZE bsize_pred, BLOCK_SIZE bsize_ori) {
+  // Used in supertx
+  // (mi_row_ori, mi_col_ori, bsize_ori): region for mv
+  // (mi_row_pred, mi_col_pred, bsize_pred): region to predict
   MACROBLOCK *const x = &cpi->mb;
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *mbmi;
-  const int mi_width = num_8x8_blocks_wide_lookup[top_bsize];
-  const int mi_height = num_8x8_blocks_high_lookup[top_bsize];
+  const int mi_width = num_8x8_blocks_wide_lookup[bsize_pred];
+  const int mi_height = num_8x8_blocks_high_lookup[bsize_pred];
   const struct segmentation *const seg = &cm->seg;
 
-  set_modeinfo_offsets(cm, xd, mi_row, mi_col);
+  set_modeinfo_offsets(cm, xd, mi_row_ori, mi_col_ori);
 
   mbmi = &xd->mi[0].src_mi->mbmi;
 
   // Set up limit values for MV components.
   // Mv beyond the range do not produce new/different prediction block.
-  x->mv_row_min = -(((mi_row_ori + mi_height) * MI_SIZE) + VP9_INTERP_EXTEND);
-  x->mv_col_min = -(((mi_col_ori + mi_width) * MI_SIZE) + VP9_INTERP_EXTEND);
-  x->mv_row_max = (cm->mi_rows - mi_row_ori) * MI_SIZE + VP9_INTERP_EXTEND;
-  x->mv_col_max = (cm->mi_cols - mi_col_ori) * MI_SIZE + VP9_INTERP_EXTEND;
+  x->mv_row_min = -(((mi_row_pred + mi_height) * MI_SIZE) + VP9_INTERP_EXTEND);
+  x->mv_col_min = -(((mi_col_pred + mi_width) * MI_SIZE) + VP9_INTERP_EXTEND);
+  x->mv_row_max = (cm->mi_rows - mi_row_pred) * MI_SIZE + VP9_INTERP_EXTEND;
+  x->mv_col_max = (cm->mi_cols - mi_col_pred) * MI_SIZE + VP9_INTERP_EXTEND;
 
   // Set up distance of MB to edge of frame in 1/8th pel units.
-  assert(!(mi_col_ori & (mi_width - 1)) && !(mi_row_ori & (mi_height - 1)));
-  set_mi_row_col(xd, tile, mi_row_ori, mi_height, mi_col_ori, mi_width,
+  assert(!(mi_col_pred & (mi_width - 1)) && !(mi_row_pred & (mi_height - 1)));
+  set_mi_row_col(xd, tile, mi_row_pred, mi_height, mi_col_pred, mi_width,
                  cm->mi_rows, cm->mi_cols);
-  xd->up_available    = (mi_row != 0);
-  xd->left_available  = (mi_col > tile->mi_col_start);
+  xd->up_available    = (mi_row_ori != 0);
+  xd->left_available  = (mi_col_ori > tile->mi_col_start);
 
   // R/D setup.
   x->rddiv = cpi->rd.RDDIV;
@@ -355,7 +349,8 @@ static void set_offsets_extend(VP9_COMP *cpi, const TileInfo *const tile,
     if (cpi->oxcf.aq_mode != VARIANCE_AQ) {
       const uint8_t *const map = seg->update_map ? cpi->segmentation_map
                                                  : cm->last_frame_seg_map;
-      mbmi->segment_id = vp9_get_segment_id(cm, map, bsize, mi_row, mi_col);
+      mbmi->segment_id = vp9_get_segment_id(cm, map, bsize_ori,
+                                            mi_row_ori, mi_col_ori);
     }
     vp9_init_plane_quantizers(cpi, x);
 
@@ -1728,16 +1723,9 @@ static void encode_sb(VP9_COMP *cpi, const TileInfo *const tile,
         dst_buf[i] = xd->plane[i].dst.buf;
         dst_stride[i] = xd->plane[i].dst.stride;
       }
-#if CONFIG_VP9_HIGHBITDEPTH
-      if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
-        predict_sb_complex_highbd(cpi, tile, mi_row, mi_col, mi_row, mi_col,
-                                  output_enabled, bsize, bsize,
-                                  dst_buf, dst_stride, pc_tree);
-      else
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-        predict_sb_complex(cpi, tile, mi_row, mi_col, mi_row, mi_col,
-                           output_enabled, bsize, bsize,
-                           dst_buf, dst_stride, pc_tree);
+      predict_sb_complex(cpi, tile, mi_row, mi_col, mi_row, mi_col,
+                         output_enabled, bsize, bsize,
+                         dst_buf, dst_stride, pc_tree);
 
       set_offsets(cpi, tile, mi_row, mi_col, bsize);
       if (!x->skip) {
@@ -3733,6 +3721,7 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
   }
 }
 
+
 static void encode_rd_sb_row(VP9_COMP *cpi, const TileInfo *const tile,
                              int mi_row, TOKENEXTRA **tp) {
   VP9_COMMON *const cm = &cpi->common;
@@ -4503,7 +4492,8 @@ static void sum_intra_stats(FRAME_COUNTS *counts,
 #if CONFIG_FILTERINTRA
   if (is_filter_allowed(uv_mode) &&
       is_filter_enabled(get_uv_tx_size(&(mi->mbmi), &xd->plane[1])))
-    ++counts->filterintra[get_uv_tx_size(&(mi->mbmi), &xd->plane[1])][uv_mode][uv_fbit];
+    ++counts->filterintra[get_uv_tx_size(&(mi->mbmi),
+                          &xd->plane[1])][uv_mode][uv_fbit];
 #endif
 }
 
@@ -4853,10 +4843,13 @@ static int check_supertx_sb(BLOCK_SIZE bsize, TX_SIZE supertx_size,
 
 static void predict_superblock(VP9_COMP *cpi,
 #if CONFIG_WEDGE_PARTITION
-                               int mi_row, int mi_col,
-#endif  // CONFIG_WEDGE_PARTITION
                                int mi_row_ori, int mi_col_ori,
-                               BLOCK_SIZE bsize) {
+#endif  // CONFIG_WEDGE_PARTITION
+                               int mi_row_pred, int mi_col_pred,
+                               BLOCK_SIZE bsize_pred, int b_sub8x8, int block) {
+  // Used in supertx
+  // (mi_row_ori, mi_col_ori): location for mv
+  // (mi_row_pred, mi_col_pred, bsize_pred): region to predict
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
@@ -4871,82 +4864,186 @@ static void predict_superblock(VP9_COMP *cpi,
   for (ref = 0; ref < 1 + is_compound; ++ref) {
     YV12_BUFFER_CONFIG *cfg = get_ref_frame_buffer(cpi,
                                                    mbmi->ref_frame[ref]);
-    vp9_setup_pre_planes(xd, ref, cfg, mi_row_ori, mi_col_ori,
+    vp9_setup_pre_planes(xd, ref, cfg, mi_row_pred, mi_col_pred,
                          &xd->block_refs[ref]->sf);
   }
-#if CONFIG_WEDGE_PARTITION
-  vp9_build_inter_predictors_sb_extend(xd, mi_row, mi_col,
-                                       mi_row_ori, mi_col_ori, bsize);
+
+#if !CONFIG_WEDGE_PARTITION
+  if (!b_sub8x8)
+    vp9_build_inter_predictors_sb(xd, mi_row_pred, mi_col_pred, bsize_pred);
+  else
+    vp9_build_inter_predictors_sb_sub8x8(xd, mi_row_pred, mi_col_pred,
+                                         bsize_pred, block);
 #else
-  vp9_build_inter_predictors_sb(xd, mi_row_ori, mi_col_ori, bsize);
+  if (!b_sub8x8)
+    vp9_build_inter_predictors_sb_extend(xd, mi_row_ori, mi_col_ori,
+                                         mi_row_pred, mi_col_pred, bsize_pred);
+  else
+    vp9_build_inter_predictors_sb_sub8x8_extend(
+        xd, mi_row_ori, mi_col_ori,
+        mi_row_pred, mi_col_pred, bsize_pred, block);
 #endif  // CONFIG_WEDGE_PARTITION
 }
 
-static void predict_superblock_sub8x8_extend(VP9_COMP *cpi,
-                                             int mi_row, int mi_col,
-                                             int mi_row_ori, int mi_col_ori,
-                                             BLOCK_SIZE top_bsize,
-                                             PARTITION_TYPE partition) {
-  VP9_COMMON *const cm = &cpi->common;
+static void predict_b_extend(VP9_COMP *cpi, const TileInfo *const tile,
+                             int block,
+                             int mi_row_ori, int mi_col_ori,
+                             int mi_row_pred, int mi_col_pred,
+                             int mi_row_top, int mi_col_top,
+                             uint8_t * dst_buf[3], int dst_stride[3],
+                             BLOCK_SIZE bsize_ori, BLOCK_SIZE bsize_top,
+                             BLOCK_SIZE bsize_pred, int output_enabled,
+                             int b_sub8x8, int bextend) {
+  // Used in supertx
+  // (mi_row_ori, mi_col_ori): location for mv
+  // (mi_row_pred, mi_col_pred, bsize_pred): region to predict
+  // (mi_row_top, mi_col_top, bsize_top): region of the top partition size
+  // block: sub location of sub8x8 blocks
+  // b_sub8x8: 1: ori is sub8x8; 0: ori is not sub8x8
+  // bextend: 1: region to predict is an extension of ori; 0: not
+
   MACROBLOCK *const x = &cpi->mb;
+  VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
-  MODE_INFO *mi_8x8 = xd->mi;
-  MODE_INFO *mi = mi_8x8;
-  MB_MODE_INFO *mbmi = &mi->mbmi;
-  int ref;
-  const int is_compound = has_second_ref(mbmi);
+  int r = (mi_row_pred - mi_row_top) * MI_SIZE;
+  int c = (mi_col_pred - mi_col_top) * MI_SIZE;
+  const int mi_width_top = num_8x8_blocks_wide_lookup[bsize_top];
+  const int mi_height_top = num_8x8_blocks_high_lookup[bsize_top];
+
+  if (mi_row_pred < mi_row_top || mi_col_pred < mi_col_top ||
+      mi_row_pred >= mi_row_top + mi_height_top ||
+      mi_col_pred >= mi_col_top + mi_width_top ||
+      mi_row_pred >= cm->mi_rows || mi_col_pred >= cm->mi_cols)
+    return;
 
-  set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+  set_offsets_extend(cpi, tile, mi_row_pred, mi_col_pred,
+                     mi_row_ori, mi_col_ori, bsize_pred, bsize_ori);
+  xd->plane[0].dst.stride = dst_stride[0];
+  xd->plane[1].dst.stride = dst_stride[1];
+  xd->plane[2].dst.stride = dst_stride[2];
+  xd->plane[0].dst.buf = dst_buf[0] +
+                         (r >> xd->plane[0].subsampling_y) * dst_stride[0] +
+                         (c >> xd->plane[0].subsampling_x);
+  xd->plane[1].dst.buf = dst_buf[1] +
+                         (r >> xd->plane[1].subsampling_y) * dst_stride[1] +
+                         (c >> xd->plane[1].subsampling_x);
+  xd->plane[2].dst.buf = dst_buf[2] +
+                         (r >> xd->plane[2].subsampling_y) * dst_stride[2] +
+                         (c >> xd->plane[2].subsampling_x);
 
-  for (ref = 0; ref < 1 + is_compound; ++ref) {
-    YV12_BUFFER_CONFIG *cfg = get_ref_frame_buffer(cpi,
-                                                   mbmi->ref_frame[ref]);
-    vp9_setup_pre_planes(xd, ref, cfg, mi_row_ori, mi_col_ori,
-                         &xd->block_refs[ref]->sf);
-  }
-  vp9_build_inter_predictors_sby_sub8x8_extend(xd, mi_row, mi_col,
-                                               mi_row_ori, mi_col_ori,
-                                               top_bsize, partition);
-  vp9_build_inter_predictors_sbuv_sub8x8_extend(xd,
+  predict_superblock(cpi,
 #if CONFIG_WEDGE_PARTITION
-                                                mi_row, mi_col,
+                     mi_row_ori, mi_col_ori,
 #endif
-                                                mi_row_ori, mi_col_ori,
-                                                top_bsize);
-}
+                     mi_row_pred, mi_col_pred, bsize_pred,
+                     b_sub8x8, block);
 
-static void predict_b_sub8x8_extend(VP9_COMP *cpi, const TileInfo *const tile,
-                                    int mi_row, int mi_col,
-                                    int mi_row_ori, int mi_col_ori,
-                                    int output_enabled,
-                                    BLOCK_SIZE bsize, BLOCK_SIZE top_bsize,
-                                    PARTITION_TYPE partition) {
-  set_offsets_extend(cpi, tile, mi_row, mi_col, mi_row_ori, mi_col_ori,
-                     bsize, top_bsize);
-  predict_superblock_sub8x8_extend(cpi, mi_row, mi_col, mi_row_ori, mi_col_ori,
-                                   top_bsize, partition);
-
-  if (output_enabled)
+  if (output_enabled && !bextend)
     update_stats(&cpi->common, &cpi->mb);
 }
 
-static void predict_b_extend(VP9_COMP *cpi, const TileInfo *const tile,
-                             int mi_row, int mi_col,
-                             int mi_row_ori, int mi_col_ori,
-                             int output_enabled,
-                             BLOCK_SIZE bsize, BLOCK_SIZE top_bsize) {
-  set_offsets_extend(cpi, tile, mi_row, mi_col, mi_row_ori, mi_col_ori,
-                     bsize, top_bsize);
-  predict_superblock(cpi,
-#if CONFIG_WEDGE_PARTITION
-                     mi_row, mi_col,
-#endif
-                     mi_row_ori, mi_col_ori, top_bsize);
+static void extend_dir(VP9_COMP *cpi, const TileInfo *const tile,
+                       int block, BLOCK_SIZE bsize, BLOCK_SIZE top_bsize,
+                       int mi_row, int mi_col,
+                       int mi_row_top, int mi_col_top,
+                       int output_enabled,
+                       uint8_t * dst_buf[3], int dst_stride[3], int dir) {
+  // dir: 0-lower, 1-upper, 2-left, 3-right
+  //      4-lowerleft, 5-upperleft, 6-lowerright, 7-upperright
+  MACROBLOCKD *xd = &cpi->mb.e_mbd;
+  const int mi_width = num_8x8_blocks_wide_lookup[bsize];
+  const int mi_height = num_8x8_blocks_high_lookup[bsize];
+  int xss = xd->plane[1].subsampling_x;
+  int yss = xd->plane[1].subsampling_y;
+  int b_sub8x8 = (bsize < BLOCK_8X8) ? 1 : 0;
+
+  BLOCK_SIZE extend_bsize;
+  int unit, mi_row_pred, mi_col_pred;
+
+  if (dir == 0 || dir == 1) {  // lower and upper
+    extend_bsize = (mi_width == 1 || bsize < BLOCK_8X8 || xss < yss) ?
+                   BLOCK_8X8 : BLOCK_16X8;
+    unit = num_8x8_blocks_wide_lookup[extend_bsize];
+    mi_row_pred = mi_row + ((dir == 0) ? mi_height : -1);
+    mi_col_pred = mi_col;
+
+    predict_b_extend(cpi, tile, block, mi_row, mi_col,
+                     mi_row_pred, mi_col_pred,
+                     mi_row_top, mi_col_top, dst_buf, dst_stride,
+                     bsize, top_bsize, extend_bsize,
+                     output_enabled, b_sub8x8, 1);
+
+    if (mi_width > unit) {
+      int i;
+      for (i = 0; i < mi_width/unit - 1; i++) {
+        mi_col_pred += unit;
+        predict_b_extend(cpi, tile, block, mi_row, mi_col,
+                         mi_row_pred, mi_col_pred, mi_row_top, mi_col_top,
+                         dst_buf, dst_stride, bsize, top_bsize, extend_bsize,
+                         output_enabled, b_sub8x8, 1);
+      }
+    }
+  } else if (dir == 2 || dir == 3) {  // left and right
+    extend_bsize = (mi_height == 1 || bsize < BLOCK_8X8 || yss < xss) ?
+                   BLOCK_8X8 : BLOCK_8X16;
+    unit = num_8x8_blocks_high_lookup[extend_bsize];
+    mi_row_pred = mi_row;
+    mi_col_pred = mi_col + ((dir == 3) ? mi_width : -1);
+
+    predict_b_extend(cpi, tile, block, mi_row, mi_col,
+                     mi_row_pred, mi_col_pred, mi_row_top, mi_col_top,
+                     dst_buf, dst_stride, bsize, top_bsize, extend_bsize,
+                     output_enabled, b_sub8x8, 1);
+
+    if (mi_height > unit) {
+      int i;
+      for (i = 0; i < mi_height/unit - 1; i++) {
+        mi_row_pred += unit;
+        predict_b_extend(cpi, tile, block, mi_row, mi_col,
+                         mi_row_pred, mi_col_pred, mi_row_top, mi_col_top,
+                         dst_buf, dst_stride, bsize, top_bsize, extend_bsize,
+                         output_enabled, b_sub8x8, 1);
+      }
+    }
+  } else {
+    extend_bsize = BLOCK_8X8;
+    mi_row_pred = mi_row + ((dir == 4 || dir == 6) ? mi_height : -1);
+    mi_col_pred = mi_col + ((dir == 6 || dir == 7) ? mi_width : -1);
+
+    predict_b_extend(cpi, tile, block, mi_row, mi_col,
+                     mi_row_pred, mi_col_pred, mi_row_top, mi_col_top,
+                     dst_buf, dst_stride, bsize, top_bsize, extend_bsize,
+                     output_enabled, b_sub8x8, 1);
+  }
+}
 
-  if (output_enabled)
-    update_stats(&cpi->common, &cpi->mb);
+static void extend_all(VP9_COMP *cpi, const TileInfo *const tile,
+                       int block,
+                       BLOCK_SIZE bsize, BLOCK_SIZE top_bsize,
+                       int mi_row, int mi_col,
+                       int mi_row_top, int mi_col_top,
+                       int output_enabled,
+                       uint8_t * dst_buf[3], int dst_stride[3]) {
+  assert(block >= 0 && block < 4);
+  extend_dir(cpi, tile, block, bsize, top_bsize, mi_row, mi_col,
+             mi_row_top, mi_col_top, output_enabled, dst_buf, dst_stride, 0);
+  extend_dir(cpi, tile, block, bsize, top_bsize, mi_row, mi_col,
+             mi_row_top, mi_col_top, output_enabled, dst_buf, dst_stride, 1);
+  extend_dir(cpi, tile, block, bsize, top_bsize, mi_row, mi_col,
+             mi_row_top, mi_col_top, output_enabled, dst_buf, dst_stride, 2);
+  extend_dir(cpi, tile, block, bsize, top_bsize, mi_row, mi_col,
+             mi_row_top, mi_col_top, output_enabled, dst_buf, dst_stride, 3);
+  extend_dir(cpi, tile, block, bsize, top_bsize, mi_row, mi_col,
+             mi_row_top, mi_col_top, output_enabled, dst_buf, dst_stride, 4);
+  extend_dir(cpi, tile, block, bsize, top_bsize, mi_row, mi_col,
+             mi_row_top, mi_col_top, output_enabled, dst_buf, dst_stride, 5);
+  extend_dir(cpi, tile, block, bsize, top_bsize, mi_row, mi_col,
+             mi_row_top, mi_col_top, output_enabled, dst_buf, dst_stride, 6);
+  extend_dir(cpi, tile, block, bsize, top_bsize, mi_row, mi_col,
+             mi_row_top, mi_col_top, output_enabled, dst_buf, dst_stride, 7);
 }
 
+
 // This function generates prediction for multiple blocks, between which
 // discontinuity around boundary is reduced by smoothing masks. The basic
 // smoothing mask is a soft step function along horz/vert direction. In more
@@ -4957,7 +5054,7 @@ static void predict_b_extend(VP9_COMP *cpi, const TileInfo *const tile,
 // prediction is stored in dst_buf[] passed from higher level.
 static void predict_sb_complex(VP9_COMP *cpi, const TileInfo *const tile,
                                int mi_row, int mi_col,
-                               int mi_row_ori, int mi_col_ori,
+                               int mi_row_top, int mi_col_top,
                                int output_enabled, BLOCK_SIZE bsize,
                                BLOCK_SIZE top_bsize,
                                uint8_t *dst_buf[3], int dst_stride[3],
@@ -4974,27 +5071,42 @@ static void predict_sb_complex(VP9_COMP *cpi, const TileInfo *const tile,
 #endif
 
   int i, ctx;
+  uint8_t *dst_buf1[3], *dst_buf2[3], *dst_buf3[3];
   DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp_buf1,
-                        MAX_MB_PLANE * MAXTXLEN * MAXTXLEN);
+                        MAX_MB_PLANE * MAXTXLEN * MAXTXLEN * sizeof(uint16_t));
   DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp_buf2,
-                        MAX_MB_PLANE * MAXTXLEN * MAXTXLEN);
+                        MAX_MB_PLANE * MAXTXLEN * MAXTXLEN * sizeof(uint16_t));
   DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp_buf3,
-                        MAX_MB_PLANE * MAXTXLEN * MAXTXLEN);
-  uint8_t *dst_buf1[3] = {
-    tmp_buf1,
-    tmp_buf1 + MAXTXLEN * MAXTXLEN,
-    tmp_buf1 + 2 * MAXTXLEN * MAXTXLEN};
-  uint8_t *dst_buf2[3] = {
-    tmp_buf2,
-    tmp_buf2 + MAXTXLEN * MAXTXLEN,
-    tmp_buf2 + 2 * MAXTXLEN * MAXTXLEN};
-  uint8_t *dst_buf3[3] = {
-    tmp_buf3,
-    tmp_buf3 + MAXTXLEN * MAXTXLEN,
-    tmp_buf3 + 2 * MAXTXLEN * MAXTXLEN};
+                        MAX_MB_PLANE * MAXTXLEN * MAXTXLEN * sizeof(uint16_t));
   int dst_stride1[3] = {MAXTXLEN, MAXTXLEN, MAXTXLEN};
   int dst_stride2[3] = {MAXTXLEN, MAXTXLEN, MAXTXLEN};
   int dst_stride3[3] = {MAXTXLEN, MAXTXLEN, MAXTXLEN};
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    int len = sizeof(uint16_t);
+    dst_buf1[0] = CONVERT_TO_BYTEPTR(tmp_buf1);
+    dst_buf1[1] = CONVERT_TO_BYTEPTR(tmp_buf1 + MAXTXLEN * MAXTXLEN * len);
+    dst_buf1[2] = CONVERT_TO_BYTEPTR(tmp_buf1 + 2 * MAXTXLEN * MAXTXLEN * len);
+    dst_buf2[0] = CONVERT_TO_BYTEPTR(tmp_buf2);
+    dst_buf2[1] = CONVERT_TO_BYTEPTR(tmp_buf2 + MAXTXLEN * MAXTXLEN * len);
+    dst_buf2[2] = CONVERT_TO_BYTEPTR(tmp_buf2 + 2 * MAXTXLEN * MAXTXLEN * len);
+    dst_buf3[0] = CONVERT_TO_BYTEPTR(tmp_buf3);
+    dst_buf3[1] = CONVERT_TO_BYTEPTR(tmp_buf3 + MAXTXLEN * MAXTXLEN * len);
+    dst_buf3[2] = CONVERT_TO_BYTEPTR(tmp_buf3 + 2 * MAXTXLEN * MAXTXLEN * len);
+  } else {
+#endif
+    dst_buf1[0] = tmp_buf1;
+    dst_buf1[1] = tmp_buf1 + MAXTXLEN * MAXTXLEN;
+    dst_buf1[2] = tmp_buf1 + 2 * MAXTXLEN * MAXTXLEN;
+    dst_buf2[0] = tmp_buf2;
+    dst_buf2[1] = tmp_buf2 + MAXTXLEN * MAXTXLEN;
+    dst_buf2[2] = tmp_buf2 + 2 * MAXTXLEN * MAXTXLEN;
+    dst_buf3[0] = tmp_buf3;
+    dst_buf3[1] = tmp_buf3 + MAXTXLEN * MAXTXLEN;
+    dst_buf3[2] = tmp_buf3 + 2 * MAXTXLEN * MAXTXLEN;
+#if CONFIG_VP9_HIGHBITDEPTH
+  }
+#endif
 
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
     return;
@@ -5022,97 +5134,208 @@ static void predict_sb_complex(VP9_COMP *cpi, const TileInfo *const tile,
   switch (partition) {
     case PARTITION_NONE:
       assert(bsize < top_bsize);
-      predict_b_extend(cpi, tile, mi_row, mi_col, mi_row_ori, mi_col_ori,
-                       output_enabled, bsize, top_bsize);
+      predict_b_extend(cpi, tile, 0, mi_row, mi_col, mi_row, mi_col,
+                       mi_row_top, mi_col_top, dst_buf, dst_stride,
+                       bsize, top_bsize, bsize, output_enabled, 0, 0);
+      extend_all(cpi, tile, 0, bsize, top_bsize, mi_row, mi_col,
+                 mi_row_top, mi_col_top, output_enabled, dst_buf, dst_stride);
       break;
     case PARTITION_HORZ:
-      if (bsize > BLOCK_8X8) {
-        predict_b_extend(cpi, tile, mi_row, mi_col, mi_row_ori, mi_col_ori,
-                         output_enabled, subsize, top_bsize);
-      } else {
-        predict_b_sub8x8_extend(cpi, tile, mi_row, mi_col,
-                                mi_row_ori, mi_col_ori, output_enabled,
-                                bsize, top_bsize, PARTITION_HORZ);
-      }
-      if (mi_row + hbs < cm->mi_rows && bsize > BLOCK_8X8) {
-        for (i = 0; i < MAX_MB_PLANE; i++) {
-          xd->plane[i].dst.buf = dst_buf1[i];
-          xd->plane[i].dst.stride = dst_stride1[i];
-        }
-        predict_b_extend(cpi, tile, mi_row + hbs, mi_col,
-                         mi_row_ori, mi_col_ori, output_enabled,
-                         subsize, top_bsize);
-        for (i = 0; i < MAX_MB_PLANE; i++) {
-          xd->plane[i].dst.buf = dst_buf[i];
-          xd->plane[i].dst.stride = dst_stride[i];
-          vp9_build_masked_inter_predictor_complex(xd,
-                                                   dst_buf[i], dst_stride[i],
-                                                   dst_buf1[i], dst_stride1[i],
-                                                   &xd->plane[i],
-                                                   mi_row, mi_col,
-                                                   mi_row_ori, mi_col_ori,
-                                                   bsize, top_bsize,
-                                                   PARTITION_HORZ);
+      if (bsize == BLOCK_8X8) {
+        // First half
+        predict_b_extend(cpi, tile, 0, mi_row, mi_col, mi_row, mi_col,
+                         mi_row_top, mi_col_top, dst_buf, dst_stride,
+                         subsize, top_bsize, BLOCK_8X8, output_enabled, 1, 0);
+        if (bsize < top_bsize)
+          extend_all(cpi, tile, 0, subsize, top_bsize, mi_row, mi_col,
+                     mi_row_top, mi_col_top, output_enabled,
+                     dst_buf, dst_stride);
+
+        // Second half
+        predict_b_extend(cpi, tile, 2, mi_row, mi_col, mi_row, mi_col,
+                         mi_row_top, mi_col_top, dst_buf1, dst_stride1,
+                         subsize, top_bsize, BLOCK_8X8, output_enabled, 1, 1);
+        if (bsize < top_bsize)
+          extend_all(cpi, tile, 2, subsize, top_bsize, mi_row, mi_col,
+                     mi_row_top, mi_col_top, output_enabled,
+                     dst_buf1, dst_stride1);
+
+        // Smooth
+        xd->plane[0].dst.buf = dst_buf[0];
+        xd->plane[0].dst.stride = dst_stride[0];
+        vp9_build_masked_inter_predictor_complex(xd,
+                                                 dst_buf[0], dst_stride[0],
+                                                 dst_buf1[0], dst_stride1[0],
+                                                 &xd->plane[0],
+                                                 mi_row, mi_col,
+                                                 mi_row_top, mi_col_top,
+                                                 bsize, top_bsize,
+                                                 PARTITION_HORZ, 0);
+      }  else {
+        // First half
+        predict_b_extend(cpi, tile, 0, mi_row, mi_col, mi_row, mi_col,
+                         mi_row_top, mi_col_top, dst_buf, dst_stride,
+                         subsize, top_bsize, subsize, output_enabled, 0, 0);
+        if (bsize < top_bsize)
+          extend_all(cpi, tile, 0, subsize, top_bsize, mi_row, mi_col,
+                     mi_row_top, mi_col_top, output_enabled,
+                     dst_buf, dst_stride);
+        else
+          extend_dir(cpi, tile, 0, subsize, top_bsize, mi_row, mi_col,
+                     mi_row_top, mi_col_top, output_enabled,
+                     dst_buf, dst_stride, 0);
+
+        if (mi_row + hbs < cm->mi_rows) {
+          // Second half
+          predict_b_extend(cpi, tile, 0, mi_row + hbs, mi_col,
+                           mi_row + hbs, mi_col, mi_row_top, mi_col_top,
+                           dst_buf1, dst_stride1, subsize, top_bsize, subsize,
+                           output_enabled, 0, 0);
+          if (bsize < top_bsize)
+            extend_all(cpi, tile, 0, subsize, top_bsize, mi_row + hbs, mi_col,
+                       mi_row_top, mi_col_top, output_enabled,
+                       dst_buf1, dst_stride1);
+          else
+            extend_dir(cpi, tile, 0, subsize, top_bsize, mi_row + hbs, mi_col,
+                       mi_row_top, mi_col_top, output_enabled,
+                       dst_buf1, dst_stride1, 1);
+
+          // Smooth
+          for (i = 0; i < MAX_MB_PLANE; i++) {
+            xd->plane[i].dst.buf = dst_buf[i];
+            xd->plane[i].dst.stride = dst_stride[i];
+            vp9_build_masked_inter_predictor_complex(
+                xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i],
+                &xd->plane[i], mi_row, mi_col, mi_row_top, mi_col_top,
+                bsize, top_bsize, PARTITION_HORZ, i);
+          }
         }
       }
       break;
     case PARTITION_VERT:
-      if (bsize > BLOCK_8X8) {
-        predict_b_extend(cpi, tile, mi_row, mi_col, mi_row_ori, mi_col_ori,
-                         output_enabled, subsize, top_bsize);
+      if (bsize == BLOCK_8X8) {
+        // First half
+        predict_b_extend(cpi, tile, 0, mi_row, mi_col, mi_row, mi_col,
+                         mi_row_top, mi_col_top, dst_buf, dst_stride,
+                         subsize, top_bsize, BLOCK_8X8, output_enabled, 1, 0);
+        if (bsize < top_bsize)
+          extend_all(cpi, tile, 0, subsize, top_bsize, mi_row, mi_col,
+                     mi_row_top, mi_col_top, output_enabled,
+                     dst_buf, dst_stride);
+
+        // Second half
+        predict_b_extend(cpi, tile, 1, mi_row, mi_col, mi_row, mi_col,
+                         mi_row_top, mi_col_top, dst_buf1, dst_stride1,
+                         subsize, top_bsize, BLOCK_8X8, output_enabled, 1, 1);
+        if (bsize < top_bsize)
+          extend_all(cpi, tile, 1, subsize, top_bsize, mi_row, mi_col,
+                     mi_row_top, mi_col_top, output_enabled,
+                     dst_buf1, dst_stride1);
+
+        // Smooth
+        xd->plane[0].dst.buf = dst_buf[0];
+        xd->plane[0].dst.stride = dst_stride[0];
+        vp9_build_masked_inter_predictor_complex(xd,
+                                                 dst_buf[0], dst_stride[0],
+                                                 dst_buf1[0], dst_stride1[0],
+                                                 &xd->plane[0],
+                                                 mi_row, mi_col,
+                                                 mi_row_top, mi_col_top,
+                                                 bsize, top_bsize,
+                                                 PARTITION_VERT, 0);
       } else {
-        predict_b_sub8x8_extend(cpi, tile, mi_row, mi_col,
-                                mi_row_ori, mi_col_ori, output_enabled,
-                                bsize, top_bsize, PARTITION_VERT);
-      }
-      if (mi_col + hbs < cm->mi_cols && bsize > BLOCK_8X8) {
-        for (i = 0; i < MAX_MB_PLANE; i++) {
-          xd->plane[i].dst.buf = dst_buf1[i];
-          xd->plane[i].dst.stride = dst_stride1[i];
-        }
-        predict_b_extend(cpi, tile, mi_row, mi_col + hbs,
-                         mi_row_ori, mi_col_ori, output_enabled,
-                         subsize, top_bsize);
-        for (i = 0; i < MAX_MB_PLANE; i++) {
-          xd->plane[i].dst.buf = dst_buf[i];
-          xd->plane[i].dst.stride = dst_stride[i];
-          vp9_build_masked_inter_predictor_complex(xd,
-                                                   dst_buf[i], dst_stride[i],
-                                                   dst_buf1[i], dst_stride1[i],
-                                                   &xd->plane[i],
-                                                   mi_row, mi_col,
-                                                   mi_row_ori, mi_col_ori,
-                                                   bsize, top_bsize,
-                                                   PARTITION_VERT);
+        // bsize: not important, not useful
+        predict_b_extend(cpi, tile, 0, mi_row, mi_col, mi_row, mi_col,
+                         mi_row_top, mi_col_top, dst_buf, dst_stride,
+                         subsize, top_bsize, subsize, output_enabled, 0, 0);
+        if (bsize < top_bsize)
+          extend_all(cpi, tile, 0, subsize, top_bsize, mi_row, mi_col,
+                     mi_row_top, mi_col_top, output_enabled,
+                     dst_buf, dst_stride);
+        else
+          extend_dir(cpi, tile, 0, subsize, top_bsize, mi_row, mi_col,
+                     mi_row_top, mi_col_top, output_enabled,
+                     dst_buf, dst_stride, 3);
+
+
+        if (mi_col + hbs < cm->mi_cols) {
+          predict_b_extend(cpi, tile, 0, mi_row, mi_col + hbs,
+                           mi_row, mi_col + hbs, mi_row_top, mi_col_top,
+                           dst_buf1, dst_stride1, subsize, top_bsize, subsize,
+                           output_enabled, 0, 0);
+          if (bsize < top_bsize)
+            extend_all(cpi, tile, 0, subsize, top_bsize, mi_row, mi_col + hbs,
+                       mi_row_top, mi_col_top, output_enabled,
+                       dst_buf1, dst_stride1);
+          else
+            extend_dir(cpi, tile, 0, subsize, top_bsize, mi_row, mi_col + hbs,
+                       mi_row_top, mi_col_top, output_enabled,
+                       dst_buf1, dst_stride1, 2);
+
+          for (i = 0; i < MAX_MB_PLANE; i++) {
+            xd->plane[i].dst.buf = dst_buf[i];
+            xd->plane[i].dst.stride = dst_stride[i];
+            vp9_build_masked_inter_predictor_complex(
+                xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i],
+                &xd->plane[i], mi_row, mi_col, mi_row_top, mi_col_top,
+                bsize, top_bsize, PARTITION_VERT, i);
+          }
         }
       }
       break;
     case PARTITION_SPLIT:
       if (bsize == BLOCK_8X8) {
-        predict_b_sub8x8_extend(cpi, tile, mi_row, mi_col,
-                                mi_row_ori, mi_col_ori, output_enabled,
-                                bsize, top_bsize, PARTITION_SPLIT);
+        predict_b_extend(cpi, tile, 0, mi_row, mi_col, mi_row, mi_col,
+                         mi_row_top, mi_col_top, dst_buf, dst_stride,
+                         subsize, top_bsize, BLOCK_8X8, output_enabled, 1, 0);
+        predict_b_extend(cpi, tile, 1, mi_row, mi_col, mi_row, mi_col,
+                         mi_row_top, mi_col_top, dst_buf1, dst_stride1,
+                         subsize, top_bsize, BLOCK_8X8, output_enabled, 1, 1);
+        predict_b_extend(cpi, tile, 2, mi_row, mi_col, mi_row, mi_col,
+                         mi_row_top, mi_col_top, dst_buf2, dst_stride2,
+                         subsize, top_bsize, BLOCK_8X8, output_enabled, 1, 1);
+        predict_b_extend(cpi, tile, 3, mi_row, mi_col, mi_row, mi_col,
+                         mi_row_top, mi_col_top, dst_buf3, dst_stride3,
+                         subsize, top_bsize, BLOCK_8X8, output_enabled, 1, 1);
+
+        if (bsize < top_bsize) {
+          extend_all(cpi, tile, 0, subsize, top_bsize, mi_row, mi_col,
+                     mi_row_top, mi_col_top, output_enabled,
+                     dst_buf, dst_stride);
+          extend_all(cpi, tile, 1, subsize, top_bsize, mi_row, mi_col,
+                     mi_row_top, mi_col_top, output_enabled,
+                     dst_buf1, dst_stride1);
+          extend_all(cpi, tile, 2, subsize, top_bsize, mi_row, mi_col,
+                     mi_row_top, mi_col_top, output_enabled,
+                     dst_buf2, dst_stride2);
+          extend_all(cpi, tile, 3, subsize, top_bsize, mi_row, mi_col,
+                     mi_row_top, mi_col_top, output_enabled,
+                     dst_buf3, dst_stride3);
+        }
       } else {
         predict_sb_complex(cpi, tile, mi_row, mi_col,
-                           mi_row_ori, mi_col_ori, output_enabled, subsize,
+                           mi_row_top, mi_col_top, output_enabled, subsize,
                            top_bsize, dst_buf, dst_stride,
                            pc_tree->split[0]);
         if (mi_row < cm->mi_rows && mi_col + hbs < cm->mi_cols)
           predict_sb_complex(cpi, tile, mi_row, mi_col + hbs,
-                             mi_row_ori, mi_col_ori, output_enabled, subsize,
+                             mi_row_top, mi_col_top, output_enabled, subsize,
                              top_bsize, dst_buf1, dst_stride1,
                              pc_tree->split[1]);
         if (mi_row + hbs < cm->mi_rows && mi_col < cm->mi_cols)
           predict_sb_complex(cpi, tile, mi_row + hbs, mi_col,
-                             mi_row_ori, mi_col_ori, output_enabled, subsize,
+                             mi_row_top, mi_col_top, output_enabled, subsize,
                              top_bsize, dst_buf2, dst_stride2,
                              pc_tree->split[2]);
         if (mi_row + hbs < cm->mi_rows && mi_col + hbs < cm->mi_cols)
           predict_sb_complex(cpi, tile, mi_row + hbs, mi_col + hbs,
-                             mi_row_ori, mi_col_ori, output_enabled, subsize,
+                             mi_row_top, mi_col_top, output_enabled, subsize,
                              top_bsize, dst_buf3, dst_stride3,
                              pc_tree->split[3]);
+      }
         for (i = 0; i < MAX_MB_PLANE; i++) {
+          if (bsize == BLOCK_8X8 && i != 0)
+            continue;  // Skip <4x4 chroma smoothing
           if (mi_row < cm->mi_rows && mi_col + hbs < cm->mi_cols) {
             vp9_build_masked_inter_predictor_complex(xd,
                                                      dst_buf[i],
@@ -5121,9 +5344,9 @@ static void predict_sb_complex(VP9_COMP *cpi, const TileInfo *const tile,
                                                      dst_stride1[i],
                                                      &xd->plane[i],
                                                      mi_row, mi_col,
-                                                     mi_row_ori, mi_col_ori,
+                                                     mi_row_top, mi_col_top,
                                                      bsize, top_bsize,
-                                                     PARTITION_VERT);
+                                                     PARTITION_VERT, i);
             if (mi_row + hbs < cm->mi_rows) {
               vp9_build_masked_inter_predictor_complex(xd,
                                                        dst_buf2[i],
@@ -5132,9 +5355,9 @@ static void predict_sb_complex(VP9_COMP *cpi, const TileInfo *const tile,
                                                        dst_stride3[i],
                                                        &xd->plane[i],
                                                        mi_row, mi_col,
-                                                       mi_row_ori, mi_col_ori,
+                                                       mi_row_top, mi_col_top,
                                                        bsize, top_bsize,
-                                                       PARTITION_VERT);
+                                                       PARTITION_VERT, i);
               vp9_build_masked_inter_predictor_complex(xd,
                                                        dst_buf[i],
                                                        dst_stride[i],
@@ -5142,9 +5365,9 @@ static void predict_sb_complex(VP9_COMP *cpi, const TileInfo *const tile,
                                                        dst_stride2[i],
                                                        &xd->plane[i],
                                                        mi_row, mi_col,
-                                                       mi_row_ori, mi_col_ori,
+                                                       mi_row_top, mi_col_top,
                                                        bsize, top_bsize,
-                                                       PARTITION_HORZ);
+                                                       PARTITION_HORZ, i);
             }
           } else if (mi_row + hbs < cm->mi_rows && mi_col < cm->mi_cols) {
             vp9_build_masked_inter_predictor_complex(xd,
@@ -5154,31 +5377,39 @@ static void predict_sb_complex(VP9_COMP *cpi, const TileInfo *const tile,
                                                      dst_stride2[i],
                                                      &xd->plane[i],
                                                      mi_row, mi_col,
-                                                     mi_row_ori, mi_col_ori,
+                                                     mi_row_top, mi_col_top,
                                                      bsize, top_bsize,
-                                                     PARTITION_HORZ);
+                                                     PARTITION_HORZ, i);
           }
-        }
       }
       break;
 #if CONFIG_EXT_PARTITION
     case PARTITION_HORZ_A:
-      predict_b_extend(cpi, tile, mi_row, mi_col, mi_row_ori,
-                       mi_col_ori, output_enabled, bsize2, top_bsize);
-      for (i = 0; i < MAX_MB_PLANE; i++) {
-        xd->plane[i].dst.buf = dst_buf1[i];
-        xd->plane[i].dst.stride = dst_stride1[i];
-      }
-      predict_b_extend(cpi, tile, mi_row, mi_col + hbs,
-                       mi_row_ori, mi_col_ori, output_enabled, bsize2,
-                       top_bsize);
-      for (i = 0; i < MAX_MB_PLANE; i++) {
-        xd->plane[i].dst.buf = dst_buf2[i];
-        xd->plane[i].dst.stride = dst_stride2[i];
-      }
-      predict_b_extend(cpi, tile, mi_row + hbs, mi_col,
-                       mi_row_ori, mi_col_ori, output_enabled, subsize,
-                       top_bsize);
+      predict_b_extend(cpi, tile, 0, mi_row, mi_col, mi_row, mi_col,
+                       mi_row_top, mi_col_top, dst_buf, dst_stride,
+                       bsize2, top_bsize, bsize2, output_enabled, 0, 0);
+      extend_all(cpi, tile, 0, bsize2, top_bsize, mi_row, mi_col,
+                 mi_row_top, mi_col_top, output_enabled, dst_buf, dst_stride);
+
+      predict_b_extend(cpi, tile, 0, mi_row, mi_col + hbs,
+                       mi_row, mi_col + hbs, mi_row_top, mi_col_top,
+                       dst_buf1, dst_stride1, bsize2, top_bsize, bsize2,
+                       output_enabled, 0, 0);
+      extend_all(cpi, tile, 0, bsize2, top_bsize, mi_row, mi_col + hbs,
+                 mi_row_top, mi_col_top, output_enabled, dst_buf1, dst_stride1);
+
+      predict_b_extend(cpi, tile, 0, mi_row + hbs, mi_col, mi_row + hbs, mi_col,
+                       mi_row_top, mi_col_top, dst_buf2, dst_stride2,
+                       subsize, top_bsize, subsize, output_enabled, 0, 0);
+      if (bsize < top_bsize)
+        extend_all(cpi, tile, 0, subsize, top_bsize, mi_row + hbs, mi_col,
+                   mi_row_top, mi_col_top, output_enabled,
+                   dst_buf2, dst_stride2);
+      else
+        extend_dir(cpi, tile, 0, subsize, top_bsize, mi_row + hbs, mi_col,
+                   mi_row_top, mi_col_top, output_enabled,
+                   dst_buf2, dst_stride2, 1);
+
       for (i = 0; i < MAX_MB_PLANE; i++) {
         xd->plane[i].dst.buf = dst_buf[i];
         xd->plane[i].dst.stride = dst_stride[i];
@@ -5187,9 +5418,9 @@ static void predict_sb_complex(VP9_COMP *cpi, const TileInfo *const tile,
                                                  dst_buf1[i], dst_stride1[i],
                                                  &xd->plane[i],
                                                  mi_row, mi_col,
-                                                 mi_row_ori, mi_col_ori,
+                                                 mi_row_top, mi_col_top,
                                                  bsize, top_bsize,
-                                                 PARTITION_VERT);
+                                                 PARTITION_VERT, i);
       }
       for (i = 0; i < MAX_MB_PLANE; i++) {
         vp9_build_masked_inter_predictor_complex(xd,
@@ -5197,415 +5428,38 @@ static void predict_sb_complex(VP9_COMP *cpi, const TileInfo *const tile,
                                                  dst_buf2[i], dst_stride2[i],
                                                  &xd->plane[i],
                                                  mi_row, mi_col,
-                                                 mi_row_ori, mi_col_ori,
+                                                 mi_row_top, mi_col_top,
                                                  bsize, top_bsize,
-                                                 PARTITION_HORZ);
+                                                 PARTITION_HORZ, i);
       }
+
       break;
     case PARTITION_VERT_A:
-      predict_b_extend(cpi, tile, mi_row, mi_col, mi_row_ori, mi_col_ori,
-                       output_enabled, bsize2, top_bsize);
-      for (i = 0; i < MAX_MB_PLANE; i++) {
-        xd->plane[i].dst.buf = dst_buf1[i];
-        xd->plane[i].dst.stride = dst_stride1[i];
-      }
-      predict_b_extend(cpi, tile, mi_row + hbs, mi_col,
-                       mi_row_ori, mi_col_ori, output_enabled, bsize2,
-                       top_bsize);
-      for (i = 0; i < MAX_MB_PLANE; i++) {
-        xd->plane[i].dst.buf = dst_buf2[i];
-        xd->plane[i].dst.stride = dst_stride2[i];
-      }
-      predict_b_extend(cpi, tile, mi_row, mi_col + hbs, mi_row_ori,
-                       mi_col_ori, output_enabled, subsize, top_bsize);
-      for (i = 0; i < MAX_MB_PLANE; i++) {
-        xd->plane[i].dst.buf = dst_buf[i];
-        xd->plane[i].dst.stride = dst_stride[i];
-        vp9_build_masked_inter_predictor_complex(xd,
-                                                 dst_buf[i], dst_stride[i],
-                                                 dst_buf1[i], dst_stride1[i],
-                                                 &xd->plane[i],
-                                                 mi_row, mi_col,
-                                                 mi_row_ori, mi_col_ori,
-                                                 bsize, top_bsize,
-                                                 PARTITION_HORZ);
-      }
-      for (i = 0; i < MAX_MB_PLANE; i++) {
-        vp9_build_masked_inter_predictor_complex(xd,
-                                                 dst_buf[i], dst_stride[i],
-                                                 dst_buf2[i], dst_stride2[i],
-                                                 &xd->plane[i],
-                                                 mi_row, mi_col,
-                                                 mi_row_ori, mi_col_ori,
-                                                 bsize, top_bsize,
-                                                 PARTITION_VERT);
-      }
-      break;
-    case PARTITION_HORZ_B:
-      predict_b_extend(cpi, tile, mi_row, mi_col, mi_row_ori,
-                       mi_col_ori, output_enabled, subsize, top_bsize);
-      for (i = 0; i < MAX_MB_PLANE; i++) {
-        xd->plane[i].dst.buf = dst_buf1[i];
-        xd->plane[i].dst.stride = dst_stride1[i];
-      }
-      predict_b_extend(cpi, tile, mi_row + hbs, mi_col, mi_row_ori,
-                       mi_col_ori, output_enabled, bsize2, top_bsize);
-      for (i = 0; i < MAX_MB_PLANE; i++) {
-        xd->plane[i].dst.buf = dst_buf2[i];
-        xd->plane[i].dst.stride = dst_stride2[i];
-      }
-      predict_b_extend(cpi, tile, mi_row + hbs, mi_col + hbs, mi_row_ori,
-                       mi_col_ori, output_enabled, bsize2, top_bsize);
-      for (i = 0; i < MAX_MB_PLANE; i++) {
-        xd->plane[i].dst.buf = dst_buf1[i];
-        xd->plane[i].dst.stride = dst_stride1[i];
-        vp9_build_masked_inter_predictor_complex(xd,
-                                                 dst_buf1[i], dst_stride1[i],
-                                                 dst_buf2[i], dst_stride2[i],
-                                                 &xd->plane[i],
-                                                 mi_row, mi_col,
-                                                 mi_row_ori, mi_col_ori,
-                                                 bsize, top_bsize,
-                                                 PARTITION_VERT);
-      }
-      for (i = 0; i < MAX_MB_PLANE; i++) {
-        xd->plane[i].dst.buf = dst_buf[i];
-        xd->plane[i].dst.stride = dst_stride[i];
-        vp9_build_masked_inter_predictor_complex(xd,
-                                                 dst_buf[i], dst_stride[i],
-                                                 dst_buf1[i], dst_stride1[i],
-                                                 &xd->plane[i],
-                                                 mi_row, mi_col,
-                                                 mi_row_ori, mi_col_ori,
-                                                 bsize, top_bsize,
-                                                 PARTITION_HORZ);
-      }
-      break;
-    case PARTITION_VERT_B:
-      predict_b_extend(cpi, tile, mi_row, mi_col, mi_row_ori,
-                       mi_col_ori, output_enabled, subsize, top_bsize);
-      for (i = 0; i < MAX_MB_PLANE; i++) {
-        xd->plane[i].dst.buf = dst_buf1[i];
-        xd->plane[i].dst.stride = dst_stride1[i];
-      }
-      predict_b_extend(cpi, tile, mi_row, mi_col + hbs, mi_row_ori,
-                       mi_col_ori, output_enabled, bsize2, top_bsize);
-      for (i = 0; i < MAX_MB_PLANE; i++) {
-        xd->plane[i].dst.buf = dst_buf2[i];
-        xd->plane[i].dst.stride = dst_stride2[i];
-      }
-      predict_b_extend(cpi, tile, mi_row + hbs, mi_col + hbs, mi_row_ori,
-                       mi_col_ori, output_enabled, subsize, top_bsize);
-      for (i = 0; i < MAX_MB_PLANE; i++) {
-        xd->plane[i].dst.buf = dst_buf1[i];
-        xd->plane[i].dst.stride = dst_stride1[i];
-        vp9_build_masked_inter_predictor_complex(xd,
-                                                 dst_buf1[i], dst_stride1[i],
-                                                 dst_buf2[i], dst_stride2[i],
-                                                 &xd->plane[i],
-                                                 mi_row, mi_col,
-                                                 mi_row_ori, mi_col_ori,
-                                                 bsize, top_bsize,
-                                                 PARTITION_HORZ);
-      }
-      for (i = 0; i < MAX_MB_PLANE; i++) {
-        xd->plane[i].dst.buf = dst_buf[i];
-        xd->plane[i].dst.stride = dst_stride[i];
-        vp9_build_masked_inter_predictor_complex(xd,
-                                                 dst_buf[i], dst_stride[i],
-                                                 dst_buf1[i], dst_stride1[i],
-                                                 &xd->plane[i],
-                                                 mi_row, mi_col,
-                                                 mi_row_ori, mi_col_ori,
-                                                 bsize, top_bsize,
-                                                 PARTITION_VERT);
-      }
-      break;
-#endif
-    default:
-      assert(0);
-  }
-#if CONFIG_EXT_PARTITION
-  if (bsize < top_bsize)
-    update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize, partition);
-#else
-  if (bsize < top_bsize && (partition != PARTITION_SPLIT || bsize == BLOCK_8X8))
-    update_partition_context(xd, mi_row, mi_col, subsize, bsize);
-#endif
-}
-
-#if CONFIG_VP9_HIGHBITDEPTH
-static void predict_sb_complex_highbd(VP9_COMP *cpi, const TileInfo *const tile,
-                                      int mi_row, int mi_col,
-                                      int mi_row_ori, int mi_col_ori,
-                                      int output_enabled, BLOCK_SIZE bsize,
-                                      BLOCK_SIZE top_bsize,
-                                      uint8_t *dst_buf[3], int dst_stride[3],
-                                      PC_TREE *pc_tree) {
-  VP9_COMMON *const cm = &cpi->common;
-  MACROBLOCK *const x = &cpi->mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
-
-  const int bsl = b_width_log2_lookup[bsize], hbs = (1 << bsl) / 4;
-  PARTITION_TYPE partition;
-  BLOCK_SIZE subsize;
-#if CONFIG_EXT_PARTITION
-  BLOCK_SIZE bsize2 = get_subsize(bsize, PARTITION_SPLIT);
-#endif
-
-  int i, ctx;
-
-  DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp_buf1,
-                        MAX_MB_PLANE * MAXTXLEN * MAXTXLEN * sizeof(uint16_t));
-  DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp_buf2,
-                        MAX_MB_PLANE * MAXTXLEN * MAXTXLEN * sizeof(uint16_t));
-  DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp_buf3,
-                        MAX_MB_PLANE * MAXTXLEN * MAXTXLEN * sizeof(uint16_t));
-  uint8_t *dst_buf1[3] = {
-    CONVERT_TO_BYTEPTR(tmp_buf1),
-    CONVERT_TO_BYTEPTR(tmp_buf1 + MAXTXLEN * MAXTXLEN * sizeof(uint16_t)),
-    CONVERT_TO_BYTEPTR(tmp_buf1 + 2 * MAXTXLEN * MAXTXLEN * sizeof(uint16_t))};
-  uint8_t *dst_buf2[3] = {
-    CONVERT_TO_BYTEPTR(tmp_buf2),
-    CONVERT_TO_BYTEPTR(tmp_buf2 + MAXTXLEN * MAXTXLEN * sizeof(uint16_t)),
-    CONVERT_TO_BYTEPTR(tmp_buf2 + 2 * MAXTXLEN * MAXTXLEN * sizeof(uint16_t))};
-  uint8_t *dst_buf3[3] = {
-    CONVERT_TO_BYTEPTR(tmp_buf3),
-    CONVERT_TO_BYTEPTR(tmp_buf3 + MAXTXLEN * MAXTXLEN * sizeof(uint16_t)),
-    CONVERT_TO_BYTEPTR(tmp_buf3 + 2 * MAXTXLEN * MAXTXLEN * sizeof(uint16_t))};
-
-  int dst_stride1[3] = {MAXTXLEN, MAXTXLEN, MAXTXLEN};
-  int dst_stride2[3] = {MAXTXLEN, MAXTXLEN, MAXTXLEN};
-  int dst_stride3[3] = {MAXTXLEN, MAXTXLEN, MAXTXLEN};
-
-  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
-    return;
 
-  if (bsize >= BLOCK_8X8) {
-    ctx = partition_plane_context(xd, mi_row, mi_col, bsize);
-    subsize = get_subsize(bsize, pc_tree->partitioning);
-  } else {
-    ctx = 0;
-    subsize = BLOCK_4X4;
-  }
-  partition = partition_lookup[bsl][subsize];
-#if CONFIG_EXT_PARTITION
-  if (bsize > BLOCK_8X8)
-    partition = pc_tree->partitioning;
-#endif
-  if (output_enabled && bsize != BLOCK_4X4 && bsize < top_bsize)
-      cm->counts.partition[ctx][partition]++;
-
-  for (i = 0; i < MAX_MB_PLANE; i++) {
-    xd->plane[i].dst.buf = dst_buf[i];
-    xd->plane[i].dst.stride = dst_stride[i];
-  }
+      predict_b_extend(cpi, tile, 0, mi_row, mi_col, mi_row, mi_col,
+                       mi_row_top, mi_col_top, dst_buf, dst_stride,
+                       bsize2, top_bsize, bsize2, output_enabled, 0, 0);
+      extend_all(cpi, tile, 0, bsize2, top_bsize, mi_row, mi_col,
+                 mi_row_top, mi_col_top, output_enabled, dst_buf, dst_stride);
+
+      predict_b_extend(cpi, tile, 0, mi_row + hbs, mi_col, mi_row + hbs, mi_col,
+                       mi_row_top, mi_col_top, dst_buf1, dst_stride1,
+                       bsize2, top_bsize, bsize2, output_enabled, 0, 0);
+      extend_all(cpi, tile, 0, bsize2, top_bsize, mi_row + hbs, mi_col,
+                 mi_row_top, mi_col_top, output_enabled, dst_buf1, dst_stride1);
+
+      predict_b_extend(cpi, tile, 0, mi_row, mi_col + hbs, mi_row, mi_col + hbs,
+                       mi_row_top, mi_col_top, dst_buf2, dst_stride2,
+                       subsize, top_bsize, subsize, output_enabled, 0, 0);
+      if (bsize < top_bsize)
+        extend_all(cpi, tile, 0, subsize, top_bsize, mi_row, mi_col + hbs,
+                   mi_row_top, mi_col_top, output_enabled,
+                   dst_buf2, dst_stride2);
+      else
+        extend_dir(cpi, tile, 0, subsize, top_bsize, mi_row, mi_col + hbs,
+                   mi_row_top, mi_col_top, output_enabled,
+                   dst_buf2, dst_stride2, 2);
 
-  switch (partition) {
-    case PARTITION_NONE:
-      assert(bsize < top_bsize);
-      predict_b_extend(cpi, tile, mi_row, mi_col, mi_row_ori, mi_col_ori,
-                       output_enabled, bsize, top_bsize);
-      break;
-    case PARTITION_HORZ:
-      if (bsize > BLOCK_8X8) {
-        predict_b_extend(cpi, tile, mi_row, mi_col, mi_row_ori, mi_col_ori,
-                         output_enabled, subsize, top_bsize);
-      } else {
-        predict_b_sub8x8_extend(cpi, tile, mi_row, mi_col,
-                                mi_row_ori, mi_col_ori, output_enabled,
-                                bsize, top_bsize, PARTITION_HORZ);
-      }
-      if (mi_row + hbs < cm->mi_rows && bsize > BLOCK_8X8) {
-        for (i = 0; i < MAX_MB_PLANE; i++) {
-          xd->plane[i].dst.buf = dst_buf1[i];
-          xd->plane[i].dst.stride = dst_stride1[i];
-        }
-        predict_b_extend(cpi, tile, mi_row + hbs, mi_col,
-                         mi_row_ori, mi_col_ori, output_enabled,
-                         subsize, top_bsize);
-        for (i = 0; i < MAX_MB_PLANE; i++) {
-          xd->plane[i].dst.buf = dst_buf[i];
-          xd->plane[i].dst.stride = dst_stride[i];
-          vp9_build_masked_inter_predictor_complex(
-              xd,
-              dst_buf[i], dst_stride[i],
-              dst_buf1[i], dst_stride1[i],
-              &xd->plane[i],
-              mi_row, mi_col,
-              mi_row_ori, mi_col_ori,
-              bsize, top_bsize,
-              PARTITION_HORZ);
-        }
-      }
-      break;
-    case PARTITION_VERT:
-      if (bsize > BLOCK_8X8) {
-        predict_b_extend(cpi, tile, mi_row, mi_col, mi_row_ori, mi_col_ori,
-                         output_enabled, subsize, top_bsize);
-      } else {
-        predict_b_sub8x8_extend(cpi, tile, mi_row, mi_col,
-                                mi_row_ori, mi_col_ori, output_enabled,
-                                bsize, top_bsize, PARTITION_VERT);
-      }
-      if (mi_col + hbs < cm->mi_cols && bsize > BLOCK_8X8) {
-        for (i = 0; i < MAX_MB_PLANE; i++) {
-          xd->plane[i].dst.buf =  dst_buf1[i];
-          xd->plane[i].dst.stride = dst_stride1[i];
-        }
-        predict_b_extend(cpi, tile, mi_row, mi_col + hbs,
-                         mi_row_ori, mi_col_ori, output_enabled,
-                         subsize, top_bsize);
-        for (i = 0; i < MAX_MB_PLANE; i++) {
-          xd->plane[i].dst.buf = dst_buf[i];
-          xd->plane[i].dst.stride = dst_stride[i];
-          vp9_build_masked_inter_predictor_complex(
-              xd,
-              dst_buf[i], dst_stride[i],
-              dst_buf1[i], dst_stride1[i],
-              &xd->plane[i],
-              mi_row, mi_col,
-              mi_row_ori, mi_col_ori,
-              bsize, top_bsize,
-              PARTITION_VERT);
-        }
-      }
-      break;
-    case PARTITION_SPLIT:
-      if (bsize == BLOCK_8X8) {
-        predict_b_sub8x8_extend(cpi, tile, mi_row, mi_col,
-                                mi_row_ori, mi_col_ori, output_enabled,
-                                bsize, top_bsize, PARTITION_SPLIT);
-      } else {
-        predict_sb_complex_highbd(cpi, tile, mi_row, mi_col,
-                                  mi_row_ori, mi_col_ori, output_enabled,
-                                  subsize, top_bsize, dst_buf, dst_stride,
-                                  pc_tree->split[0]);
-        if (mi_row < cm->mi_rows && mi_col + hbs < cm->mi_cols)
-          predict_sb_complex_highbd(cpi, tile, mi_row, mi_col + hbs,
-                                    mi_row_ori, mi_col_ori, output_enabled,
-                                    subsize, top_bsize, dst_buf1, dst_stride1,
-                                    pc_tree->split[1]);
-        if (mi_row + hbs < cm->mi_rows && mi_col < cm->mi_cols)
-          predict_sb_complex_highbd(cpi, tile, mi_row + hbs, mi_col,
-                                    mi_row_ori, mi_col_ori, output_enabled,
-                                    subsize, top_bsize, dst_buf2, dst_stride2,
-                                    pc_tree->split[2]);
-        if (mi_row + hbs < cm->mi_rows && mi_col + hbs < cm->mi_cols)
-          predict_sb_complex_highbd(cpi, tile, mi_row + hbs, mi_col + hbs,
-                                    mi_row_ori, mi_col_ori, output_enabled,
-                                    subsize, top_bsize, dst_buf3, dst_stride3,
-                                    pc_tree->split[3]);
-        for (i = 0; i < MAX_MB_PLANE; i++) {
-          if (mi_row < cm->mi_rows && mi_col + hbs < cm->mi_cols) {
-            vp9_build_masked_inter_predictor_complex(xd,
-                                                     dst_buf[i],
-                                                     dst_stride[i],
-                                                     dst_buf1[i],
-                                                     dst_stride1[i],
-                                                     &xd->plane[i],
-                                                     mi_row, mi_col,
-                                                     mi_row_ori,
-                                                     mi_col_ori,
-                                                     bsize, top_bsize,
-                                                     PARTITION_VERT);
-            if (mi_row + hbs < cm->mi_rows) {
-              vp9_build_masked_inter_predictor_complex(xd,
-                                                       dst_buf2[i],
-                                                       dst_stride2[i],
-                                                       dst_buf3[i],
-                                                       dst_stride3[i],
-                                                       &xd->plane[i],
-                                                       mi_row, mi_col,
-                                                       mi_row_ori,
-                                                       mi_col_ori,
-                                                       bsize, top_bsize,
-                                                       PARTITION_VERT);
-              vp9_build_masked_inter_predictor_complex(xd,
-                                                       dst_buf[i],
-                                                       dst_stride[i],
-                                                       dst_buf2[i],
-                                                       dst_stride2[i],
-                                                       &xd->plane[i],
-                                                       mi_row, mi_col,
-                                                       mi_row_ori,
-                                                       mi_col_ori,
-                                                       bsize, top_bsize,
-                                                       PARTITION_HORZ);
-            }
-          } else if (mi_row + hbs < cm->mi_rows && mi_col < cm->mi_cols) {
-            vp9_build_masked_inter_predictor_complex(xd,
-                                                     dst_buf[i],
-                                                     dst_stride[i],
-                                                     dst_buf2[i],
-                                                     dst_stride2[i],
-                                                     &xd->plane[i],
-                                                     mi_row, mi_col,
-                                                     mi_row_ori,
-                                                     mi_col_ori,
-                                                     bsize, top_bsize,
-                                                     PARTITION_HORZ);
-          }
-        }
-      }
-      break;
-#if CONFIG_EXT_PARTITION
-    case PARTITION_HORZ_A:
-      predict_b_extend(cpi, tile, mi_row, mi_col, mi_row_ori,
-                       mi_col_ori, output_enabled, bsize2, top_bsize);
-      for (i = 0; i < MAX_MB_PLANE; i++) {
-        xd->plane[i].dst.buf = dst_buf1[i];
-        xd->plane[i].dst.stride = dst_stride1[i];
-      }
-      predict_b_extend(cpi, tile, mi_row, mi_col + hbs,
-                       mi_row_ori, mi_col_ori, output_enabled, bsize2,
-                       top_bsize);
-      for (i = 0; i < MAX_MB_PLANE; i++) {
-        xd->plane[i].dst.buf = dst_buf2[i];
-        xd->plane[i].dst.stride = dst_stride2[i];
-      }
-      predict_b_extend(cpi, tile, mi_row + hbs, mi_col,
-                       mi_row_ori, mi_col_ori, output_enabled, subsize,
-                       top_bsize);
-      for (i = 0; i < MAX_MB_PLANE; i++) {
-        xd->plane[i].dst.buf = dst_buf[i];
-        xd->plane[i].dst.stride = dst_stride[i];
-        vp9_build_masked_inter_predictor_complex(xd,
-                                                 dst_buf[i], dst_stride[i],
-                                                 dst_buf1[i], dst_stride1[i],
-                                                 &xd->plane[i],
-                                                 mi_row, mi_col,
-                                                 mi_row_ori, mi_col_ori,
-                                                 bsize, top_bsize,
-                                                 PARTITION_VERT);
-      }
-      for (i = 0; i < MAX_MB_PLANE; i++) {
-        vp9_build_masked_inter_predictor_complex(xd,
-                                                 dst_buf[i], dst_stride[i],
-                                                 dst_buf2[i], dst_stride2[i],
-                                                 &xd->plane[i],
-                                                 mi_row, mi_col,
-                                                 mi_row_ori, mi_col_ori,
-                                                 bsize, top_bsize,
-                                                 PARTITION_HORZ);
-      }
-      break;
-    case PARTITION_VERT_A:
-      predict_b_extend(cpi, tile, mi_row, mi_col, mi_row_ori, mi_col_ori,
-                       output_enabled, bsize2, top_bsize);
-      for (i = 0; i < MAX_MB_PLANE; i++) {
-        xd->plane[i].dst.buf = dst_buf1[i];
-        xd->plane[i].dst.stride = dst_stride1[i];
-      }
-      predict_b_extend(cpi, tile, mi_row + hbs, mi_col,
-                       mi_row_ori, mi_col_ori, output_enabled, bsize2,
-                       top_bsize);
-      for (i = 0; i < MAX_MB_PLANE; i++) {
-        xd->plane[i].dst.buf = dst_buf2[i];
-        xd->plane[i].dst.stride = dst_stride2[i];
-      }
-      predict_b_extend(cpi, tile, mi_row, mi_col + hbs, mi_row_ori,
-                       mi_col_ori, output_enabled, subsize, top_bsize);
       for (i = 0; i < MAX_MB_PLANE; i++) {
         xd->plane[i].dst.buf = dst_buf[i];
         xd->plane[i].dst.stride = dst_stride[i];
@@ -5614,9 +5468,9 @@ static void predict_sb_complex_highbd(VP9_COMP *cpi, const TileInfo *const tile,
                                                  dst_buf1[i], dst_stride1[i],
                                                  &xd->plane[i],
                                                  mi_row, mi_col,
-                                                 mi_row_ori, mi_col_ori,
+                                                 mi_row_top, mi_col_top,
                                                  bsize, top_bsize,
-                                                 PARTITION_HORZ);
+                                                 PARTITION_HORZ, i);
       }
       for (i = 0; i < MAX_MB_PLANE; i++) {
         vp9_build_masked_inter_predictor_complex(xd,
@@ -5624,26 +5478,37 @@ static void predict_sb_complex_highbd(VP9_COMP *cpi, const TileInfo *const tile,
                                                  dst_buf2[i], dst_stride2[i],
                                                  &xd->plane[i],
                                                  mi_row, mi_col,
-                                                 mi_row_ori, mi_col_ori,
+                                                 mi_row_top, mi_col_top,
                                                  bsize, top_bsize,
-                                                 PARTITION_VERT);
+                                                 PARTITION_VERT, i);
       }
       break;
     case PARTITION_HORZ_B:
-      predict_b_extend(cpi, tile, mi_row, mi_col, mi_row_ori,
-                       mi_col_ori, output_enabled, subsize, top_bsize);
-      for (i = 0; i < MAX_MB_PLANE; i++) {
-        xd->plane[i].dst.buf = dst_buf1[i];
-        xd->plane[i].dst.stride = dst_stride1[i];
-      }
-      predict_b_extend(cpi, tile, mi_row + hbs, mi_col, mi_row_ori,
-                       mi_col_ori, output_enabled, bsize2, top_bsize);
-      for (i = 0; i < MAX_MB_PLANE; i++) {
-        xd->plane[i].dst.buf = dst_buf2[i];
-        xd->plane[i].dst.stride = dst_stride2[i];
-      }
-      predict_b_extend(cpi, tile, mi_row + hbs, mi_col + hbs, mi_row_ori,
-                       mi_col_ori, output_enabled, bsize2, top_bsize);
+
+      predict_b_extend(cpi, tile, 0, mi_row, mi_col, mi_row, mi_col,
+                       mi_row_top, mi_col_top, dst_buf, dst_stride,
+                       subsize, top_bsize, subsize, output_enabled, 0, 0);
+      if (bsize < top_bsize)
+        extend_all(cpi, tile, 0, subsize, top_bsize, mi_row, mi_col,
+                   mi_row_top, mi_col_top, output_enabled, dst_buf, dst_stride);
+      else
+        extend_dir(cpi, tile, 0, subsize, top_bsize, mi_row, mi_col,
+                   mi_row_top, mi_col_top, output_enabled,
+                   dst_buf, dst_stride, 0);
+
+      predict_b_extend(cpi, tile, 0, mi_row + hbs, mi_col, mi_row + hbs, mi_col,
+                       mi_row_top, mi_col_top, dst_buf1, dst_stride1,
+                       bsize2, top_bsize, bsize2, output_enabled, 0, 0);
+      extend_all(cpi, tile, 0, bsize2, top_bsize, mi_row + hbs, mi_col,
+                 mi_row_top, mi_col_top, output_enabled, dst_buf1, dst_stride1);
+
+      predict_b_extend(cpi, tile, 0, mi_row + hbs, mi_col + hbs,
+                       mi_row + hbs, mi_col + hbs, mi_row_top, mi_col_top,
+                       dst_buf2, dst_stride2, bsize2, top_bsize, bsize2,
+                       output_enabled, 0, 0);
+      extend_all(cpi, tile, 0, bsize2, top_bsize, mi_row + hbs, mi_col + hbs,
+                 mi_row_top, mi_col_top, output_enabled, dst_buf2, dst_stride2);
+
       for (i = 0; i < MAX_MB_PLANE; i++) {
         xd->plane[i].dst.buf = dst_buf1[i];
         xd->plane[i].dst.stride = dst_stride1[i];
@@ -5652,9 +5517,9 @@ static void predict_sb_complex_highbd(VP9_COMP *cpi, const TileInfo *const tile,
                                                  dst_buf2[i], dst_stride2[i],
                                                  &xd->plane[i],
                                                  mi_row, mi_col,
-                                                 mi_row_ori, mi_col_ori,
+                                                 mi_row_top, mi_col_top,
                                                  bsize, top_bsize,
-                                                 PARTITION_VERT);
+                                                 PARTITION_VERT, i);
       }
       for (i = 0; i < MAX_MB_PLANE; i++) {
         xd->plane[i].dst.buf = dst_buf[i];
@@ -5664,26 +5529,37 @@ static void predict_sb_complex_highbd(VP9_COMP *cpi, const TileInfo *const tile,
                                                  dst_buf1[i], dst_stride1[i],
                                                  &xd->plane[i],
                                                  mi_row, mi_col,
-                                                 mi_row_ori, mi_col_ori,
+                                                 mi_row_top, mi_col_top,
                                                  bsize, top_bsize,
-                                                 PARTITION_HORZ);
+                                                 PARTITION_HORZ, i);
       }
       break;
     case PARTITION_VERT_B:
-      predict_b_extend(cpi, tile, mi_row, mi_col, mi_row_ori,
-                       mi_col_ori, output_enabled, subsize, top_bsize);
-      for (i = 0; i < MAX_MB_PLANE; i++) {
-        xd->plane[i].dst.buf = dst_buf1[i];
-        xd->plane[i].dst.stride = dst_stride1[i];
-      }
-      predict_b_extend(cpi, tile, mi_row, mi_col + hbs, mi_row_ori,
-                       mi_col_ori, output_enabled, bsize2, top_bsize);
-      for (i = 0; i < MAX_MB_PLANE; i++) {
-        xd->plane[i].dst.buf = dst_buf2[i];
-        xd->plane[i].dst.stride = dst_stride2[i];
-      }
-      predict_b_extend(cpi, tile, mi_row + hbs, mi_col + hbs, mi_row_ori,
-                       mi_col_ori, output_enabled, subsize, top_bsize);
+
+      predict_b_extend(cpi, tile, 0, mi_row, mi_col, mi_row, mi_col,
+                       mi_row_top, mi_col_top, dst_buf, dst_stride,
+                       subsize, top_bsize, subsize, output_enabled, 0, 0);
+      if (bsize < top_bsize)
+        extend_all(cpi, tile, 0, subsize, top_bsize, mi_row, mi_col,
+                   mi_row_top, mi_col_top, output_enabled, dst_buf, dst_stride);
+      else
+        extend_dir(cpi, tile, 0, subsize, top_bsize, mi_row, mi_col,
+                   mi_row_top, mi_col_top, output_enabled,
+                   dst_buf, dst_stride, 3);
+
+      predict_b_extend(cpi, tile, 0, mi_row, mi_col + hbs, mi_row, mi_col + hbs,
+                       mi_row_top, mi_col_top, dst_buf1, dst_stride1,
+                       bsize2, top_bsize, bsize2, output_enabled, 0, 0);
+      extend_all(cpi, tile, 0, bsize2, top_bsize, mi_row, mi_col + hbs,
+                 mi_row_top, mi_col_top, output_enabled, dst_buf1, dst_stride1);
+
+      predict_b_extend(cpi, tile, 0, mi_row + hbs, mi_col + hbs,
+                       mi_row + hbs, mi_col + hbs, mi_row_top, mi_col_top,
+                       dst_buf2, dst_stride2, bsize2, top_bsize, bsize2,
+                       output_enabled, 0, 0);
+      extend_all(cpi, tile, 0, bsize2, top_bsize, mi_row + hbs, mi_col + hbs,
+                 mi_row_top, mi_col_top, output_enabled, dst_buf2, dst_stride2);
+
       for (i = 0; i < MAX_MB_PLANE; i++) {
         xd->plane[i].dst.buf = dst_buf1[i];
         xd->plane[i].dst.stride = dst_stride1[i];
@@ -5692,9 +5568,9 @@ static void predict_sb_complex_highbd(VP9_COMP *cpi, const TileInfo *const tile,
                                                  dst_buf2[i], dst_stride2[i],
                                                  &xd->plane[i],
                                                  mi_row, mi_col,
-                                                 mi_row_ori, mi_col_ori,
+                                                 mi_row_top, mi_col_top,
                                                  bsize, top_bsize,
-                                                 PARTITION_HORZ);
+                                                 PARTITION_HORZ, i);
       }
       for (i = 0; i < MAX_MB_PLANE; i++) {
         xd->plane[i].dst.buf = dst_buf[i];
@@ -5704,15 +5580,17 @@ static void predict_sb_complex_highbd(VP9_COMP *cpi, const TileInfo *const tile,
                                                  dst_buf1[i], dst_stride1[i],
                                                  &xd->plane[i],
                                                  mi_row, mi_col,
-                                                 mi_row_ori, mi_col_ori,
+                                                 mi_row_top, mi_col_top,
                                                  bsize, top_bsize,
-                                                 PARTITION_VERT);
+                                                 PARTITION_VERT, i);
       }
       break;
 #endif
     default:
       assert(0);
   }
+
+
 #if CONFIG_EXT_PARTITION
   if (bsize < top_bsize)
     update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize, partition);
@@ -5721,7 +5599,6 @@ static void predict_sb_complex_highbd(VP9_COMP *cpi, const TileInfo *const tile,
     update_partition_context(xd, mi_row, mi_col, subsize, bsize);
 #endif
 }
-#endif  // CONFIG_VP9_HIGHBITDEPTH
 
 static void rd_supertx_sb(VP9_COMP *cpi, const TileInfo *const tile,
                           int mi_row, int mi_col, BLOCK_SIZE bsize,
@@ -5753,18 +5630,8 @@ static void rd_supertx_sb(VP9_COMP *cpi, const TileInfo *const tile,
     dst_buf[plane] = xd->plane[plane].dst.buf;
     dst_stride[plane] = xd->plane[plane].dst.stride;
   }
-#if CONFIG_VP9_HIGHBITDEPTH
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    predict_sb_complex_highbd(cpi, tile, mi_row, mi_col, mi_row, mi_col,
-                              0, bsize, bsize, dst_buf, dst_stride, pc_tree);
-  } else {
-    predict_sb_complex(cpi, tile, mi_row, mi_col, mi_row, mi_col,
-                       0, bsize, bsize, dst_buf, dst_stride, pc_tree);
-  }
-#else
-    predict_sb_complex(cpi, tile, mi_row, mi_col, mi_row, mi_col,
-                       0, bsize, bsize, dst_buf, dst_stride, pc_tree);
-#endif  // CONFIG_VP9_HIGHBITDEPTH
+  predict_sb_complex(cpi, tile, mi_row, mi_col, mi_row, mi_col,
+                     0, bsize, bsize, dst_buf, dst_stride, pc_tree);
 
   set_offsets(cpi, tile, mi_row, mi_col, bsize);
 #if CONFIG_EXT_TX