From 938b8dfc73d1cd1c3ce1c4a66e64147a55e163f3 Mon Sep 17 00:00:00 2001
From: Geza Lore
Date: Fri, 4 Mar 2016 15:55:48 +0000
Subject: [PATCH] Extend convolution functions to 128x128 for ext-partition.

Change-Id: I7f7e26cd1d58eb38417200550c6fbf4108c9f942
---
 test/convolve_test.cc                      | 128 ++++++++++++++++-----
 test/masked_sad_test.cc                    |   2 -
 test/masked_variance_test.cc               |   2 -
 vpx_dsp/vpx_convolve.c                     |  53 +++++----
 vpx_dsp/vpx_convolve.h                     |  18 +++
 vpx_dsp/vpx_dsp_common.h                   |   6 +
 vpx_dsp/vpx_dsp_rtcd_defs.pl               |  78 ++++++------
 vpx_dsp/x86/convolve.h                     |  58 ++++++----
 vpx_dsp/x86/vpx_convolve_copy_sse2.asm     | 118 ++++++++++++++++++-
 vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c |  37 ++++--
 10 files changed, 366 insertions(+), 134 deletions(-)

diff --git a/test/convolve_test.cc b/test/convolve_test.cc
index 12022be52..0e54c4013 100644
--- a/test/convolve_test.cc
+++ b/test/convolve_test.cc
@@ -28,7 +28,7 @@
 
 namespace {
 
-static const unsigned int kMaxDimension = 64;
+static const unsigned int kMaxDimension = MAX_CU_SIZE;
 
 typedef void (*ConvolveFunc)(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
@@ -102,7 +102,7 @@ void filter_block2d_8_c(const uint8_t *src_ptr,
   //                            = 23
   // and filter_max_width = 16
   //
-  uint8_t intermediate_buffer[71 * kMaxDimension];
+  uint8_t intermediate_buffer[(kMaxDimension+8) * kMaxDimension];
   const int intermediate_next_stride = 1 - intermediate_height * output_width;
 
   // Horizontal pass (src -> transposed intermediate).
@@ -183,9 +183,9 @@ void filter_average_block2d_8_c(const uint8_t *src_ptr,
 
   assert(output_width <= kMaxDimension);
   assert(output_height <= kMaxDimension);
-  filter_block2d_8_c(src_ptr, src_stride, HFilter, VFilter, tmp, 64,
+  filter_block2d_8_c(src_ptr, src_stride, HFilter, VFilter, tmp, kMaxDimension,
                      output_width, output_height);
-  block2d_average_c(tmp, 64, dst_ptr, dst_stride,
+  block2d_average_c(tmp, kMaxDimension, dst_ptr, dst_stride,
                     output_width, output_height);
 }
 
@@ -214,7 +214,7 @@ void highbd_filter_block2d_8_c(const uint16_t *src_ptr,
    *                            = 23
    * and filter_max_width = 16
    */
-  uint16_t intermediate_buffer[71 * kMaxDimension];
+  uint16_t intermediate_buffer[(kMaxDimension+8) * kMaxDimension];
   const int intermediate_next_stride = 1 - intermediate_height * output_width;
 
   // Horizontal pass (src -> transposed intermediate).
@@ -302,9 +302,10 @@ void highbd_filter_average_block2d_8_c(const uint16_t *src_ptr,
 
   assert(output_width <= kMaxDimension);
   assert(output_height <= kMaxDimension);
-  highbd_filter_block2d_8_c(src_ptr, src_stride, HFilter, VFilter, tmp, 64,
+  highbd_filter_block2d_8_c(src_ptr, src_stride, HFilter, VFilter,
+                            tmp, kMaxDimension,
                             output_width, output_height, bd);
-  highbd_block2d_average_c(tmp, 64, dst_ptr, dst_stride,
+  highbd_block2d_average_c(tmp, kMaxDimension, dst_ptr, dst_stride,
                            output_width, output_height);
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
@@ -351,7 +352,7 @@ class ConvolveTest : public ::testing::TestWithParam {
 
  protected:
   static const int kDataAlignment = 16;
-  static const int kOuterBlockSize = 256;
+  static const int kOuterBlockSize = 4*kMaxDimension;
   static const int kInputStride = kOuterBlockSize;
   static const int kOutputStride = kOuterBlockSize;
   static const int kInputBufferSize = kOuterBlockSize * kOuterBlockSize;
@@ -414,7 +415,8 @@ class ConvolveTest : public ::testing::TestWithParam {
   void CopyOutputToRef() {
     memcpy(output_ref_, output_, kOutputBufferSize);
 #if CONFIG_VP9_HIGHBITDEPTH
-    memcpy(output16_ref_, output16_, kOutputBufferSize);
+    memcpy(output16_ref_, output16_,
+           kOutputBufferSize * sizeof(*output16_ref_));
 #endif
   }
 
@@ -426,41 +428,41 @@ class ConvolveTest : public ::testing::TestWithParam {
   }
 
   uint8_t *input() const {
+    const int index = BorderTop() * kOuterBlockSize + BorderLeft();
 #if CONFIG_VP9_HIGHBITDEPTH
     if (UUT_->use_highbd_ == 0) {
-      return input_ + BorderTop() * kOuterBlockSize + BorderLeft();
+      return input_ + index;
     } else {
-      return CONVERT_TO_BYTEPTR(input16_ + BorderTop() * kOuterBlockSize +
-                                BorderLeft());
+      return CONVERT_TO_BYTEPTR(input16_) + index;
     }
 #else
-    return input_ + BorderTop() * kOuterBlockSize + BorderLeft();
+    return input_ + index;
 #endif
   }
 
   uint8_t *output() const {
+    const int index = BorderTop() * kOuterBlockSize + BorderLeft();
 #if CONFIG_VP9_HIGHBITDEPTH
     if (UUT_->use_highbd_ == 0) {
-      return output_ + BorderTop() * kOuterBlockSize + BorderLeft();
+      return output_ + index;
     } else {
-      return CONVERT_TO_BYTEPTR(output16_ + BorderTop() * kOuterBlockSize +
-                                BorderLeft());
+      return CONVERT_TO_BYTEPTR(output16_ + index);
     }
 #else
-    return output_ + BorderTop() * kOuterBlockSize + BorderLeft();
+    return output_ + index;
 #endif
   }
 
   uint8_t *output_ref() const {
+    const int index = BorderTop() * kOuterBlockSize + BorderLeft();
 #if CONFIG_VP9_HIGHBITDEPTH
     if (UUT_->use_highbd_ == 0) {
-      return output_ref_ + BorderTop() * kOuterBlockSize + BorderLeft();
+      return output_ref_ + index;
     } else {
-      return CONVERT_TO_BYTEPTR(output16_ref_ + BorderTop() * kOuterBlockSize +
-                                BorderLeft());
+      return CONVERT_TO_BYTEPTR(output16_ref_ + index);
     }
 #else
-    return output_ref_ + BorderTop() * kOuterBlockSize + BorderLeft();
+    return output_ref_ + index;
 #endif
   }
@@ -1035,6 +1037,11 @@ const ConvolveFunctions convolve8_c(
     wrap_convolve8_vert_c_8, wrap_convolve8_avg_vert_c_8,
     wrap_convolve8_c_8, wrap_convolve8_avg_c_8, 8);
 INSTANTIATE_TEST_CASE_P(C_8, ConvolveTest, ::testing::Values(
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+    make_tuple(128, 64, &convolve8_c),
+    make_tuple(64, 128, &convolve8_c),
+    make_tuple(128, 128, &convolve8_c),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
     make_tuple(4, 4, &convolve8_c),
     make_tuple(8, 4, &convolve8_c),
     make_tuple(4, 8, &convolve8_c),
@@ -1057,6 +1064,11 @@ const ConvolveFunctions convolve10_c(
     wrap_convolve8_vert_c_10, wrap_convolve8_avg_vert_c_10,
     wrap_convolve8_c_10, wrap_convolve8_avg_c_10, 10);
 INSTANTIATE_TEST_CASE_P(C_10, ConvolveTest, ::testing::Values(
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+    make_tuple(128, 64, &convolve10_c),
+    make_tuple(64, 128, &convolve10_c),
+    make_tuple(128, 128, &convolve10_c),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
     make_tuple(4, 4, &convolve10_c),
     make_tuple(8, 4, &convolve10_c),
     make_tuple(4, 8, &convolve10_c),
@@ -1079,6 +1091,11 @@ const ConvolveFunctions convolve12_c(
     wrap_convolve8_vert_c_12, wrap_convolve8_avg_vert_c_12,
     wrap_convolve8_c_12, wrap_convolve8_avg_c_12, 12);
 INSTANTIATE_TEST_CASE_P(C_12, ConvolveTest, ::testing::Values(
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+    make_tuple(128, 64, &convolve12_c),
+    make_tuple(64, 128, &convolve12_c),
+    make_tuple(128, 128, &convolve12_c),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
     make_tuple(4, 4, &convolve12_c),
     make_tuple(8, 4, &convolve12_c),
     make_tuple(4, 8, &convolve12_c),
@@ -1105,6 +1122,11 @@ const ConvolveFunctions convolve8_c(
     vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0);
 
 INSTANTIATE_TEST_CASE_P(C, ConvolveTest, ::testing::Values(
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+    make_tuple(128, 64, &convolve8_c),
+    make_tuple(64, 128, &convolve8_c),
+    make_tuple(128, 128, &convolve8_c),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
     make_tuple(4, 4, &convolve8_c),
     make_tuple(8, 4, &convolve8_c),
     make_tuple(4, 8, &convolve8_c),
@@ -1158,7 +1180,12 @@ const ConvolveFunctions convolve12_sse2(
     wrap_convolve8_horiz_sse2_12, wrap_convolve8_avg_horiz_sse2_12,
     wrap_convolve8_vert_sse2_12, wrap_convolve8_avg_vert_sse2_12,
     wrap_convolve8_sse2_12, wrap_convolve8_avg_sse2_12, 12);
-INSTANTIATE_TEST_CASE_P(SSE2, ConvolveTest, ::testing::Values(
+INSTANTIATE_TEST_CASE_P(SSE2_8, ConvolveTest, ::testing::Values(
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+    make_tuple(128, 64, &convolve8_sse2),
+    make_tuple(64, 128, &convolve8_sse2),
+    make_tuple(128, 128, &convolve8_sse2),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
     make_tuple(4, 4, &convolve8_sse2),
     make_tuple(8, 4, &convolve8_sse2),
     make_tuple(4, 8, &convolve8_sse2),
@@ -1171,7 +1198,13 @@ INSTANTIATE_TEST_CASE_P(SSE2, ConvolveTest, ::testing::Values(
     make_tuple(32, 32, &convolve8_sse2),
     make_tuple(64, 32, &convolve8_sse2),
     make_tuple(32, 64, &convolve8_sse2),
-    make_tuple(64, 64, &convolve8_sse2),
+    make_tuple(64, 64, &convolve8_sse2)));
+INSTANTIATE_TEST_CASE_P(SSE2_10, ConvolveTest, ::testing::Values(
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+    make_tuple(128, 64, &convolve10_sse2),
+    make_tuple(64, 128, &convolve10_sse2),
+    make_tuple(128, 128, &convolve10_sse2),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
     make_tuple(4, 4, &convolve10_sse2),
     make_tuple(8, 4, &convolve10_sse2),
     make_tuple(4, 8, &convolve10_sse2),
@@ -1184,7 +1217,13 @@ INSTANTIATE_TEST_CASE_P(SSE2, ConvolveTest, ::testing::Values(
     make_tuple(32, 32, &convolve10_sse2),
     make_tuple(64, 32, &convolve10_sse2),
     make_tuple(32, 64, &convolve10_sse2),
-    make_tuple(64, 64, &convolve10_sse2),
+    make_tuple(64, 64, &convolve10_sse2)));
+INSTANTIATE_TEST_CASE_P(SSE2_12, ConvolveTest, ::testing::Values(
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+    make_tuple(128, 64, &convolve12_sse2),
+    make_tuple(64, 128, &convolve12_sse2),
+    make_tuple(128, 128, &convolve12_sse2),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
     make_tuple(4, 4, &convolve12_sse2),
     make_tuple(8, 4, &convolve12_sse2),
     make_tuple(4, 8, &convolve12_sse2),
@@ -1213,6 +1252,11 @@ const ConvolveFunctions convolve8_sse2(
     vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0);
 
 INSTANTIATE_TEST_CASE_P(SSE2, ConvolveTest, ::testing::Values(
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+    make_tuple(128, 64, &convolve8_sse2),
+    make_tuple(64, 128, &convolve8_sse2),
+    make_tuple(128, 128, &convolve8_sse2),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
     make_tuple(4, 4, &convolve8_sse2),
     make_tuple(8, 4, &convolve8_sse2),
     make_tuple(4, 8, &convolve8_sse2),
@@ -1237,9 +1281,14 @@ const ConvolveFunctions convolve8_ssse3(
     vpx_convolve8_ssse3, vpx_convolve8_avg_ssse3,
     vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c,
     vpx_scaled_vert_c, vpx_scaled_avg_vert_c,
-    vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0);
+    vpx_scaled_2d_ssse3, vpx_scaled_avg_2d_c, 0);
 
 INSTANTIATE_TEST_CASE_P(SSSE3, ConvolveTest, ::testing::Values(
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+    make_tuple(128, 64, &convolve8_ssse3),
+    make_tuple(64, 128, &convolve8_ssse3),
+    make_tuple(128, 128, &convolve8_ssse3),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
     make_tuple(4, 4, &convolve8_ssse3),
     make_tuple(8, 4, &convolve8_ssse3),
     make_tuple(4, 8, &convolve8_ssse3),
@@ -1266,6 +1315,11 @@ const ConvolveFunctions convolve8_avx2(
     vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0);
 
 INSTANTIATE_TEST_CASE_P(AVX2, ConvolveTest, ::testing::Values(
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+    make_tuple(128, 64, &convolve8_avx2),
+    make_tuple(64, 128, &convolve8_avx2),
+    make_tuple(128, 128, &convolve8_avx2),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
     make_tuple(4, 4, &convolve8_avx2),
     make_tuple(8, 4, &convolve8_avx2),
     make_tuple(4, 8, &convolve8_avx2),
@@ -1281,7 +1335,8 @@ INSTANTIATE_TEST_CASE_P(AVX2, ConvolveTest, ::testing::Values(
     make_tuple(64, 64, &convolve8_avx2)));
 #endif  // HAVE_AVX2 && HAVE_SSSE3
 
-#if HAVE_NEON
+// TODO(any): Make NEON versions support 128x128 128x64 64x128 block sizes
+#if HAVE_NEON && !(CONFIG_VP10 && CONFIG_EXT_PARTITION)
 #if HAVE_NEON_ASM
 const ConvolveFunctions convolve8_neon(
     vpx_convolve_copy_neon, vpx_convolve_avg_neon,
@@ -1303,6 +1358,11 @@ const ConvolveFunctions convolve8_neon(
 #endif  // HAVE_NEON_ASM
 
 INSTANTIATE_TEST_CASE_P(NEON, ConvolveTest, ::testing::Values(
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+    make_tuple(128, 64, &convolve8_neon),
+    make_tuple(64, 128, &convolve8_neon),
+    make_tuple(128, 128, &convolve8_neon),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
     make_tuple(4, 4, &convolve8_neon),
     make_tuple(8, 4, &convolve8_neon),
     make_tuple(4, 8, &convolve8_neon),
@@ -1318,7 +1378,8 @@ INSTANTIATE_TEST_CASE_P(NEON, ConvolveTest, ::testing::Values(
     make_tuple(64, 64, &convolve8_neon)));
 #endif  // HAVE_NEON
 
-#if HAVE_DSPR2
+// TODO(any): Make DSPR2 versions support 128x128 128x64 64x128 block sizes
+#if HAVE_DSPR2 && !(CONFIG_VP10 && CONFIG_EXT_PARTITION)
 const ConvolveFunctions convolve8_dspr2(
     vpx_convolve_copy_dspr2, vpx_convolve_avg_dspr2,
     vpx_convolve8_horiz_dspr2, vpx_convolve8_avg_horiz_dspr2,
@@ -1329,6 +1390,11 @@ const ConvolveFunctions convolve8_dspr2(
     vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0);
 
 INSTANTIATE_TEST_CASE_P(DSPR2, ConvolveTest, ::testing::Values(
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+    make_tuple(128, 64, &convolve8_dspr2),
+    make_tuple(64, 128, &convolve8_dspr2),
+    make_tuple(128, 128, &convolve8_dspr2),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
     make_tuple(4, 4, &convolve8_dspr2),
     make_tuple(8, 4, &convolve8_dspr2),
     make_tuple(4, 8, &convolve8_dspr2),
@@ -1344,7 +1410,8 @@ INSTANTIATE_TEST_CASE_P(DSPR2, ConvolveTest, ::testing::Values(
     make_tuple(64, 64, &convolve8_dspr2)));
 #endif
 
-#if HAVE_MSA
+// TODO(any): Make MSA versions support 128x128 128x64 64x128 block sizes
+#if HAVE_MSA && !(CONFIG_VP10 && CONFIG_EXT_PARTITION)
 const ConvolveFunctions convolve8_msa(
     vpx_convolve_copy_msa, vpx_convolve_avg_msa,
     vpx_convolve8_horiz_msa, vpx_convolve8_avg_horiz_msa,
@@ -1355,6 +1422,11 @@ const ConvolveFunctions convolve8_msa(
     vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0);
 
 INSTANTIATE_TEST_CASE_P(MSA, ConvolveTest, ::testing::Values(
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+    make_tuple(128, 64, &convolve8_msa),
+    make_tuple(64, 128, &convolve8_msa),
+    make_tuple(128, 128, &convolve8_msa),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
     make_tuple(4, 4, &convolve8_msa),
     make_tuple(8, 4, &convolve8_msa),
     make_tuple(4, 8, &convolve8_msa),
diff --git a/test/masked_sad_test.cc b/test/masked_sad_test.cc
index d7c6fcec4..34223eac8 100644
--- a/test/masked_sad_test.cc
+++ b/test/masked_sad_test.cc
@@ -22,8 +22,6 @@
 #include "./vpx_dsp_rtcd.h"
 #include "vpx/vpx_integer.h"
 
-#define MAX_CU_SIZE 128
-
 using libvpx_test::ACMRandom;
 
 namespace {
diff --git a/test/masked_variance_test.cc b/test/masked_variance_test.cc
index c312899a6..1f8bf1e22 100644
--- a/test/masked_variance_test.cc
+++ b/test/masked_variance_test.cc
@@ -25,8 +25,6 @@
 #include "vpx_dsp/vpx_filter.h"
 #include "vpx_mem/vpx_mem.h"
 
-#define MAX_CU_SIZE 128
-
 using libvpx_test::ACMRandom;
 
 namespace {
diff --git a/vpx_dsp/vpx_convolve.c b/vpx_dsp/vpx_convolve.c
index 2d1c927cb..2e85ed481 100644
--- a/vpx_dsp/vpx_convolve.c
+++ b/vpx_dsp/vpx_convolve.c
@@ -130,18 +130,21 @@ static void convolve(const uint8_t *src, ptrdiff_t src_stride,
   //   --Must round-up because block may be located at sub-pixel position.
   //   --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
   //   --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
-  uint8_t temp[135 * 64];
+  uint8_t temp[MAX_EXT_SIZE * MAX_CU_SIZE];
   int intermediate_height =
       (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
 
-  assert(w <= 64);
-  assert(h <= 64);
+  assert(w <= MAX_CU_SIZE);
+  assert(h <= MAX_CU_SIZE);
+  assert(y_step_q4 <= 32);
   assert(x_step_q4 <= 32);
 
-  convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, 64,
+  convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride,
+                 temp, MAX_CU_SIZE,
                  x_filters, x0_q4, x_step_q4, w, intermediate_height);
-  convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride,
+  convolve_vert(temp + MAX_CU_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_CU_SIZE,
+                dst, dst_stride,
                 y_filters, y0_q4, y_step_q4, w, h);
 }
 
@@ -237,13 +240,14 @@ void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
                          const int16_t *filter_y, int y_step_q4,
                          int w, int h) {
   /* Fixed size intermediate buffer places limits on parameters. */
-  DECLARE_ALIGNED(16, uint8_t, temp[64 * 64]);
-  assert(w <= 64);
-  assert(h <= 64);
+  DECLARE_ALIGNED(16, uint8_t, temp[MAX_CU_SIZE * MAX_CU_SIZE]);
+  assert(w <= MAX_CU_SIZE);
+  assert(h <= MAX_CU_SIZE);
 
-  vpx_convolve8_c(src, src_stride, temp, 64,
+  vpx_convolve8_c(src, src_stride, temp, MAX_CU_SIZE,
                   filter_x, x_step_q4, filter_y, y_step_q4, w, h);
-  vpx_convolve_avg_c(temp, 64, dst, dst_stride, NULL, 0, NULL, 0, w, h);
+  vpx_convolve_avg_c(temp, MAX_CU_SIZE, dst, dst_stride,
+                     NULL, 0, NULL, 0, w, h);
 }
 
 void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride,
@@ -459,22 +463,23 @@ static void highbd_convolve(const uint8_t *src, ptrdiff_t src_stride,
   //   --Must round-up because block may be located at sub-pixel position.
   //   --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
   //   --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
-  uint16_t temp[64 * 135];
+  uint16_t temp[MAX_EXT_SIZE * MAX_CU_SIZE];
   int intermediate_height =
       (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
 
-  assert(w <= 64);
-  assert(h <= 64);
+  assert(w <= MAX_CU_SIZE);
+  assert(h <= MAX_CU_SIZE);
   assert(y_step_q4 <= 32);
   assert(x_step_q4 <= 32);
 
-  highbd_convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1),
-                        src_stride, CONVERT_TO_BYTEPTR(temp), 64,
+  highbd_convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride,
+                        CONVERT_TO_BYTEPTR(temp), MAX_CU_SIZE,
                         x_filters, x0_q4, x_step_q4, w,
                         intermediate_height, bd);
-  highbd_convolve_vert(CONVERT_TO_BYTEPTR(temp) + 64 * (SUBPEL_TAPS / 2 - 1),
-                       64, dst, dst_stride, y_filters, y0_q4, y_step_q4,
-                       w, h, bd);
+  highbd_convolve_vert(
+      CONVERT_TO_BYTEPTR(temp) + MAX_CU_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_CU_SIZE,
+      dst, dst_stride,
+      y_filters, y0_q4, y_step_q4, w, h, bd);
 }
 
@@ -556,13 +561,15 @@ void vpx_highbd_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
                                 const int16_t *filter_y, int y_step_q4,
                                 int w, int h, int bd) {
   // Fixed size intermediate buffer places limits on parameters.
-  DECLARE_ALIGNED(16, uint16_t, temp[64 * 64]);
-  assert(w <= 64);
-  assert(h <= 64);
+  DECLARE_ALIGNED(16, uint16_t, temp[MAX_CU_SIZE * MAX_CU_SIZE]);
+  assert(w <= MAX_CU_SIZE);
+  assert(h <= MAX_CU_SIZE);
 
-  vpx_highbd_convolve8_c(src, src_stride, CONVERT_TO_BYTEPTR(temp), 64,
+  vpx_highbd_convolve8_c(src, src_stride,
+                         CONVERT_TO_BYTEPTR(temp), MAX_CU_SIZE,
                          filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd);
-  vpx_highbd_convolve_avg_c(CONVERT_TO_BYTEPTR(temp), 64, dst, dst_stride,
+  vpx_highbd_convolve_avg_c(CONVERT_TO_BYTEPTR(temp), MAX_CU_SIZE,
+                            dst, dst_stride,
                             NULL, 0, NULL, 0, w, h, bd);
 }
diff --git a/vpx_dsp/vpx_convolve.h b/vpx_dsp/vpx_convolve.h
index 9ed3f1750..bd8679d10 100644
--- a/vpx_dsp/vpx_convolve.h
+++ b/vpx_dsp/vpx_convolve.h
@@ -17,6 +17,24 @@ extern "C" {
 #endif
 
+// Note: Fixed size intermediate buffers place limits on parameters
+// of some functions. 2d filtering proceeds in 2 steps:
+//   (1) Interpolate horizontally into an intermediate buffer, temp.
+//   (2) Interpolate temp vertically to derive the sub-pixel result.
+// Deriving the maximum number of rows in the temp buffer (135):
+// --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
+// --Largest block size is 64x64 pixels.
+// --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
+//   original frame (in 1/16th pixel units).
+// --Must round-up because block may be located at sub-pixel position.
+// --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
+// --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+# define MAX_EXT_SIZE 263
+#else
+# define MAX_EXT_SIZE 135
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
+
 typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride,
                               uint8_t *dst, ptrdiff_t dst_stride,
                               const int16_t *filter_x, int x_step_q4,
diff --git a/vpx_dsp/vpx_dsp_common.h b/vpx_dsp/vpx_dsp_common.h
index b4e6f4c27..8d9bf558d 100644
--- a/vpx_dsp/vpx_dsp_common.h
+++ b/vpx_dsp/vpx_dsp_common.h
@@ -20,6 +20,12 @@ extern "C" {
 #endif
 
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+# define MAX_CU_SIZE 128
+#else
+# define MAX_CU_SIZE 64
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
+
 #define VPXMIN(x, y) (((x) < (y)) ? (x) : (y))
 #define VPXMAX(x, y) (((x) > (y)) ? (x) : (y))
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 2ce0b99fb..583d9fa89 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -466,52 +466,44 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
 #
 # Sub Pixel Filters
 #
-add_proto qw/void vpx_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve_copy neon dspr2 msa/, "$sse2_x86inc";
-
-add_proto qw/void vpx_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve_avg neon dspr2 msa/, "$sse2_x86inc";
-
-add_proto qw/void vpx_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve8 sse2 ssse3 neon dspr2 msa/, "$avx2_ssse3";
-
-add_proto qw/void vpx_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve8_horiz sse2 ssse3 neon dspr2 msa/, "$avx2_ssse3";
-
-add_proto qw/void vpx_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve8_vert sse2 ssse3 neon dspr2 msa/, "$avx2_ssse3";
-
-add_proto qw/void vpx_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve8_avg sse2 ssse3 neon dspr2 msa/;
-
+add_proto qw/void vpx_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void vpx_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void vpx_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void vpx_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void vpx_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void vpx_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
 add_proto qw/void vpx_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve8_avg_horiz sse2 ssse3 neon dspr2 msa/;
-
-add_proto qw/void vpx_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve8_avg_vert sse2 ssse3 neon dspr2 msa/;
-
-add_proto qw/void vpx_scaled_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_scaled_2d ssse3/;
-
-add_proto qw/void vpx_scaled_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_scaled_horiz/;
-
-add_proto qw/void vpx_scaled_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_scaled_vert/;
-
-add_proto qw/void vpx_scaled_avg_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_scaled_avg_2d/;
-
-add_proto qw/void vpx_scaled_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_scaled_avg_horiz/;
-
-add_proto qw/void vpx_scaled_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_scaled_avg_vert/;
+add_proto qw/void vpx_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void vpx_scaled_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void vpx_scaled_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void vpx_scaled_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void vpx_scaled_avg_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void vpx_scaled_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void vpx_scaled_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+
+specialize qw/vpx_convolve_copy /, "$sse2_x86inc";
+specialize qw/vpx_convolve_avg /, "$sse2_x86inc";
+specialize qw/vpx_convolve8 sse2 ssse3/, "$avx2_ssse3";
+specialize qw/vpx_convolve8_horiz sse2 ssse3/, "$avx2_ssse3";
+specialize qw/vpx_convolve8_vert sse2 ssse3/, "$avx2_ssse3";
+specialize qw/vpx_convolve8_avg sse2 ssse3/;
+specialize qw/vpx_convolve8_avg_horiz sse2 ssse3/;
+specialize qw/vpx_convolve8_avg_vert sse2 ssse3/;
+specialize qw/vpx_scaled_2d ssse3/;
+
+# TODO(any): These need to be extended to up to 128x128 block sizes
+if (!(vpx_config("CONFIG_VP10") eq "yes" && vpx_config("CONFIG_EXT_PARTITION") eq "yes")) {
+  specialize qw/vpx_convolve_copy neon dspr2 msa/;
+  specialize qw/vpx_convolve_avg neon dspr2 msa/;
+  specialize qw/vpx_convolve8 neon dspr2 msa/;
+  specialize qw/vpx_convolve8_horiz neon dspr2 msa/;
+  specialize qw/vpx_convolve8_vert neon dspr2 msa/;
+  specialize qw/vpx_convolve8_avg neon dspr2 msa/;
+  specialize qw/vpx_convolve8_avg_horiz neon dspr2 msa/;
+  specialize qw/vpx_convolve8_avg_vert neon dspr2 msa/;
+}
 
 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
-  #
-  # Sub Pixel Filters
-  #
   add_proto qw/void vpx_highbd_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
   specialize qw/vpx_highbd_convolve_copy/, "$sse2_x86inc";
diff --git a/vpx_dsp/x86/convolve.h b/vpx_dsp/x86/convolve.h
index b6fbfcf92..95aa790ae 100644
--- a/vpx_dsp/x86/convolve.h
+++ b/vpx_dsp/x86/convolve.h
@@ -15,6 +15,7 @@
 #include "./vpx_config.h"
 #include "vpx/vpx_integer.h"
 #include "vpx_ports/mem.h"
+#include "vpx_dsp/vpx_convolve.h"
 
 typedef void filter8_1dfunction (
   const uint8_t *src_ptr,
@@ -112,25 +113,27 @@ void vpx_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \
                               int w, int h) { \
   assert(filter_x[3] != 128); \
   assert(filter_y[3] != 128); \
-  assert(w <= 64); \
-  assert(h <= 64); \
+  assert(w <= MAX_CU_SIZE); \
+  assert(h <= MAX_CU_SIZE); \
   assert(x_step_q4 == 16); \
   assert(y_step_q4 == 16); \
   if (filter_x[0] || filter_x[1] || filter_x[2]|| \
       filter_y[0] || filter_y[1] || filter_y[2]) { \
-    DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 71]); \
-    vpx_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64, \
+    DECLARE_ALIGNED(16, uint8_t, fdata2[MAX_CU_SIZE * (MAX_CU_SIZE+7)]); \
+    vpx_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \
+                              fdata2, MAX_CU_SIZE, \
                               filter_x, x_step_q4, filter_y, y_step_q4, \
                               w, h + 7); \
-    vpx_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride, \
+    vpx_convolve8_##avg##vert_##opt(fdata2 + 3 * MAX_CU_SIZE, MAX_CU_SIZE, \
+                                    dst, dst_stride, \
                                     filter_x, x_step_q4, filter_y, \
                                     y_step_q4, w, h); \
   } else { \
-    DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 65]); \
-    vpx_convolve8_horiz_##opt(src, src_stride, fdata2, 64, \
+    DECLARE_ALIGNED(16, uint8_t, fdata2[MAX_CU_SIZE * (MAX_CU_SIZE+1)]); \
+    vpx_convolve8_horiz_##opt(src, src_stride, fdata2, MAX_CU_SIZE, \
                               filter_x, x_step_q4, filter_y, y_step_q4, \
                               w, h + 1); \
-    vpx_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, \
+    vpx_convolve8_##avg##vert_##opt(fdata2, MAX_CU_SIZE, dst, dst_stride, \
                                     filter_x, x_step_q4, filter_y, \
                                     y_step_q4, w, h); \
   } \
@@ -250,31 +253,40 @@ void vpx_highbd_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \
                                      const int16_t *filter_x, int x_step_q4, \
                                      const int16_t *filter_y, int y_step_q4, \
                                      int w, int h, int bd) { \
-  assert(w <= 64); \
-  assert(h <= 64); \
+  assert(w <= MAX_CU_SIZE); \
+  assert(h <= MAX_CU_SIZE); \
   if (x_step_q4 == 16 && y_step_q4 == 16) { \
     if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 || \
         filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) { \
-      DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71]); \
-      vpx_highbd_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \
-                                       CONVERT_TO_BYTEPTR(fdata2), 64, \
+      DECLARE_ALIGNED(16, uint16_t, fdata2[MAX_CU_SIZE * (MAX_CU_SIZE+7)]); \
+      vpx_highbd_convolve8_horiz_##opt(src - 3 * src_stride, \
+                                       src_stride, \
+                                       CONVERT_TO_BYTEPTR(fdata2), \
+                                       MAX_CU_SIZE, \
                                        filter_x, x_step_q4, \
                                        filter_y, y_step_q4, \
                                        w, h + 7, bd); \
-      vpx_highbd_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2) + 192, \
-                                             64, dst, dst_stride, \
-                                             filter_x, x_step_q4, \
-                                             filter_y, y_step_q4, \
-                                             w, h, bd); \
+      vpx_highbd_convolve8_##avg##vert_##opt( \
+          CONVERT_TO_BYTEPTR(fdata2) + 3 * MAX_CU_SIZE, \
+          MAX_CU_SIZE, \
+          dst, \
+          dst_stride, \
+          filter_x, x_step_q4, \
+          filter_y, y_step_q4, \
+          w, h, bd); \
     } else { \
-      DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 65]); \
-      vpx_highbd_convolve8_horiz_##opt(src, src_stride, \
-                                       CONVERT_TO_BYTEPTR(fdata2), 64, \
+      DECLARE_ALIGNED(16, uint16_t, fdata2[MAX_CU_SIZE * (MAX_CU_SIZE+1)]); \
+      vpx_highbd_convolve8_horiz_##opt(src, \
+                                       src_stride, \
+                                       CONVERT_TO_BYTEPTR(fdata2), \
+                                       MAX_CU_SIZE, \
                                        filter_x, x_step_q4, \
                                        filter_y, y_step_q4, \
                                        w, h + 1, bd); \
-      vpx_highbd_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2), 64, \
-                                             dst, dst_stride, \
+      vpx_highbd_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2), \
+                                             MAX_CU_SIZE, \
+                                             dst, \
+                                             dst_stride, \
                                              filter_x, x_step_q4, \
                                              filter_y, y_step_q4, \
                                              w, h, bd); \
diff --git a/vpx_dsp/x86/vpx_convolve_copy_sse2.asm b/vpx_dsp/x86/vpx_convolve_copy_sse2.asm
index abc027065..6d43fc18e 100644
--- a/vpx_dsp/x86/vpx_convolve_copy_sse2.asm
+++ b/vpx_dsp/x86/vpx_convolve_copy_sse2.asm
@@ -46,6 +46,119 @@ cglobal convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \
   je .w16
   cmp r4d, 32
   je .w32
+
+%if CONFIG_VP10 && CONFIG_EXT_PARTITION
+  cmp r4d, 64
+  je .w64
+%ifidn %2, highbd
+  cmp r4d, 128
+  je .w128
+
+.w256:
+  mov r4d, dword hm
+.loop256:
+  movu m0, [srcq]
+  movu m1, [srcq+16]
+  movu m2, [srcq+32]
+  movu m3, [srcq+48]
+%ifidn %1, avg
+  pavg m0, [dstq]
+  pavg m1, [dstq+16]
+  pavg m2, [dstq+32]
+  pavg m3, [dstq+48]
+%endif
+  mova [dstq   ], m0
+  mova [dstq+16], m1
+  mova [dstq+32], m2
+  mova [dstq+48], m3
+  movu m0, [srcq+64]
+  movu m1, [srcq+80]
+  movu m2, [srcq+96]
+  movu m3, [srcq+112]
+%ifidn %1, avg
+  pavg m0, [dstq+64]
+  pavg m1, [dstq+80]
+  pavg m2, [dstq+96]
+  pavg m3, [dstq+112]
+%endif
+  mova [dstq+64], m0
+  mova [dstq+80], m1
+  mova [dstq+96], m2
+  mova [dstq+112], m3
+  movu m0, [srcq+128]
+  movu m1, [srcq+128+16]
+  movu m2, [srcq+128+32]
+  movu m3, [srcq+128+48]
+%ifidn %1, avg
+  pavg m0, [dstq+128]
+  pavg m1, [dstq+128+16]
+  pavg m2, [dstq+128+32]
+  pavg m3, [dstq+128+48]
+%endif
+  mova [dstq+128   ], m0
+  mova [dstq+128+16], m1
+  mova [dstq+128+32], m2
+  mova [dstq+128+48], m3
+  movu m0, [srcq+128+64]
+  movu m1, [srcq+128+80]
+  movu m2, [srcq+128+96]
+  movu m3, [srcq+128+112]
+  add srcq, src_strideq
+%ifidn %1, avg
+  pavg m0, [dstq+128+64]
+  pavg m1, [dstq+128+80]
+  pavg m2, [dstq+128+96]
+  pavg m3, [dstq+128+112]
+%endif
+  mova [dstq+128+64], m0
+  mova [dstq+128+80], m1
+  mova [dstq+128+96], m2
+  mova [dstq+128+112], m3
+  add dstq, dst_strideq
+  sub r4d, 1
+  jnz .loop256
+  RET
+%endif
+
+.w128:
+  mov r4d, dword hm
+.loop128:
+  movu m0, [srcq]
+  movu m1, [srcq+16]
+  movu m2, [srcq+32]
+  movu m3, [srcq+48]
+%ifidn %1, avg
+  pavg m0, [dstq]
+  pavg m1, [dstq+16]
+  pavg m2, [dstq+32]
+  pavg m3, [dstq+48]
+%endif
+  mova [dstq   ], m0
+  mova [dstq+16], m1
+  mova [dstq+32], m2
+  mova [dstq+48], m3
+  movu m0, [srcq+64]
+  movu m1, [srcq+80]
+  movu m2, [srcq+96]
+  movu m3, [srcq+112]
+  add srcq, src_strideq
+%ifidn %1, avg
+  pavg m0, [dstq+64]
+  pavg m1, [dstq+80]
+  pavg m2, [dstq+96]
+  pavg m3, [dstq+112]
+%endif
+  mova [dstq+64], m0
+  mova [dstq+80], m1
+  mova [dstq+96], m2
+  mova [dstq+112], m3
+  add dstq, dst_strideq
+  sub r4d, 1
+  jnz .loop128
+  RET
+
+%else  ; CONFIG_VP10 && CONFIG_EXT_PARTITION
+
 %ifidn %2, highbd
   cmp r4d, 64
   je .w64
@@ -82,10 +195,11 @@ cglobal convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \
   mova [dstq+96], m2
   mova [dstq+112], m3
   add dstq, dst_strideq
-  dec r4d
+  sub r4d, 1
   jnz .loop128
   RET
 %endif
+%endif  ; CONFIG_VP10 && CONFIG_EXT_PARTITION
 
 .w64
   mov r4d, dword hm
@@ -106,7 +220,7 @@ cglobal convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \
   mova [dstq+32], m2
   mova [dstq+48], m3
   add dstq, dst_strideq
-  dec r4d
+  sub r4d, 1
   jnz .loop64
   RET
 
diff --git a/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c b/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
index 6fd52087c..6c5991858 100644
--- a/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
+++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
@@ -844,34 +844,49 @@ static void scaledconvolve2d(const uint8_t *src, ptrdiff_t src_stride,
   //   --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
   //   --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
   //   --Require an additional 8 rows for the horiz_w8 transpose tail.
-  DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]);
+  DECLARE_ALIGNED(16, uint8_t, temp[(MAX_EXT_SIZE + 8) * MAX_CU_SIZE]);
   const int intermediate_height =
       (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
 
-  assert(w <= 64);
-  assert(h <= 64);
+  assert(w <= MAX_CU_SIZE);
+  assert(h <= MAX_CU_SIZE);
   assert(y_step_q4 <= 32);
   assert(x_step_q4 <= 32);
 
   if (w >= 8) {
     scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1),
-                            src_stride, temp, 64, x_filters, x0_q4, x_step_q4,
+                            src_stride,
+                            temp,
+                            MAX_CU_SIZE,
+                            x_filters, x0_q4, x_step_q4,
                             w, intermediate_height);
   } else {
     scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1),
-                            src_stride, temp, 64, x_filters, x0_q4, x_step_q4,
+                            src_stride,
+                            temp,
+                            MAX_CU_SIZE,
+                            x_filters, x0_q4, x_step_q4,
                             w, intermediate_height);
   }
 
   if (w >= 16) {
-    scaledconvolve_vert_w16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
-                            dst_stride, y_filters, y0_q4, y_step_q4, w, h);
+    scaledconvolve_vert_w16(temp + MAX_CU_SIZE * (SUBPEL_TAPS / 2 - 1),
+                            MAX_CU_SIZE,
+                            dst,
+                            dst_stride,
+                            y_filters, y0_q4, y_step_q4, w, h);
   } else if (w == 8) {
-    scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
-                           dst_stride, y_filters, y0_q4, y_step_q4, w, h);
+    scaledconvolve_vert_w8(temp + MAX_CU_SIZE * (SUBPEL_TAPS / 2 - 1),
+                           MAX_CU_SIZE,
+                           dst,
+                           dst_stride,
+                           y_filters, y0_q4, y_step_q4, w, h);
   } else {
-    scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
-                           dst_stride, y_filters, y0_q4, y_step_q4, w, h);
+    scaledconvolve_vert_w4(temp + MAX_CU_SIZE * (SUBPEL_TAPS / 2 - 1),
+                           MAX_CU_SIZE,
+                           dst,
+                           dst_stride,
+                           y_filters, y0_q4, y_step_q4, w, h);
   }
 }
-- 
2.40.0
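
A note on the new bounds: the MAX_EXT_SIZE values the patch introduces in
vpx_dsp/vpx_convolve.h (135 without ext-partition, 263 with it) follow the
derivation in that header's comment. With the smallest supported scale
(y_step_q4 == 32) and the worst sub-pixel start (y0_q4 == 15), the vertical
pass reads ceil(((h - 1) * 32 + 15) / 16) + SUBPEL_TAPS intermediate rows.
A minimal standalone sketch of that arithmetic (not part of the patch; the
helper name is made up for illustration):

#include <assert.h>

#define SUBPEL_TAPS 8 /* 8-tap filters, as in vpx_dsp/vpx_filter.h */

/* Worst-case number of intermediate rows for an h-row output block:
 * h output rows at x1/2 scale span (h - 1) * 32 + 15 units of 1/16th
 * pel in the source; round up to whole pixels, add the filter tails. */
static int max_intermediate_rows(int h) {
  return ((h - 1) * 32 + 15 + 15) / 16 + SUBPEL_TAPS;
}

int main(void) {
  assert(max_intermediate_rows(64) == 135);  /* MAX_EXT_SIZE, 64x64 blocks */
  assert(max_intermediate_rows(128) == 263); /* MAX_EXT_SIZE, 128x128 blocks */
  return 0;
}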
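
The same derivation also collapses to a closed form, MAX_EXT_SIZE ==
2 * MAX_CU_SIZE + 7 (2 * 64 + 7 == 135 and 2 * 128 + 7 == 263), because
ceil(((h - 1) * 32 + 15) / 16) equals 2 * h - 1. A hypothetical C11
compile-time check, not present in the patch, that would keep the two
macros in sync (assuming vpx_config.h defines the CONFIG_* symbols first):

#include "./vpx_config.h"
#include "vpx_dsp/vpx_convolve.h"   /* MAX_EXT_SIZE */
#include "vpx_dsp/vpx_dsp_common.h" /* MAX_CU_SIZE */

/* Holds under both configurations: the intermediate buffer is at most
 * twice the block height (x1/2 scale) plus SUBPEL_TAPS - 1 tail rows. */
_Static_assert(MAX_EXT_SIZE == 2 * MAX_CU_SIZE + 7,
               "MAX_EXT_SIZE out of sync with MAX_CU_SIZE");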