From c182725cbc9e1e4892784a24c32b1bed80047b0c Mon Sep 17 00:00:00 2001
From: chiyotsai
Date: Fri, 2 Nov 2018 17:08:05 -0700
Subject: [PATCH] Remove unnecessary calculation in 4-tap interpolation filter

Reduces the number of rows calculated by the 2D 4-tap interpolation
filter from h+7 rows to h+3 rows. Also fixes a bug in the avx2 function
for 4-tap filters where the last row was computed incorrectly.

Performance:
            | Baseline | Result   | Pct Gain |
bitdepth lo | 4.00 fps | 4.02 fps | 0.5%     |
bitdepth 10 | 1.90 fps | 1.91 fps | 0.5%     |

The performance is evaluated at speed 1 on jets.y4m at bitrate 500 over
100 frames. No BDBR loss is observed.

Change-Id: I90b0d4d697319b7bba599f03c5dc01abd85d13b1
---
 vpx_dsp/x86/convolve.h                     | 217 ++++++++++++---------
 vpx_dsp/x86/highbd_convolve_avx2.c         |  34 ++--
 vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c  |  49 ++---
 vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c  |  26 +--
 vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c |  26 +--
 5 files changed, 172 insertions(+), 180 deletions(-)

diff --git a/vpx_dsp/x86/convolve.h b/vpx_dsp/x86/convolve.h
index 8398ec3c1..b75d4d721 100644
--- a/vpx_dsp/x86/convolve.h
+++ b/vpx_dsp/x86/convolve.h
@@ -16,11 +16,17 @@
 #include "vpx/vpx_integer.h"
 #include "vpx_ports/mem.h"
 
+// TODO(chiyotsai@google.com): Refactor the code here. Currently this is
+// pretty hacky and awful to read. Note that there is a filter_x[3] == 128
+// check in HIGHBD_FUN_CONV_2D to avoid a segfault, since the C function
+// assumes the filter is always 8-tap.
 typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch,
                                 uint8_t *output_ptr, ptrdiff_t out_pitch,
                                 uint32_t output_height, const int16_t *filter);
 
-#define FUN_CONV_1D(name, offset, step_q4, dir, src_start, avg, opt) \
+// TODO(chiyotsai@google.com): Remove the is_avg argument to these macros
+// once we have a 4-tap vertical avg filter.
+#define FUN_CONV_1D(name, offset, step_q4, dir, src_start, avg, opt, is_avg) \
   void vpx_convolve8_##name##_##opt( \
       const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \
       ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, \
@@ -33,6 +39,7 @@ typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch,
     assert(filter_row[3] != 128); \
     assert(step_q4 == 16); \
     if (filter_row[0] | filter_row[1] | filter_row[6] | filter_row[7]) { \
+      const int num_taps = 8; \
       while (w >= 16) { \
         vpx_filter_block1d16_##dir##8_##avg##opt(src_start, src_stride, dst, \
                                                  dst_stride, h, filter_row); \
@@ -47,7 +54,9 @@ typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch,
       vpx_filter_block1d4_##dir##8_##avg##opt(src_start, src_stride, dst, \
                                               dst_stride, h, filter_row); \
     } \
+    (void)num_taps; \
   } else if (filter_row[2] | filter_row[5]) { \
+    const int num_taps = is_avg ? 8 : 4; \
     while (w >= 16) { \
       vpx_filter_block1d16_##dir##4_##avg##opt(src_start, src_stride, dst, \
                                                dst_stride, h, filter_row); \
@@ -62,25 +71,28 @@ typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch,
       vpx_filter_block1d4_##dir##4_##avg##opt(src_start, src_stride, dst, \
                                               dst_stride, h, filter_row); \
     } \
+    (void)num_taps; \
   } else { \
+    const int num_taps = 2; \
     while (w >= 16) { \
-      vpx_filter_block1d16_##dir##2_##avg##opt(src, src_stride, dst, \
+      vpx_filter_block1d16_##dir##2_##avg##opt(src_start, src_stride, dst, \
                                                dst_stride, h, filter_row); \
       src += 16; \
      dst += 16; \
       w -= 16; \
     } \
     if (w == 8) { \
-      vpx_filter_block1d8_##dir##2_##avg##opt(src, src_stride, dst, \
+      vpx_filter_block1d8_##dir##2_##avg##opt(src_start, src_stride, dst, \
                                               dst_stride, h, filter_row); \
     } else if (w == 4) { \
-      vpx_filter_block1d4_##dir##2_##avg##opt(src, src_stride, dst, \
+      vpx_filter_block1d4_##dir##2_##avg##opt(src_start, src_stride, dst, \
                                               dst_stride, h, filter_row); \
    } \
+    (void)num_taps; \
  } \
 }
 
-#define FUN_CONV_2D(avg, opt) \
+#define FUN_CONV_2D(avg, opt, is_avg) \
   void vpx_convolve8_##avg##opt( \
       const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \
       ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, \
@@ -94,7 +106,7 @@ typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch,
     assert(h <= 64); \
     assert(x_step_q4 == 16); \
     assert(y_step_q4 == 16); \
-    if (filter_x[0] | filter_x[1] | filter_x[2]) { \
+    if (filter_x[0] | filter_x[1] | filter_x[6] | filter_x[7]) { \
       DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 71]); \
       vpx_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64, \
                                 filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, \
@@ -102,6 +114,15 @@ typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch,
       vpx_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride, \
                                       filter, x0_q4, x_step_q4, y0_q4, \
                                       y_step_q4, w, h); \
+    } else if (filter_x[2] | filter_x[5]) { \
+      const int num_taps = is_avg ? 8 : 4; \
+      DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 71]); \
+      vpx_convolve8_horiz_##opt( \
+          src - (num_taps / 2 - 1) * src_stride, src_stride, fdata2, 64, \
+          filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, h + num_taps - 1); \
+      vpx_convolve8_##avg##vert_##opt(fdata2 + 64 * (num_taps / 2 - 1), 64, \
+                                      dst, dst_stride, filter, x0_q4, \
+                                      x_step_q4, y0_q4, y_step_q4, w, h); \
     } else { \
       DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 65]); \
       vpx_convolve8_horiz_##opt(src, src_stride, fdata2, 64, filter, x0_q4, \
@@ -121,89 +142,96 @@ typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr,
                                        unsigned int output_height,
                                        const int16_t *filter, int bd);
 
-#define HIGH_FUN_CONV_1D(name, offset, step_q4, dir, src_start, avg, opt) \
-  void vpx_highbd_convolve8_##name##_##opt( \
-      const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, \
-      ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, \
-      int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { \
-    const int16_t *filter_row = filter[offset]; \
-    if (step_q4 == 16 && filter_row[3] != 128) { \
-      if (filter_row[0] | filter_row[1] | filter_row[6] | filter_row[7]) { \
-        while (w >= 16) { \
-          vpx_highbd_filter_block1d16_##dir##8_##avg##opt( \
-              src_start, src_stride, dst, dst_stride, h, filter_row, bd); \
-          src += 16; \
-          dst += 16; \
-          w -= 16; \
-        } \
-        while (w >= 8) { \
-          vpx_highbd_filter_block1d8_##dir##8_##avg##opt( \
-              src_start, src_stride, dst, dst_stride, h, filter_row, bd); \
-          src += 8; \
-          dst += 8; \
-          w -= 8; \
-        } \
-        while (w >= 4) { \
-          vpx_highbd_filter_block1d4_##dir##8_##avg##opt( \
-              src_start, src_stride, dst, dst_stride, h, filter_row, bd); \
-          src += 4; \
-          dst += 4; \
-          w -= 4; \
-        } \
-      } else if (filter_row[2] | filter_row[5]) { \
-        while (w >= 16) { \
-          vpx_highbd_filter_block1d16_##dir##4_##avg##opt( \
-              src_start, src_stride, dst, dst_stride, h, filter_row, bd); \
-          src += 16; \
-          dst += 16; \
-          w -= 16; \
-        } \
-        while (w >= 8) { \
-          vpx_highbd_filter_block1d8_##dir##4_##avg##opt( \
-              src_start, src_stride, dst, dst_stride, h, filter_row, bd); \
-          src += 8; \
-          dst += 8; \
-          w -= 8; \
-        } \
-        while (w >= 4) { \
-          vpx_highbd_filter_block1d4_##dir##4_##avg##opt( \
-              src_start, src_stride, dst, dst_stride, h, filter_row, bd); \
-          src += 4; \
-          dst += 4; \
-          w -= 4; \
-        } \
-      } else { \
-        while (w >= 16) { \
-          vpx_highbd_filter_block1d16_##dir##2_##avg##opt( \
-              src, src_stride, dst, dst_stride, h, filter_row, bd); \
-          src += 16; \
-          dst += 16; \
-          w -= 16; \
-        } \
-        while (w >= 8) { \
-          vpx_highbd_filter_block1d8_##dir##2_##avg##opt( \
-              src, src_stride, dst, dst_stride, h, filter_row, bd); \
-          src += 8; \
-          dst += 8; \
-          w -= 8; \
-        } \
-        while (w >= 4) { \
-          vpx_highbd_filter_block1d4_##dir##2_##avg##opt( \
-              src, src_stride, dst, dst_stride, h, filter_row, bd); \
-          src += 4; \
-          dst += 4; \
-          w -= 4; \
-        } \
-      } \
-    } \
-    if (w) { \
-      vpx_highbd_convolve8_##name##_c(src, src_stride, dst, dst_stride, \
-                                      filter, x0_q4, x_step_q4, y0_q4, \
-                                      y_step_q4, w, h, bd); \
-    } \
+#define HIGH_FUN_CONV_1D(name, offset, step_q4, dir, src_start, avg, opt, \
+                         is_avg) \
+  void vpx_highbd_convolve8_##name##_##opt( \
+      const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, \
+      ptrdiff_t dst_stride, const InterpKernel *filter_kernel, int x0_q4, \
+      int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { \
+    const int16_t *filter_row = filter_kernel[offset]; \
+    if (step_q4 == 16 && filter_row[3] != 128) { \
+      if (filter_row[0] | filter_row[1] | filter_row[6] | filter_row[7]) { \
+        const int num_taps = 8; \
+        while (w >= 16) { \
+          vpx_highbd_filter_block1d16_##dir##8_##avg##opt( \
+              src_start, src_stride, dst, dst_stride, h, filter_row, bd); \
+          src += 16; \
+          dst += 16; \
+          w -= 16; \
+        } \
+        while (w >= 8) { \
+          vpx_highbd_filter_block1d8_##dir##8_##avg##opt( \
+              src_start, src_stride, dst, dst_stride, h, filter_row, bd); \
+          src += 8; \
+          dst += 8; \
+          w -= 8; \
+        } \
+        while (w >= 4) { \
+          vpx_highbd_filter_block1d4_##dir##8_##avg##opt( \
+              src_start, src_stride, dst, dst_stride, h, filter_row, bd); \
+          src += 4; \
+          dst += 4; \
+          w -= 4; \
+        } \
+        (void)num_taps; \
+      } else if (filter_row[2] | filter_row[5]) { \
+        const int num_taps = is_avg ? 8 : 4; \
+        while (w >= 16) { \
+          vpx_highbd_filter_block1d16_##dir##4_##avg##opt( \
+              src_start, src_stride, dst, dst_stride, h, filter_row, bd); \
+          src += 16; \
+          dst += 16; \
+          w -= 16; \
+        } \
+        while (w >= 8) { \
+          vpx_highbd_filter_block1d8_##dir##4_##avg##opt( \
+              src_start, src_stride, dst, dst_stride, h, filter_row, bd); \
+          src += 8; \
+          dst += 8; \
+          w -= 8; \
+        } \
+        while (w >= 4) { \
+          vpx_highbd_filter_block1d4_##dir##4_##avg##opt( \
+              src_start, src_stride, dst, dst_stride, h, filter_row, bd); \
+          src += 4; \
+          dst += 4; \
+          w -= 4; \
+        } \
+        (void)num_taps; \
+      } else { \
+        const int num_taps = 2; \
+        while (w >= 16) { \
+          vpx_highbd_filter_block1d16_##dir##2_##avg##opt( \
+              src_start, src_stride, dst, dst_stride, h, filter_row, bd); \
+          src += 16; \
+          dst += 16; \
+          w -= 16; \
+        } \
+        while (w >= 8) { \
+          vpx_highbd_filter_block1d8_##dir##2_##avg##opt( \
+              src_start, src_stride, dst, dst_stride, h, filter_row, bd); \
+          src += 8; \
+          dst += 8; \
+          w -= 8; \
+        } \
+        while (w >= 4) { \
+          vpx_highbd_filter_block1d4_##dir##2_##avg##opt( \
+              src_start, src_stride, dst, dst_stride, h, filter_row, bd); \
+          src += 4; \
+          dst += 4; \
+          w -= 4; \
+        } \
+        (void)num_taps; \
+      } \
+    } \
+    if (w) { \
+      vpx_highbd_convolve8_##name##_c(src, src_stride, dst, dst_stride, \
+                                      filter_kernel, x0_q4, x_step_q4, y0_q4, \
+                                      y_step_q4, w, h, bd); \
+    } \
   }
 
-#define HIGH_FUN_CONV_2D(avg, opt) \
+#define HIGH_FUN_CONV_2D(avg, opt, is_avg) \
   void vpx_highbd_convolve8_##avg##opt( \
       const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, \
       ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, \
@@ -212,7 +240,8 @@ typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr,
     assert(w <= 64); \
     assert(h <= 64); \
     if (x_step_q4 == 16 && y_step_q4 == 16) { \
-      if ((filter_x[0] | filter_x[1] | filter_x[2]) || filter_x[3] == 128) { \
+      if ((filter_x[0] | filter_x[1] | filter_x[6] | filter_x[7]) || \
+          filter_x[3] == 128) { \
         DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71]); \
         vpx_highbd_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \
                                          fdata2, 64, filter, x0_q4, x_step_q4, \
@@ -220,6 +249,16 @@ typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr,
         vpx_highbd_convolve8_##avg##vert_##opt( \
             fdata2 + 192, 64, dst, dst_stride, filter, x0_q4, x_step_q4, \
            y0_q4, y_step_q4, w, h, bd); \
+      } else if (filter_x[2] | filter_x[5]) { \
+        const int num_taps = is_avg ? 8 : 4; \
+        DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71]); \
+        vpx_highbd_convolve8_horiz_##opt( \
+            src - (num_taps / 2 - 1) * src_stride, src_stride, fdata2, 64, \
+            filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, h + num_taps - 1, \
+            bd); \
+        vpx_highbd_convolve8_##avg##vert_##opt( \
+            fdata2 + 64 * (num_taps / 2 - 1), 64, dst, dst_stride, filter, \
+            x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd); \
       } else { \
         DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 65]); \
         vpx_highbd_convolve8_horiz_##opt(src, src_stride, fdata2, 64, filter, \
@@ -235,6 +274,6 @@ typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr,
                                        bd); \
     } \
   }
-#endif // CONFIG_VP9_HIGHBITDEPTH
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
 #endif  // VPX_VPX_DSP_X86_CONVOLVE_H_
diff --git a/vpx_dsp/x86/highbd_convolve_avx2.c b/vpx_dsp/x86/highbd_convolve_avx2.c
index aef067ea7..320962561 100644
--- a/vpx_dsp/x86/highbd_convolve_avx2.c
+++ b/vpx_dsp/x86/highbd_convolve_avx2.c
@@ -1089,22 +1089,19 @@ static void vpx_highbd_filter_block1d8_h4_avx2(
 
   // Repeat for the last row if needed
   if (h > 0) {
-    src_reg = _mm256_loadu_si256((const __m256i *)src_ptr);
-
-    // Reorder into 2 1 1 2
-    src_reg = _mm256_permute4x64_epi64(src_reg, 0x94);
-
+    src_reg = mm256_loadu2_si128(src_ptr, src_ptr + 4);
     src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0);
     src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2);
 
     res_reg = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2,
                                    &kernel_reg_23, &kernel_reg_45);
 
-    res_reg = mm256_round_epi32(&res_first, &reg_round, CONV8_ROUNDING_BITS);
+    res_reg = mm256_round_epi32(&res_reg, &reg_round, CONV8_ROUNDING_BITS);
 
     res_reg = _mm256_packus_epi32(res_reg, res_reg);
-    res_reg = _mm256_permute4x64_epi64(res_reg, 0x8);
+    res_reg = _mm256_min_epi16(res_reg, reg_max);
 
-    _mm_store_si128((__m128i *)dst_ptr, _mm256_castsi256_si128(res_reg));
+    mm256_storeu2_epi64((__m128i *)dst_ptr, (__m128i *)(dst_ptr + 4), &res_reg);
   }
 }
@@ -1279,10 +1276,6 @@ static void vpx_highbd_filter_block1d4_v4_avx2(
   const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
   int h;
 
-  // We only need to go num_taps/2 - 1 row above the souce, so we move
-  // 3 - (num_taps/2 - 1) = 4 - num_taps/2 = 2 back down
-  src_ptr += src_stride_unrolled;
-
   // Load Kernel
   kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel);
   kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128);
@@ -1368,10 +1361,6 @@ static void vpx_highbd_filter_block1d8_v4_avx2(
   const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
   int h;
 
-  // We only need to go num_taps/2 - 1 row above the souce, so we move
-  // 3 - (num_taps/2 - 1) = 4 - num_taps/2 = 2 back down
-  src_ptr += src_stride_unrolled;
-
   // Load Kernel
   kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel);
   kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128);
@@ -1476,9 +1465,10 @@ highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_sse2;
 #define vpx_highbd_filter_block1d4_h4_avg_avx2 \
   vpx_highbd_filter_block1d4_h8_avg_avx2
 
-HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2);
-HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , avx2);
-HIGH_FUN_CONV_2D(, avx2);
+HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2, 0);
+HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v,
+                 src - src_stride * (num_taps / 2 - 1), , avx2, 0);
+HIGH_FUN_CONV_2D(, avx2, 0);
 
 // From vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm.
 highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_avg_sse2;
@@ -1497,9 +1487,9 @@ highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_avg_sse2;
 #define vpx_highbd_filter_block1d4_v2_avg_avx2 \
   vpx_highbd_filter_block1d4_v2_avg_sse2
 
-HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2);
-HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_,
-                 avx2);
-HIGH_FUN_CONV_2D(avg_, avx2);
+HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2, 1);
+HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v,
+                 src - src_stride * (num_taps / 2 - 1), avg_, avx2, 1);
+HIGH_FUN_CONV_2D(avg_, avx2, 1);
 
 #undef HIGHBD_FUNC
diff --git a/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c b/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c
index e40fe693a..e0e8b8f90 100644
--- a/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c
+++ b/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c
@@ -133,10 +133,6 @@ static void vpx_filter_block1d16_v4_sse2(const uint8_t *src_ptr,
   const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
   int h;
 
-  // We only need to go num_taps/2 - 1 row above the souce, so we move
-  // 3 - (num_taps/2 - 1) = 4 - num_taps/2 = 2 back down
-  src_ptr += src_stride_unrolled;
-
   // Load Kernel
   kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
   kernel_reg = _mm_srai_epi16(kernel_reg, 1);
@@ -345,10 +341,6 @@ static void vpx_filter_block1d8_v4_sse2(const uint8_t *src_ptr,
   const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
   int h;
 
-  // We only need to go num_taps/2 - 1 row above the souce, so we move
-  // 3 - (num_taps/2 - 1) = 4 - num_taps/2 = 2 back down
-  src_ptr += src_stride_unrolled;
-
   // Load Kernel
   kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
   kernel_reg = _mm_srai_epi16(kernel_reg, 1);
@@ -531,10 +523,6 @@ static void vpx_filter_block1d4_v4_sse2(const uint8_t *src_ptr,
   const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
   int h;
 
-  // We only need to go num_taps/2 - 1 row above the souce, so we move
-  // 3 - (num_taps/2 - 1) = 4 - num_taps/2 = 2 back down
-  src_ptr += src_stride_unrolled;
-
   // Load Kernel
   kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
   kernel_reg = _mm_srai_epi16(kernel_reg, 1);
@@ -713,10 +701,6 @@ static void vpx_highbd_filter_block1d4_v4_sse2(
   const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
   int h;
 
-  // We only need to go num_taps/2 - 1 row above the source, so we move
-  // 3 - (num_taps/2 - 1) = 4 - num_taps/2 = 2 back down
-  src_ptr += src_stride_unrolled;
-
   // Load Kernel
   kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
   kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
@@ -896,10 +880,6 @@ static void vpx_highbd_filter_block1d8_v4_sse2(
   const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
   int h;
 
-  // We only need to go num_taps/2 - 1 row above the source, so we move
-  // 3 - (num_taps/2 - 1) = 4 - num_taps/2 = 2 back down
-  src_ptr += src_stride_unrolled;
-
   // Load Kernel
   kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
   kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
@@ -1060,10 +1040,12 @@ filter8_1dfunction vpx_filter_block1d4_h2_avg_sse2;
 //                               const InterpKernel *filter, int x0_q4,
 //                               int32_t x_step_q4, int y0_q4, int y_step_q4,
 //                               int w, int h);
-FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2);
-FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , sse2);
-FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, sse2);
-FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_, sse2);
+FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2, 0);
+FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - (num_taps / 2 - 1) * src_stride, ,
+            sse2, 0);
+FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, sse2, 1);
+FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v,
+            src - (num_taps / 2 - 1) * src_stride, avg_, sse2, 1);
 
 // void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
@@ -1075,8 +1057,8 @@ FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_, sse2);
 //                         const InterpKernel *filter, int x0_q4,
 //                         int32_t x_step_q4, int y0_q4, int y_step_q4,
 //                         int w, int h);
-FUN_CONV_2D(, sse2);
-FUN_CONV_2D(avg_, sse2);
+FUN_CONV_2D(, sse2, 0);
+FUN_CONV_2D(avg_, sse2, 1);
 
 #if CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64
 // From vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm.
@@ -1157,11 +1139,12 @@ highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_avg_sse2;
 //                                    const int16_t *filter_y,
 //                                    int y_step_q4,
 //                                    int w, int h, int bd);
-HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2);
-HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , sse2);
-HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, sse2);
-HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_,
-                 sse2);
+HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2, 0);
+HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v,
+                 src - src_stride * (num_taps / 2 - 1), , sse2, 0);
+HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, sse2, 1);
+HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v,
+                 src - src_stride * (num_taps / 2 - 1), avg_, sse2, 1);
 
 // void vpx_highbd_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
@@ -1173,6 +1156,6 @@ HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_,
 //                                const InterpKernel *filter, int x0_q4,
 //                                int32_t x_step_q4, int y0_q4,
 //                                int y_step_q4, int w, int h, int bd);
-HIGH_FUN_CONV_2D(, sse2);
-HIGH_FUN_CONV_2D(avg_, sse2);
+HIGH_FUN_CONV_2D(, sse2, 0);
+HIGH_FUN_CONV_2D(avg_, sse2, 1);
 #endif  // CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64
diff --git a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
index ccedfe206..d381a7a47 100644
--- a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
+++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
@@ -464,10 +464,6 @@ static void vpx_filter_block1d16_v4_avx2(const uint8_t *src_ptr,
   const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
   int h;
 
-  // We only need to go num_taps/2 - 1 row above the souce, so we move
-  // 3 - (num_taps/2 - 1) = 4 - num_taps/2 = 2 back down
-  src_ptr += src_stride_unrolled;
-
   // Load Kernel
   kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
   kernel_reg = _mm_srai_epi16(kernel_reg, 1);
@@ -665,10 +661,6 @@ static void vpx_filter_block1d8_v4_avx2(const uint8_t *src_ptr,
   const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
   int h;
 
-  // We only need to go num_taps/2 - 1 row above the souce, so we move
-  // 3 - (num_taps/2 - 1) = 4 - num_taps/2 = 2 back down
-  src_ptr += src_stride_unrolled;
-
   // Load Kernel
   kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel);
   kernel_reg_128 = _mm_srai_epi16(kernel_reg_128, 1);
@@ -839,10 +831,6 @@ static void vpx_filter_block1d4_v4_avx2(const uint8_t *src_ptr,
   const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
   int h;
 
-  // We only need to go num_taps/2 - 1 row above the souce, so we move
-  // 3 - (num_taps/2 - 1) = 4 - num_taps/2 = 2 back down
-  src_ptr += src_stride_unrolled;
-
   // Load Kernel
   kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel);
   kernel_reg_128 = _mm_srai_epi16(kernel_reg_128, 1);
@@ -981,10 +969,12 @@ filter8_1dfunction vpx_filter_block1d4_h2_avg_ssse3;
 //                          const InterpKernel *filter, int x0_q4,
 //                          int32_t x_step_q4, int y0_q4,
 //                          int y_step_q4, int w, int h);
-FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2);
-FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , avx2);
-FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2);
-FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_, avx2);
+FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2, 0);
+FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * (num_taps / 2 - 1), ,
+            avx2, 0);
+FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2, 1);
+FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v,
+            src - src_stride * (num_taps / 2 - 1), avg_, avx2, 1);
 
 // void vpx_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride,
@@ -996,6 +986,6 @@ FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_, avx2);
 //                         const InterpKernel *filter, int x0_q4,
 //                         int32_t x_step_q4, int y0_q4, int y_step_q4,
 //                         int w, int h);
-FUN_CONV_2D(, avx2);
-FUN_CONV_2D(avg_, avx2);
+FUN_CONV_2D(, avx2, 0);
+FUN_CONV_2D(avg_, avx2, 1);
 #endif  // HAVE_AX2 && HAVE_SSSE3
diff --git a/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c b/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
index 37d1de0f1..63049c934 100644
--- a/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
+++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
@@ -310,10 +310,6 @@ static void vpx_filter_block1d16_v4_ssse3(const uint8_t *src_ptr,
   const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
   int h;
 
-  // We only need to go num_taps/2 - 1 row above the souce, so we move
-  // 3 - (num_taps/2 - 1) = 4 - num_taps/2 = 2 back down
-  src_ptr += src_stride_unrolled;
-
   // Load Kernel
   kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
   kernel_reg = _mm_srai_epi16(kernel_reg, 1);
@@ -483,10 +479,6 @@ static void vpx_filter_block1d8_v4_ssse3(const uint8_t *src_ptr,
   const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
   int h;
 
-  // We only need to go num_taps/2 - 1 row above the souce, so we move
-  // 3 - (num_taps/2 - 1) = 4 - num_taps/2 = 2 back down
-  src_ptr += src_stride_unrolled;
-
   // Load Kernel
   kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
   kernel_reg = _mm_srai_epi16(kernel_reg, 1);
@@ -627,10 +619,6 @@ static void vpx_filter_block1d4_v4_ssse3(const uint8_t *src_ptr,
   const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
   int h;
 
-  // We only need to go num_taps/2 - 1 row above the souce, so we move
-  // 3 - (num_taps/2 - 1) = 4 - num_taps/2 = 2 back down
-  src_ptr += src_stride_unrolled;
-
   // Load Kernel
   kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
   kernel_reg = _mm_srai_epi16(kernel_reg, 1);
@@ -743,10 +731,12 @@ filter8_1dfunction vpx_filter_block1d4_h2_avg_ssse3;
 //                          const InterpKernel *filter, int x0_q4,
 //                          int32_t x_step_q4, int y0_q4,
 //                          int y_step_q4, int w, int h);
-FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , ssse3);
-FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , ssse3);
-FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, ssse3);
-FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_, ssse3);
+FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , ssse3, 0);
+FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * (num_taps / 2 - 1), ,
+            ssse3, 0);
+FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, ssse3, 1);
+FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v,
+            src - src_stride * (num_taps / 2 - 1), avg_, ssse3, 1);
 
 static void filter_horiz_w8_ssse3(const uint8_t *const src,
                                   const ptrdiff_t src_stride,
@@ -1093,5 +1083,5 @@ void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
 //                          const InterpKernel *filter, int x0_q4,
 //                          int32_t x_step_q4, int y0_q4, int y_step_q4,
 //                          int w, int h);
-FUN_CONV_2D(, ssse3);
-FUN_CONV_2D(avg_, ssse3);
+FUN_CONV_2D(, ssse3, 0);
+FUN_CONV_2D(avg_, ssse3, 1);
-- 
2.40.0
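
Note on the row budget: the h+7 -> h+3 saving in this patch follows directly
from the vertical-pass arithmetic. A num_taps filter centered on an output row
reads num_taps/2 - 1 rows above it and num_taps/2 rows below it, so producing
h output rows touches h + num_taps - 1 distinct input rows: h + 7 for 8-tap,
but only h + 3 for 4-tap. The reference implementation below is an
illustrative sketch, not libvpx code; vert_filter_ref and its dense
num_taps-entry kernel are assumptions made for the example, while the rounding
and clamping follow the FILTER_BITS == 7 convention of the convolve8 C code.

#include <stddef.h>
#include <stdint.h>

#define FILTER_BITS 7

/* Illustrative reference vertical pass. For output rows 0..h-1 it reads
 * input rows -(num_taps / 2 - 1) .. h - 1 + num_taps / 2, i.e. exactly
 * h + num_taps - 1 rows, of which num_taps / 2 - 1 lie above the block. */
static void vert_filter_ref(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride, int w, int h,
                            const int16_t *filter, int num_taps) {
  const int top = num_taps / 2 - 1; /* rows needed above each output row */
  int x, y, k;
  for (y = 0; y < h; ++y) {
    for (x = 0; x < w; ++x) {
      int sum = 1 << (FILTER_BITS - 1); /* rounding offset */
      for (k = 0; k < num_taps; ++k) {
        sum += filter[k] * src[(ptrdiff_t)(y + k - top) * src_stride + x];
      }
      sum >>= FILTER_BITS;
      if (sum < 0) sum = 0;
      if (sum > 255) sum = 255;
      dst[(ptrdiff_t)y * dst_stride + x] = (uint8_t)sum;
    }
  }
}

With num_taps == 8 the first output row reaches 3 rows above the block, which
is where the src - 3 * src_stride and fdata2 + 3 * 64 offsets in the 8-tap
path come from; with num_taps == 4 it reaches only 1 row above.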
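
The same arithmetic drives the two-pass flow in FUN_CONV_2D: the horizontal
pass over-produces num_taps - 1 extra rows into the 64-wide fdata2 scratch
buffer, and the vertical pass then starts num_taps / 2 - 1 rows into it. The
sketch below restates that bookkeeping in plain C; convolve_horiz_ref and
convolve_vert_ref are hypothetical stand-ins for the vpx_convolve8 horiz/vert
implementations, not real libvpx entry points. It also shows why the avg
variants pass is_avg = 1: the 4-tap avg kernels are #defined back to the
8-tap implementations, so the avg path must keep the 8-tap row geometry until
a true 4-tap vertical avg filter exists.

#include <stddef.h>
#include <stdint.h>

/* Hypothetical 1-D helper declarations; only the row bookkeeping matters. */
void convolve_horiz_ref(const uint8_t *src, ptrdiff_t src_stride,
                        uint8_t *dst, ptrdiff_t dst_stride,
                        const int16_t *filter, int w, int h);
void convolve_vert_ref(const uint8_t *src, ptrdiff_t src_stride,
                       uint8_t *dst, ptrdiff_t dst_stride,
                       const int16_t *filter, int w, int h);

#define FDATA_STRIDE 64

static void convolve8_2d_sketch(const uint8_t *src, ptrdiff_t src_stride,
                                uint8_t *dst, ptrdiff_t dst_stride,
                                const int16_t *filter, int w, int h,
                                int is_avg) {
  /* Worst case: a 64-wide block and 64 + 8 - 1 = 71 intermediate rows. */
  uint8_t fdata2[FDATA_STRIDE * 71];
  int num_taps;

  if (filter[0] | filter[1] | filter[6] | filter[7]) {
    num_taps = 8; /* genuine 8-tap kernel */
  } else if (filter[2] | filter[5]) {
    /* No 4-tap vertical avg kernel yet; keep the 8-tap geometry there. */
    num_taps = is_avg ? 8 : 4;
  } else {
    num_taps = 2; /* bilinear */
  }

  /* Horizontal pass: start num_taps / 2 - 1 rows above the block and
   * produce h + num_taps - 1 rows of horizontally filtered data. */
  convolve_horiz_ref(src - (num_taps / 2 - 1) * src_stride, src_stride,
                     fdata2, FDATA_STRIDE, filter, w, h + num_taps - 1);

  /* Vertical pass: skip back down to the row aligned with the first output
   * row, exactly like fdata2 + 64 * (num_taps / 2 - 1) in the macro. */
  convolve_vert_ref(fdata2 + FDATA_STRIDE * (num_taps / 2 - 1), FDATA_STRIDE,
                    dst, dst_stride, filter, w, h);
}

For an 8-tap kernel this is h + 7 intermediate rows with a 3-row lead-in; for
a non-avg 4-tap kernel it is h + 3 rows with a 1-row lead-in, which is the
entire saving measured in the commit message above.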