From c182725cbc9e1e4892784a24c32b1bed80047b0c Mon Sep 17 00:00:00 2001
From: chiyotsai
Date: Fri, 2 Nov 2018 17:08:05 -0700
Subject: [PATCH] Remove unnecessary calculation in 4-tap interpolation filter

Reduces the number of rows calculated by the 2D 4-tap interpolation
filter from h+7 rows to h+3 rows. Also fixes a bug in the avx2 function
for 4-tap filters where the last row was computed incorrectly.

Performance:
            | Baseline | Result   | Pct Gain |
bitdepth lo | 4.00 fps | 4.02 fps | 0.5%     |
bitdepth 10 | 1.90 fps | 1.91 fps | 0.5%     |

The performance is evaluated at speed 1 on jets.y4m at bitrate 500 over
100 frames. No BDBR loss is observed.

Change-Id: I90b0d4d697319b7bba599f03c5dc01abd85d13b1
---
 vpx_dsp/x86/convolve.h                     | 217 ++++++++++++---------
 vpx_dsp/x86/highbd_convolve_avx2.c         |  34 ++--
 vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c  |  49 ++---
 vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c  |  26 +--
 vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c |  26 +--
 5 files changed, 172 insertions(+), 180 deletions(-)

diff --git a/vpx_dsp/x86/convolve.h b/vpx_dsp/x86/convolve.h
index 8398ec3c1..b75d4d721 100644
--- a/vpx_dsp/x86/convolve.h
+++ b/vpx_dsp/x86/convolve.h
@@ -16,11 +16,17 @@
 #include "vpx/vpx_integer.h"
 #include "vpx_ports/mem.h"
 
+// TODO(chiyotsai@google.com): Refactor the code here. Currently this is
+// pretty hacky and awful to read. Note that there is a filter_x[3] == 128
+// check in HIGHBD_FUN_CONV_2D to avoid a segfault, since the C function
+// assumes the filter is always 8-tap.
 typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch,
                                 uint8_t *output_ptr, ptrdiff_t out_pitch,
                                 uint32_t output_height, const int16_t *filter);
 
-#define FUN_CONV_1D(name, offset, step_q4, dir, src_start, avg, opt) \
+// TODO(chiyotsai@google.com): Remove the is_avg argument to these macros
+// once we have a 4-tap vertical avg filter.
+#define FUN_CONV_1D(name, offset, step_q4, dir, src_start, avg, opt, is_avg) \
   void vpx_convolve8_##name##_##opt( \
       const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \
       ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, \
@@ -33,6 +39,7 @@ typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch,
     assert(filter_row[3] != 128); \
     assert(step_q4 == 16); \
     if (filter_row[0] | filter_row[1] | filter_row[6] | filter_row[7]) { \
+      const int num_taps = 8; \
       while (w >= 16) { \
         vpx_filter_block1d16_##dir##8_##avg##opt(src_start, src_stride, dst, \
                                                  dst_stride, h, filter_row); \
@@ -47,7 +54,9 @@ typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch,
       vpx_filter_block1d4_##dir##8_##avg##opt(src_start, src_stride, dst, \
                                               dst_stride, h, filter_row); \
     } \
+    (void)num_taps; \
   } else if (filter_row[2] | filter_row[5]) { \
+    const int num_taps = is_avg ? 8 : 4; \
     while (w >= 16) { \
       vpx_filter_block1d16_##dir##4_##avg##opt(src_start, src_stride, dst, \
                                                dst_stride, h, filter_row); \
@@ -62,25 +71,28 @@ typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch,
       vpx_filter_block1d4_##dir##4_##avg##opt(src_start, src_stride, dst, \
                                               dst_stride, h, filter_row); \
     } \
+    (void)num_taps; \
   } else { \
+    const int num_taps = 2; \
     while (w >= 16) { \
-      vpx_filter_block1d16_##dir##2_##avg##opt(src, src_stride, dst, \
+      vpx_filter_block1d16_##dir##2_##avg##opt(src_start, src_stride, dst, \
                                                dst_stride, h, filter_row); \
       src += 16; \
      dst += 16; \
       w -= 16; \
     } \
     if (w == 8) { \
-      vpx_filter_block1d8_##dir##2_##avg##opt(src, src_stride, dst, \
+      vpx_filter_block1d8_##dir##2_##avg##opt(src_start, src_stride, dst, \
                                               dst_stride, h, filter_row); \
     } else if (w == 4) { \
-      vpx_filter_block1d4_##dir##2_##avg##opt(src, src_stride, dst, \
+      vpx_filter_block1d4_##dir##2_##avg##opt(src_start, src_stride, dst, \
                                               dst_stride, h, filter_row); \
    } \
+    (void)num_taps; \
  } \
 }
 
-#define FUN_CONV_2D(avg, opt) \
+#define FUN_CONV_2D(avg, opt, is_avg) \
   void vpx_convolve8_##avg##opt( \
       const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \
       ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, \
@@ -94,7 +106,7 @@ typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch,
     assert(h <= 64); \
     assert(x_step_q4 == 16); \
     assert(y_step_q4 == 16); \
-    if (filter_x[0] | filter_x[1] | filter_x[2]) { \
+    if (filter_x[0] | filter_x[1] | filter_x[6] | filter_x[7]) { \
       DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 71]); \
       vpx_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64, \
                                 filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, \
@@ -102,6 +114,15 @@ typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch,
       vpx_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride, \
                                       filter, x0_q4, x_step_q4, y0_q4, \
                                       y_step_q4, w, h); \
+    } else if (filter_x[2] | filter_x[5]) { \
+      const int num_taps = is_avg ? 8 : 4; \
+      DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 71]); \
+      vpx_convolve8_horiz_##opt( \
+          src - (num_taps / 2 - 1) * src_stride, src_stride, fdata2, 64, \
+          filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, h + num_taps - 1); \
+      vpx_convolve8_##avg##vert_##opt(fdata2 + 64 * (num_taps / 2 - 1), 64, \
+                                      dst, dst_stride, filter, x0_q4, \
+                                      x_step_q4, y0_q4, y_step_q4, w, h); \
     } else { \
       DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 65]); \
       vpx_convolve8_horiz_##opt(src, src_stride, fdata2, 64, filter, x0_q4, \
@@ -121,89 +142,96 @@ typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr,
                                        unsigned int output_height,
                                        const int16_t *filter, int bd);
 
-#define HIGH_FUN_CONV_1D(name, offset, step_q4, dir, src_start, avg, opt) \
-  void vpx_highbd_convolve8_##name##_##opt( \
-      const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, \
-      ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, \
-      int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { \
-    const int16_t *filter_row = filter[offset]; \
-    if (step_q4 == 16 && filter_row[3] != 128) { \
-      if (filter_row[0] | filter_row[1] | filter_row[6] | filter_row[7]) { \
-        while (w >= 16) { \
-          vpx_highbd_filter_block1d16_##dir##8_##avg##opt( \
-              src_start, src_stride, dst, dst_stride, h, filter_row, bd); \
-          src += 16; \
-          dst += 16; \
-          w -= 16; \
-        } \
-        while (w >= 8) { \
-          vpx_highbd_filter_block1d8_##dir##8_##avg##opt( \
-              src_start, src_stride, dst, dst_stride, h, filter_row, bd); \
-          src += 8; \
-          dst += 8; \
-          w -= 8; \
-        } \
-        while (w >= 4) { \
-          vpx_highbd_filter_block1d4_##dir##8_##avg##opt( \
-              src_start, src_stride, dst, dst_stride, h, filter_row, bd); \
-          src += 4; \
-          dst += 4; \
-          w -= 4; \
-        } \
-      } else if (filter_row[2] | filter_row[5]) { \
-        while (w >= 16) { \
-          vpx_highbd_filter_block1d16_##dir##4_##avg##opt( \
-              src_start, src_stride, dst, dst_stride, h, filter_row, bd); \
-          src += 16; \
-          dst += 16; \
-          w -= 16; \
-        } \
-        while (w >= 8) { \
-          vpx_highbd_filter_block1d8_##dir##4_##avg##opt( \
-              src_start, src_stride, dst, dst_stride, h, filter_row, bd); \
-          src += 8; \
-          dst += 8; \
-          w -= 8; \
-        } \
-        while (w >= 4) { \
-          vpx_highbd_filter_block1d4_##dir##4_##avg##opt( \
-              src_start, src_stride, dst, dst_stride, h, filter_row, bd); \
-          src += 4; \
-          dst += 4; \
-          w -= 4; \
-        } \
-      } else { \
-        while (w >= 16) { \
-          vpx_highbd_filter_block1d16_##dir##2_##avg##opt( \
-              src, src_stride, dst, dst_stride, h, filter_row, bd); \
-          src += 16; \
-          dst += 16; \
-          w -= 16; \
-        } \
-        while (w >= 8) { \
-          vpx_highbd_filter_block1d8_##dir##2_##avg##opt( \
-              src, src_stride, dst, dst_stride, h, filter_row, bd); \
-          src += 8; \
-          dst += 8; \
-          w -= 8; \
-        } \
-        while (w >= 4) { \
-          vpx_highbd_filter_block1d4_##dir##2_##avg##opt( \
-              src, src_stride, dst, dst_stride, h, filter_row, bd); \
-          src += 4; \
-          dst += 4; \
-          w -= 4; \
-        } \
-      } \
-    } \
-    if (w) { \
-      vpx_highbd_convolve8_##name##_c(src, src_stride, dst, dst_stride, \
-                                      filter, x0_q4, x_step_q4, y0_q4, \
-                                      y_step_q4, w, h, bd); \
-    } \
+#define HIGH_FUN_CONV_1D(name, offset, step_q4, dir, src_start, avg, opt, \
+                         is_avg) \
+  void vpx_highbd_convolve8_##name##_##opt( \
+      const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, \
+      ptrdiff_t dst_stride, const InterpKernel *filter_kernel, int x0_q4, \
+      int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { \
+    const int16_t *filter_row = filter_kernel[offset]; \
+    if (step_q4 == 16 && filter_row[3] != 128) { \
+      if (filter_row[0] | filter_row[1] | filter_row[6] | filter_row[7]) { \
+        const int num_taps = 8; \
+        while (w >= 16) { \
+          vpx_highbd_filter_block1d16_##dir##8_##avg##opt( \
+              src_start, src_stride, dst, dst_stride, h, filter_row, bd); \
+          src += 16; \
+          dst += 16; \
+          w -= 16; \
+        } \
+        while (w >= 8) { \
+          vpx_highbd_filter_block1d8_##dir##8_##avg##opt( \
+              src_start, src_stride, dst, dst_stride, h, filter_row, bd); \
+          src += 8; \
+          dst += 8; \
+          w -= 8; \
+        } \
+        while (w >= 4) { \
+          vpx_highbd_filter_block1d4_##dir##8_##avg##opt( \
+              src_start, src_stride, dst, dst_stride, h, filter_row, bd); \
+          src += 4; \
+          dst += 4; \
+          w -= 4; \
+        } \
+        (void)num_taps; \
+      } else if (filter_row[2] | filter_row[5]) { \
+        const int num_taps = is_avg ? 8 : 4; \
+        while (w >= 16) { \
+          vpx_highbd_filter_block1d16_##dir##4_##avg##opt( \
+              src_start, src_stride, dst, dst_stride, h, filter_row, bd); \
+          src += 16; \
+          dst += 16; \
+          w -= 16; \
+        } \
+        while (w >= 8) { \
+          vpx_highbd_filter_block1d8_##dir##4_##avg##opt( \
+              src_start, src_stride, dst, dst_stride, h, filter_row, bd); \
+          src += 8; \
+          dst += 8; \
+          w -= 8; \
+        } \
+        while (w >= 4) { \
+          vpx_highbd_filter_block1d4_##dir##4_##avg##opt( \
+              src_start, src_stride, dst, dst_stride, h, filter_row, bd); \
+          src += 4; \
+          dst += 4; \
+          w -= 4; \
+        } \
+        (void)num_taps; \
+      } else { \
+        const int num_taps = 2; \
+        while (w >= 16) { \
+          vpx_highbd_filter_block1d16_##dir##2_##avg##opt( \
+              src_start, src_stride, dst, dst_stride, h, filter_row, bd); \
+          src += 16; \
+          dst += 16; \
+          w -= 16; \
+        } \
+        while (w >= 8) { \
+          vpx_highbd_filter_block1d8_##dir##2_##avg##opt( \
+              src_start, src_stride, dst, dst_stride, h, filter_row, bd); \
+          src += 8; \
+          dst += 8; \
+          w -= 8; \
+        } \
+        while (w >= 4) { \
+          vpx_highbd_filter_block1d4_##dir##2_##avg##opt( \
+              src_start, src_stride, dst, dst_stride, h, filter_row, bd); \
+          src += 4; \
+          dst += 4; \
+          w -= 4; \
+        } \
+        (void)num_taps; \
+      } \
+    } \
+    if (w) { \
+      vpx_highbd_convolve8_##name##_c(src, src_stride, dst, dst_stride, \
+                                      filter_kernel, x0_q4, x_step_q4, y0_q4, \
+                                      y_step_q4, w, h, bd); \
+    } \
   }
 
-#define HIGH_FUN_CONV_2D(avg, opt) \
+#define HIGH_FUN_CONV_2D(avg, opt, is_avg) \
   void vpx_highbd_convolve8_##avg##opt( \
       const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, \
       ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, \
@@ -212,7 +240,8 @@ typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr,
     assert(w <= 64); \
     assert(h <= 64); \
     if (x_step_q4 == 16 && y_step_q4 == 16) { \
-      if ((filter_x[0] | filter_x[1] | filter_x[2]) || filter_x[3] == 128) { \
+      if ((filter_x[0] | filter_x[1] | filter_x[6] | filter_x[7]) || \
+          filter_x[3] == 128) { \
         DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71]); \
         vpx_highbd_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \
                                          fdata2, 64, filter, x0_q4, x_step_q4, \
@@ -220,6 +249,16 @@ typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr,
         vpx_highbd_convolve8_##avg##vert_##opt( \
             fdata2 + 192, 64, dst, dst_stride, filter, x0_q4, x_step_q4, \
            y0_q4, y_step_q4, w, h, bd); \
+      } else if (filter_x[2] | filter_x[5]) { \
+        const int num_taps = is_avg ? 8 : 4; \
+        DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71]); \
+        vpx_highbd_convolve8_horiz_##opt( \
+            src - (num_taps / 2 - 1) * src_stride, src_stride, fdata2, 64, \
+            filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, h + num_taps - 1, \
+            bd); \
+        vpx_highbd_convolve8_##avg##vert_##opt( \
+            fdata2 + 64 * (num_taps / 2 - 1), 64, dst, dst_stride, filter, \
+            x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd); \
       } else { \
         DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 65]); \
         vpx_highbd_convolve8_horiz_##opt(src, src_stride, fdata2, 64, filter, \
@@ -235,6 +274,6 @@ typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr,
                                        bd); \
     } \
   }
-#endif // CONFIG_VP9_HIGHBITDEPTH
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
 #endif  // VPX_VPX_DSP_X86_CONVOLVE_H_
diff --git a/vpx_dsp/x86/highbd_convolve_avx2.c b/vpx_dsp/x86/highbd_convolve_avx2.c
index aef067ea7..320962561 100644
--- a/vpx_dsp/x86/highbd_convolve_avx2.c
+++ b/vpx_dsp/x86/highbd_convolve_avx2.c
@@ -1089,22 +1089,19 @@ static void vpx_highbd_filter_block1d8_h4_avx2(
 
   // Repeat for the last row if needed
   if (h > 0) {
-    src_reg = _mm256_loadu_si256((const __m256i *)src_ptr);
-
-    // Reorder into 2 1 1 2
-    src_reg = _mm256_permute4x64_epi64(src_reg, 0x94);
-
+    src_reg = mm256_loadu2_si128(src_ptr, src_ptr + 4);
     src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0);
     src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2);
 
     res_reg = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2,
                                    &kernel_reg_23, &kernel_reg_45);
 
-    res_reg = mm256_round_epi32(&res_first, &reg_round, CONV8_ROUNDING_BITS);
+    res_reg = mm256_round_epi32(&res_reg, &reg_round, CONV8_ROUNDING_BITS);
 
     res_reg = _mm256_packus_epi32(res_reg, res_reg);
-    res_reg = _mm256_permute4x64_epi64(res_reg, 0x8);
+    res_reg = _mm256_min_epi16(res_reg, reg_max);
 
-    _mm_store_si128((__m128i *)dst_ptr, _mm256_castsi256_si128(res_reg));
+    mm256_storeu2_epi64((__m128i *)dst_ptr, (__m128i *)(dst_ptr + 4), &res_reg);
   }
 }
@@ -1279,10 +1276,6 @@ static void vpx_highbd_filter_block1d4_v4_avx2(
   const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
   int h;
 
-  // We only need to go num_taps/2 - 1 row above the souce, so we move
-  // 3 - (num_taps/2 - 1) = 4 - num_taps/2 = 2 back down
-  src_ptr += src_stride_unrolled;
-
   // Load Kernel
   kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel);
   kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128);
@@ -1368,10 +1361,6 @@ static void vpx_highbd_filter_block1d8_v4_avx2(
   const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
   int h;
 
-  // We only need to go num_taps/2 - 1 row above the souce, so we move
-  // 3 - (num_taps/2 - 1) = 4 - num_taps/2 = 2 back down
-  src_ptr += src_stride_unrolled;
-
   // Load Kernel
   kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel);
   kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128);
@@ -1476,9 +1465,10 @@ highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_sse2;
 #define vpx_highbd_filter_block1d4_h4_avg_avx2 \
   vpx_highbd_filter_block1d4_h8_avg_avx2
 
-HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2);
-HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , avx2);
-HIGH_FUN_CONV_2D(, avx2);
+HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2, 0);
+HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v,
+                 src - src_stride * (num_taps / 2 - 1), , avx2, 0);
+HIGH_FUN_CONV_2D(, avx2, 0);
 
 // From vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm.
 highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_avg_sse2;
@@ -1497,9 +1487,9 @@ highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_avg_sse2;
 #define vpx_highbd_filter_block1d4_v2_avg_avx2 \
   vpx_highbd_filter_block1d4_v2_avg_sse2
 
-HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2);
-HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_,
-                 avx2);
-HIGH_FUN_CONV_2D(avg_, avx2);
+HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2, 1);
+HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v,
+                 src - src_stride * (num_taps / 2 - 1), avg_, avx2, 1);
+HIGH_FUN_CONV_2D(avg_, avx2, 1);
 
 #undef HIGHBD_FUNC
diff --git a/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c b/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c
index e40fe693a..e0e8b8f90 100644
--- a/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c
+++ b/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c
@@ -133,10 +133,6 @@ static void vpx_filter_block1d16_v4_sse2(const uint8_t *src_ptr,
   const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
   int h;
 
-  // We only need to go num_taps/2 - 1 row above the souce, so we move
-  // 3 - (num_taps/2 - 1) = 4 - num_taps/2 = 2 back down
-  src_ptr += src_stride_unrolled;
-
   // Load Kernel
   kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
   kernel_reg = _mm_srai_epi16(kernel_reg, 1);
@@ -345,10 +341,6 @@ static void vpx_filter_block1d8_v4_sse2(const uint8_t *src_ptr,
   const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
   int h;
 
-  // We only need to go num_taps/2 - 1 row above the souce, so we move
-  // 3 - (num_taps/2 - 1) = 4 - num_taps/2 = 2 back down
-  src_ptr += src_stride_unrolled;
-
   // Load Kernel
   kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
   kernel_reg = _mm_srai_epi16(kernel_reg, 1);
@@ -531,10 +523,6 @@ static void vpx_filter_block1d4_v4_sse2(const uint8_t *src_ptr,
   const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
   int h;
 
-  // We only need to go num_taps/2 - 1 row above the souce, so we move
-  // 3 - (num_taps/2 - 1) = 4 - num_taps/2 = 2 back down
-  src_ptr += src_stride_unrolled;
-
   // Load Kernel
   kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
   kernel_reg = _mm_srai_epi16(kernel_reg, 1);
@@ -713,10 +701,6 @@ static void vpx_highbd_filter_block1d4_v4_sse2(
   const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
   int h;
 
-  // We only need to go num_taps/2 - 1 row above the source, so we move
-  // 3 - (num_taps/2 - 1) = 4 - num_taps/2 = 2 back down
-  src_ptr += src_stride_unrolled;
-
   // Load Kernel
   kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
   kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
@@ -896,10 +880,6 @@ static void vpx_highbd_filter_block1d8_v4_sse2(
   const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
   int h;
 
-  // We only need to go num_taps/2 - 1 row above the source, so we move
-  // 3 - (num_taps/2 - 1) = 4 - num_taps/2 = 2 back down
-  src_ptr += src_stride_unrolled;
-
   // Load Kernel
   kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
   kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
@@ -1060,10 +1040,12 @@ filter8_1dfunction vpx_filter_block1d4_h2_avg_sse2;
 //                               const InterpKernel *filter, int x0_q4,
 //                               int32_t x_step_q4, int y0_q4, int y_step_q4,
 //                               int w, int h);
-FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2);
-FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , sse2);
-FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, sse2);
-FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_, sse2);
+FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2, 0);
+FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - (num_taps / 2 - 1) * src_stride, ,
+            sse2, 0);
+FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, sse2, 1);
+FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v,
+            src - (num_taps / 2 - 1) * src_stride, avg_, sse2, 1);
 
 // void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
@@ -1075,8 +1057,8 @@ FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_, sse2);
 //                         const InterpKernel *filter, int x0_q4,
 //                         int32_t x_step_q4, int y0_q4, int y_step_q4,
 //                         int w, int h);
-FUN_CONV_2D(, sse2);
-FUN_CONV_2D(avg_, sse2);
+FUN_CONV_2D(, sse2, 0);
+FUN_CONV_2D(avg_, sse2, 1);
 
 #if CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64
 // From vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm.
@@ -1157,11 +1139,12 @@ highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_avg_sse2;
 //                                    const int16_t *filter_y,
 //                                    int y_step_q4,
 //                                    int w, int h, int bd);
-HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2);
-HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , sse2);
-HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, sse2);
-HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_,
-                 sse2);
+HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2, 0);
+HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v,
+                 src - src_stride * (num_taps / 2 - 1), , sse2, 0);
+HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, sse2, 1);
+HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v,
+                 src - src_stride * (num_taps / 2 - 1), avg_, sse2, 1);
 
 // void vpx_highbd_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
@@ -1173,6 +1156,6 @@ HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_,
 //                                const InterpKernel *filter, int x0_q4,
 //                                int32_t x_step_q4, int y0_q4,
 //                                int y_step_q4, int w, int h, int bd);
-HIGH_FUN_CONV_2D(, sse2);
-HIGH_FUN_CONV_2D(avg_, sse2);
+HIGH_FUN_CONV_2D(, sse2, 0);
+HIGH_FUN_CONV_2D(avg_, sse2, 1);
 #endif  // CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64
diff --git a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
index ccedfe206..d381a7a47 100644
--- a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
+++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
@@ -464,10 +464,6 @@ static void vpx_filter_block1d16_v4_avx2(const uint8_t *src_ptr,
   const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
   int h;
 
-  // We only need to go num_taps/2 - 1 row above the souce, so we move
-  // 3 - (num_taps/2 - 1) = 4 - num_taps/2 = 2 back down
-  src_ptr += src_stride_unrolled;
-
   // Load Kernel
   kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
   kernel_reg = _mm_srai_epi16(kernel_reg, 1);
@@ -665,10 +661,6 @@ static void vpx_filter_block1d8_v4_avx2(const uint8_t *src_ptr,
   const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
   int h;
 
-  // We only need to go num_taps/2 - 1 row above the souce, so we move
-  // 3 - (num_taps/2 - 1) = 4 - num_taps/2 = 2 back down
-  src_ptr += src_stride_unrolled;
-
   // Load Kernel
   kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel);
   kernel_reg_128 = _mm_srai_epi16(kernel_reg_128, 1);
@@ -839,10 +831,6 @@ static void vpx_filter_block1d4_v4_avx2(const uint8_t *src_ptr,
   const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
   int h;
 
-  // We only need to go num_taps/2 - 1 row above the souce, so we move
-  // 3 - (num_taps/2 - 1) = 4 - num_taps/2 = 2 back down
-  src_ptr += src_stride_unrolled;
-
   // Load Kernel
   kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel);
   kernel_reg_128 = _mm_srai_epi16(kernel_reg_128, 1);
@@ -981,10 +969,12 @@ filter8_1dfunction vpx_filter_block1d4_h2_avg_ssse3;
 //                          const InterpKernel *filter, int x0_q4,
 //                          int32_t x_step_q4, int y0_q4,
 //                          int y_step_q4, int w, int h);
-FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2);
-FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , avx2);
-FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2);
-FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_, avx2);
+FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2, 0);
+FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * (num_taps / 2 - 1), ,
+            avx2, 0);
+FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2, 1);
+FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v,
+            src - src_stride * (num_taps / 2 - 1), avg_, avx2, 1);
 
 // void vpx_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride,
@@ -996,6 +986,6 @@ FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_, avx2);
 //                         const InterpKernel *filter, int x0_q4,
 //                         int32_t x_step_q4, int y0_q4, int y_step_q4,
 //                         int w, int h);
-FUN_CONV_2D(, avx2);
-FUN_CONV_2D(avg_, avx2);
+FUN_CONV_2D(, avx2, 0);
+FUN_CONV_2D(avg_, avx2, 1);
 #endif  // HAVE_AX2 && HAVE_SSSE3
diff --git a/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c b/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
index 37d1de0f1..63049c934 100644
--- a/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
+++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
@@ -310,10 +310,6 @@ static void vpx_filter_block1d16_v4_ssse3(const uint8_t *src_ptr,
   const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
   int h;
 
-  // We only need to go num_taps/2 - 1 row above the souce, so we move
-  // 3 - (num_taps/2 - 1) = 4 - num_taps/2 = 2 back down
-  src_ptr += src_stride_unrolled;
-
   // Load Kernel
   kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
   kernel_reg = _mm_srai_epi16(kernel_reg, 1);
@@ -483,10 +479,6 @@ static void vpx_filter_block1d8_v4_ssse3(const uint8_t *src_ptr,
   const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
   int h;
 
-  // We only need to go num_taps/2 - 1 row above the souce, so we move
-  // 3 - (num_taps/2 - 1) = 4 - num_taps/2 = 2 back down
-  src_ptr += src_stride_unrolled;
-
   // Load Kernel
   kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
   kernel_reg = _mm_srai_epi16(kernel_reg, 1);
@@ -627,10 +619,6 @@ static void vpx_filter_block1d4_v4_ssse3(const uint8_t *src_ptr,
   const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
   int h;
 
-  // We only need to go num_taps/2 - 1 row above the souce, so we move
-  // 3 - (num_taps/2 - 1) = 4 - num_taps/2 = 2 back down
-  src_ptr += src_stride_unrolled;
-
   // Load Kernel
   kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
   kernel_reg = _mm_srai_epi16(kernel_reg, 1);
@@ -743,10 +731,12 @@ filter8_1dfunction vpx_filter_block1d4_h2_avg_ssse3;
 //                          const InterpKernel *filter, int x0_q4,
 //                          int32_t x_step_q4, int y0_q4,
 //                          int y_step_q4, int w, int h);
-FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , ssse3);
-FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , ssse3);
-FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, ssse3);
-FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_, ssse3);
+FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , ssse3, 0);
+FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * (num_taps / 2 - 1), ,
+            ssse3, 0);
+FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, ssse3, 1);
+FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v,
+            src - src_stride * (num_taps / 2 - 1), avg_, ssse3, 1);
 
 static void filter_horiz_w8_ssse3(const uint8_t *const src,
                                   const ptrdiff_t src_stride,
@@ -1093,5 +1083,5 @@ void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
 //                          const InterpKernel *filter, int x0_q4,
 //                          int32_t x_step_q4, int y0_q4, int y_step_q4,
 //                          int w, int h);
-FUN_CONV_2D(, ssse3);
-FUN_CONV_2D(avg_, ssse3);
+FUN_CONV_2D(, ssse3, 0);
+FUN_CONV_2D(avg_, ssse3, 1);
-- 
2.40.0
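
Note on the row budget: the h+7 -> h+3 saving in this patch follows directly
from the vertical-pass arithmetic. A num_taps filter centered on an output row
reads num_taps/2 - 1 rows above it and num_taps/2 rows below it, so producing
h output rows touches h + num_taps - 1 distinct input rows: h + 7 for 8-tap,
but only h + 3 for 4-tap. The reference implementation below is an
illustrative sketch, not libvpx code; vert_filter_ref and its dense
num_taps-entry kernel are assumptions made for the example, while the rounding
and clamping follow the FILTER_BITS == 7 convention of the convolve8 C code.

#include <stddef.h>
#include <stdint.h>

#define FILTER_BITS 7

/* Illustrative reference vertical pass. For output rows 0..h-1 it reads
 * input rows -(num_taps / 2 - 1) .. h - 1 + num_taps / 2, i.e. exactly
 * h + num_taps - 1 rows, of which num_taps / 2 - 1 lie above the block. */
static void vert_filter_ref(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride, int w, int h,
                            const int16_t *filter, int num_taps) {
  const int top = num_taps / 2 - 1; /* rows needed above each output row */
  int x, y, k;
  for (y = 0; y < h; ++y) {
    for (x = 0; x < w; ++x) {
      int sum = 1 << (FILTER_BITS - 1); /* rounding offset */
      for (k = 0; k < num_taps; ++k) {
        sum += filter[k] * src[(ptrdiff_t)(y + k - top) * src_stride + x];
      }
      sum >>= FILTER_BITS;
      if (sum < 0) sum = 0;
      if (sum > 255) sum = 255;
      dst[(ptrdiff_t)y * dst_stride + x] = (uint8_t)sum;
    }
  }
}

With num_taps == 8 the first output row reaches 3 rows above the block, which
is where the src - 3 * src_stride and fdata2 + 3 * 64 offsets in the 8-tap
path come from; with num_taps == 4 it reaches only 1 row above.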
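
The same arithmetic drives the two-pass flow in FUN_CONV_2D: the horizontal
pass over-produces num_taps - 1 extra rows into the 64-wide fdata2 scratch
buffer, and the vertical pass then starts num_taps / 2 - 1 rows into it. The
sketch below restates that bookkeeping in plain C; convolve_horiz_ref and
convolve_vert_ref are hypothetical stand-ins for the vpx_convolve8 horiz/vert
implementations, not real libvpx entry points. It also shows why the avg
variants pass is_avg = 1: the 4-tap avg kernels are #defined back to the
8-tap implementations, so the avg path must keep the 8-tap row geometry until
a true 4-tap vertical avg filter exists.

#include <stddef.h>
#include <stdint.h>

/* Hypothetical 1-D helper declarations; only the row bookkeeping matters. */
void convolve_horiz_ref(const uint8_t *src, ptrdiff_t src_stride,
                        uint8_t *dst, ptrdiff_t dst_stride,
                        const int16_t *filter, int w, int h);
void convolve_vert_ref(const uint8_t *src, ptrdiff_t src_stride,
                       uint8_t *dst, ptrdiff_t dst_stride,
                       const int16_t *filter, int w, int h);

#define FDATA_STRIDE 64

static void convolve8_2d_sketch(const uint8_t *src, ptrdiff_t src_stride,
                                uint8_t *dst, ptrdiff_t dst_stride,
                                const int16_t *filter, int w, int h,
                                int is_avg) {
  /* Worst case: a 64-wide block and 64 + 8 - 1 = 71 intermediate rows. */
  uint8_t fdata2[FDATA_STRIDE * 71];
  int num_taps;

  if (filter[0] | filter[1] | filter[6] | filter[7]) {
    num_taps = 8; /* genuine 8-tap kernel */
  } else if (filter[2] | filter[5]) {
    /* No 4-tap vertical avg kernel yet; keep the 8-tap geometry there. */
    num_taps = is_avg ? 8 : 4;
  } else {
    num_taps = 2; /* bilinear */
  }

  /* Horizontal pass: start num_taps / 2 - 1 rows above the block and
   * produce h + num_taps - 1 rows of horizontally filtered data. */
  convolve_horiz_ref(src - (num_taps / 2 - 1) * src_stride, src_stride,
                     fdata2, FDATA_STRIDE, filter, w, h + num_taps - 1);

  /* Vertical pass: skip back down to the row aligned with the first output
   * row, exactly like fdata2 + 64 * (num_taps / 2 - 1) in the macro. */
  convolve_vert_ref(fdata2 + FDATA_STRIDE * (num_taps / 2 - 1), FDATA_STRIDE,
                    dst, dst_stride, filter, w, h);
}

For an 8-tap kernel this is h + 7 intermediate rows with a 3-row lead-in; for
a non-avg 4-tap kernel it is h + 3 rows with a 1-row lead-in, which is the
entire saving measured in the commit message above.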