From 505f2ed7fc3090b2fc33d11cca571acdd4825d4c Mon Sep 17 00:00:00 2001
From: chiyotsai
Date: Fri, 26 Oct 2018 14:14:28 -0700
Subject: [PATCH] Add AVX2 support for hbd 4-tap interpolation filter.

Speed gain:
BIT DEPTH | 8TAP FPS | 4TAP FPS | PCT INC |
       10 |     1.69 |     1.85 |   9.46% |
       12 |     1.64 |     1.78 |   8.54% |

Speed test is done on jet.y4m on speed 1 profile 2 over 100 frames with
br=500.

Change-Id: I411e122553e2c466be7a26e64b4dd144efb884a9
---
 vpx_dsp/x86/convolve.h             |  24 +-
 vpx_dsp/x86/convolve_avx2.h        |  16 ++
 vpx_dsp/x86/highbd_convolve_avx2.c | 445 +++++++++++++++++++++++++++--
 vpx_dsp/x86/vpx_asm_stubs.c        |  19 ++
 4 files changed, 478 insertions(+), 26 deletions(-)

diff --git a/vpx_dsp/x86/convolve.h b/vpx_dsp/x86/convolve.h
index f47cce4d2..aa60f44f7 100644
--- a/vpx_dsp/x86/convolve.h
+++ b/vpx_dsp/x86/convolve.h
@@ -128,7 +128,7 @@ typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr,
                             int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { \
     const int16_t *filter = filter_kernel[offset]; \
     if (step_q4 == 16 && filter[3] != 128) { \
-      if (filter[0] | filter[1] | filter[2]) { \
+      if (filter[0] | filter[1] | filter[6] | filter[7]) { \
         while (w >= 16) { \
           vpx_highbd_filter_block1d16_##dir##8_##avg##opt( \
               src_start, src_stride, dst, dst_stride, h, filter, bd); \
@@ -150,6 +150,28 @@ typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr,
           dst += 4; \
           w -= 4; \
         } \
+      } else if (filter[2] | filter[5]) { \
+        while (w >= 16) { \
+          vpx_highbd_filter_block1d16_##dir##4_##avg##opt( \
+              src_start, src_stride, dst, dst_stride, h, filter, bd); \
+          src += 16; \
+          dst += 16; \
+          w -= 16; \
+        } \
+        while (w >= 8) { \
+          vpx_highbd_filter_block1d8_##dir##4_##avg##opt( \
+              src_start, src_stride, dst, dst_stride, h, filter, bd); \
+          src += 8; \
+          dst += 8; \
+          w -= 8; \
+        } \
+        while (w >= 4) { \
+          vpx_highbd_filter_block1d4_##dir##4_##avg##opt( \
+              src_start, src_stride, dst, dst_stride, h, filter, bd); \
+          src += 4; \
+          dst += 4; \
+          w -= 4; \
+        } \
       } else { \
         while (w >= 16) { \
           vpx_highbd_filter_block1d16_##dir##2_##avg##opt( \
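A note for reviewers on the hunk above: the kernel length is inferred purely from which
taps are non-zero (filter[3] != 128 has already ruled out the pure-copy kernel). A
minimal sketch of that selection, illustrative only and not part of the patch (the
helper name is made up):

#include <stdint.h>

// Mirrors the tap-count selection in HIGH_FUN_CONV_1D: non-zero outer taps
// require the 8-tap kernels; zero outer taps with non-zero taps 2 or 5 need
// only 4 taps; a kernel with just taps 3 and 4 is bilinear (2-tap).
static int num_taps_needed(const int16_t *filter) {
  if (filter[0] | filter[1] | filter[6] | filter[7]) return 8;
  if (filter[2] | filter[5]) return 4;
  return 2;
}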
"vpx_dsp/x86/convolve.h" +#include "vpx_dsp/x86/convolve_avx2.h" // ----------------------------------------------------------------------------- // Copy and average @@ -209,6 +209,7 @@ static const uint8_t signal_pattern_2[32] = { 6, 7, 8, 9, 8, 9, 10, 11, static const uint32_t signal_index[8] = { 2, 3, 4, 5, 2, 3, 4, 5 }; #define CONV8_ROUNDING_BITS (7) +#define CONV8_ROUNDING_NUM (1 << (CONV8_ROUNDING_BITS - 1)) // ----------------------------------------------------------------------------- // Horizontal Filtering @@ -923,6 +924,200 @@ static void vpx_highbd_filter_block1d16_h8_avg_avx2( } while (height > 0); } +static void vpx_highbd_filter_block1d4_h4_avx2( + const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) { + // We extract the middle four elements of the kernel into two registers in + // the form + // ... k[3] k[2] k[3] k[2] + // ... k[5] k[4] k[5] k[4] + // Then we shuffle the source into + // ... s[1] s[0] s[0] s[-1] + // ... s[3] s[2] s[2] s[1] + // Calling multiply and add gives us half of the sum. Calling add on the two + // halves gives us the output. Since avx2 allows us to use 256-bit buffer, we + // can do this two rows at a time. + + __m256i src_reg, src_reg_shift_0, src_reg_shift_2; + __m256i res_reg; + __m256i idx_shift_0 = + _mm256_setr_epi8(0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 0, 1, 2, + 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9); + __m256i idx_shift_2 = + _mm256_setr_epi8(4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 4, + 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13); + + __m128i kernel_reg_128; // Kernel + __m256i kernel_reg, kernel_reg_23, + kernel_reg_45; // Segments of the kernel used + const __m256i reg_round = + _mm256_set1_epi32(CONV8_ROUNDING_NUM); // Used for rounding + const __m256i reg_max = _mm256_set1_epi16((1 << bd) - 1); + const ptrdiff_t unrolled_src_stride = src_stride << 1; + const ptrdiff_t unrolled_dst_stride = dst_stride << 1; + int h; + + // Start one pixel before as we need tap/2 - 1 = 1 sample from the past + src_ptr -= 1; + + // Load Kernel + kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128); + kernel_reg_23 = _mm256_shuffle_epi32(kernel_reg, 0x55); + kernel_reg_45 = _mm256_shuffle_epi32(kernel_reg, 0xaa); + + for (h = height; h >= 2; h -= 2) { + // Load the source + src_reg = mm256_loadu2_si128(src_ptr, src_ptr + src_stride); + src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0); + src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2); + + // Get the output + res_reg = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2, + &kernel_reg_23, &kernel_reg_45); + + // Round the result + res_reg = mm256_round_epi32(&res_reg, ®_round, CONV8_ROUNDING_BITS); + + // Finally combine to get the final dst + res_reg = _mm256_packus_epi32(res_reg, res_reg); + res_reg = _mm256_min_epi16(res_reg, reg_max); + mm256_storeu2_epi64((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride), + &res_reg); + + src_ptr += unrolled_src_stride; + dst_ptr += unrolled_dst_stride; + } + + // Repeat for the last row if needed + if (h > 0) { + // Load the source + src_reg = mm256_loadu2_si128(src_ptr, src_ptr + 4); + src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0); + src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2); + + // Get the output + res_reg = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2, + &kernel_reg_23, &kernel_reg_45); + + // Round the result + 
+void vpx_highbd_filter_block1d8_h4_avx2(const uint16_t *src_ptr,
+                                        ptrdiff_t src_stride, uint16_t *dst_ptr,
+                                        ptrdiff_t dst_stride, uint32_t height,
+                                        const int16_t *kernel, int bd) {
+  // We will extract the middle four elements of the kernel into two registers
+  // in the form
+  // ... k[3] k[2] k[3] k[2]
+  // ... k[5] k[4] k[5] k[4]
+  // Then we shuffle the source into
+  // ... s[1] s[0] s[0] s[-1]
+  // ... s[3] s[2] s[2] s[1]
+  // Calling multiply and add gives us half of the sum of the first half.
+  // Calling add gives us the first half of the output. Repeat to get the
+  // whole output. Since AVX2 allows us to use a 256-bit buffer, we can do
+  // this two rows at a time.
+
+  __m256i src_reg, src_reg_shift_0, src_reg_shift_2;
+  __m256i res_reg, res_first, res_last;
+  __m256i idx_shift_0 =
+      _mm256_setr_epi8(0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 0, 1, 2,
+                       3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9);
+  __m256i idx_shift_2 =
+      _mm256_setr_epi8(4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 4,
+                       5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13);
+
+  __m128i kernel_reg_128;  // Kernel
+  __m256i kernel_reg, kernel_reg_23,
+      kernel_reg_45;  // Segments of the kernel used
+  const __m256i reg_round =
+      _mm256_set1_epi32(CONV8_ROUNDING_NUM);  // Used for rounding
+  const __m256i reg_max = _mm256_set1_epi16((1 << bd) - 1);
+  const ptrdiff_t unrolled_src_stride = src_stride << 1;
+  const ptrdiff_t unrolled_dst_stride = dst_stride << 1;
+  int h;
+
+  // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
+  src_ptr -= 1;
+
+  // Load Kernel
+  kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel);
+  kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128);
+  kernel_reg_23 = _mm256_shuffle_epi32(kernel_reg, 0x55);
+  kernel_reg_45 = _mm256_shuffle_epi32(kernel_reg, 0xaa);
+
+  for (h = height; h >= 2; h -= 2) {
+    // Load the source
+    src_reg = mm256_loadu2_si128(src_ptr, src_ptr + src_stride);
+    src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0);
+    src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2);
+
+    // Result for first half
+    res_first = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2,
+                                     &kernel_reg_23, &kernel_reg_45);
+
+    // Do again to get the second half of dst
+    // Load the source
+    src_reg = mm256_loadu2_si128(src_ptr + 4, src_ptr + src_stride + 4);
+    src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0);
+    src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2);
+
+    // Result for second half
+    res_last = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2,
+                                    &kernel_reg_23, &kernel_reg_45);
+
+    // Round each result
+    res_first = mm256_round_epi32(&res_first, &reg_round, CONV8_ROUNDING_BITS);
+    res_last = mm256_round_epi32(&res_last, &reg_round, CONV8_ROUNDING_BITS);
+
+    // Finally combine to get the final dst
+    res_reg = _mm256_packus_epi32(res_first, res_last);
+    res_reg = _mm256_min_epi16(res_reg, reg_max);
+    mm256_store2_si128((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride),
+                       &res_reg);
+
+    src_ptr += unrolled_src_stride;
+    dst_ptr += unrolled_dst_stride;
+  }
+
+  // Repeat for the last row if needed
+  if (h > 0) {
+    src_reg = _mm256_loadu_si256((const __m256i *)src_ptr);
+    // Reorder into 2 1 1 2
+    src_reg = _mm256_permute4x64_epi64(src_reg, 0x94);
+
+    src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0);
+    src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2);
+
+    res_reg = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2,
+                                   &kernel_reg_23, &kernel_reg_45);
+
+    res_reg = mm256_round_epi32(&res_reg, &reg_round, CONV8_ROUNDING_BITS);
+
+    res_reg = _mm256_packus_epi32(res_reg, res_reg);
+    res_reg = _mm256_min_epi16(res_reg, reg_max);
+    res_reg = _mm256_permute4x64_epi64(res_reg, 0x8);
+
+    _mm_store_si128((__m128i *)dst_ptr, _mm256_castsi256_si128(res_reg));
+  }
+}
+
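The "Reorder into 2 1 1 2" step in the odd-row tail above is easy to misread, so here is
what the immediate does. Illustrative sketch, not part of the patch:

#include <immintrin.h>

// _mm256_permute4x64_epi64(v, 0x94): 0x94 is 10 01 01 00 in binary (fields for
// lanes 3..0), so it selects the 64-bit lanes {q0, q1, q1, q2} from
// {q0, q1, q2, q3}. With 16 pixels p0..p15 loaded from src_ptr - 1, the low
// 128 bits become p0..p7 (enough for outputs 0..3) and the high 128 bits
// become p4..p11 (enough for outputs 4..7), so the same in-lane shuffles used
// in the main loop produce the full 8-wide row.
static __m256i reorder_last_row(__m256i sixteen_pixels) {
  return _mm256_permute4x64_epi64(sixteen_pixels, 0x94);
}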
+static void vpx_highbd_filter_block1d16_h4_avx2(
+    const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
+    ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
+  vpx_highbd_filter_block1d8_h4_avx2(src_ptr, src_stride, dst_ptr, dst_stride,
+                                     height, kernel, bd);
+  vpx_highbd_filter_block1d8_h4_avx2(src_ptr + 8, src_stride, dst_ptr + 8,
+                                     dst_stride, height, kernel, bd);
+}
+
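One more note on the epilogue shared by these kernels: _mm256_packus_epi32 saturates to
the full 16-bit range, so the extra _mm256_min_epi16 against (1 << bd) - 1 is what keeps
10- and 12-bit output in range. A scalar equivalent of that clamp, illustrative only
(the helper name is made up):

#include <stdint.h>

// What packus + min_epi16 amount to per lane: clamp a filter sum that has
// already been rounded and shifted into the range [0, (1 << bd) - 1].
static uint16_t clamp_highbd(int32_t v, int bd) {
  const int32_t max = (1 << bd) - 1;
  if (v < 0) return 0;
  if (v > max) return (uint16_t)max;
  return (uint16_t)v;
}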
 static void vpx_highbd_filter_block1d8_v8_avg_avx2(
     const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
     ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
@@ -1058,39 +1253,239 @@ static void vpx_highbd_filter_block1d8_v2_avg_avx2(
   } while (height > 0);
 }
 
-void vpx_highbd_filter_block1d4_h8_sse2(const uint16_t *, ptrdiff_t, uint16_t *,
-                                        ptrdiff_t, uint32_t, const int16_t *,
-                                        int);
-void vpx_highbd_filter_block1d4_h2_sse2(const uint16_t *, ptrdiff_t, uint16_t *,
-                                        ptrdiff_t, uint32_t, const int16_t *,
-                                        int);
-void vpx_highbd_filter_block1d4_v8_sse2(const uint16_t *, ptrdiff_t, uint16_t *,
-                                        ptrdiff_t, uint32_t, const int16_t *,
-                                        int);
-void vpx_highbd_filter_block1d4_v2_sse2(const uint16_t *, ptrdiff_t, uint16_t *,
-                                        ptrdiff_t, uint32_t, const int16_t *,
-                                        int);
+void vpx_highbd_filter_block1d4_v4_avx2(const uint16_t *src_ptr,
+                                        ptrdiff_t src_stride, uint16_t *dst_ptr,
+                                        ptrdiff_t dst_stride, uint32_t height,
+                                        const int16_t *kernel, int bd) {
+  // We will load two rows of pixels and rearrange them into the form
+  // ... s[1,0] s[0,0] s[0,0] s[-1,0]
+  // so that we can call multiply and add with the kernel to get partial
+  // output. Then we can call add with another row to get the output.
+
+  // Register for source s[-1:3, :]
+  __m256i src_reg_1, src_reg_2, src_reg_3;
+  // Interleaved rows of the source. lo is first half, hi second
+  __m256i src_reg_m10, src_reg_01, src_reg_12, src_reg_23;
+  __m256i src_reg_m1001, src_reg_1223;
+
+  // Result after multiply and add
+  __m256i res_reg;
+
+  __m128i kernel_reg_128;  // Kernel
+  __m256i kernel_reg, kernel_reg_23, kernel_reg_45;  // Segments of kernel used
+
+  const __m256i reg_round =
+      _mm256_set1_epi32(CONV8_ROUNDING_NUM);  // Used for rounding
+  const __m256i reg_max = _mm256_set1_epi16((1 << bd) - 1);
+  const ptrdiff_t src_stride_unrolled = src_stride << 1;
+  const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
+  int h;
+
+  // We only need to go num_taps/2 - 1 = 1 row above the source, so we move
+  // 3 - (num_taps/2 - 1) = 4 - num_taps/2 = 2 back down
+  src_ptr += src_stride_unrolled;
+
+  // Load Kernel
+  kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel);
+  kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128);
+  kernel_reg_23 = _mm256_shuffle_epi32(kernel_reg, 0x55);
+  kernel_reg_45 = _mm256_shuffle_epi32(kernel_reg, 0xaa);
+
+  // Row -1 to row 0
+  src_reg_m10 = mm256_loadu2_epi64((const __m128i *)src_ptr,
+                                   (const __m128i *)(src_ptr + src_stride));
+
+  // Row 0 to row 1
+  src_reg_1 = _mm256_castsi128_si256(
+      _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2)));
+  src_reg_01 = _mm256_permute2x128_si256(src_reg_m10, src_reg_1, 0x21);
+
+  // First three rows
+  src_reg_m1001 = _mm256_unpacklo_epi16(src_reg_m10, src_reg_01);
+
+  for (h = height; h > 1; h -= 2) {
+    src_reg_2 = _mm256_castsi128_si256(
+        _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 3)));
+
+    src_reg_12 = _mm256_inserti128_si256(src_reg_1,
+                                         _mm256_castsi256_si128(src_reg_2), 1);
+
+    src_reg_3 = _mm256_castsi128_si256(
+        _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 4)));
+
+    src_reg_23 = _mm256_inserti128_si256(src_reg_2,
+                                         _mm256_castsi256_si128(src_reg_3), 1);
+
+    // Last three rows
+    src_reg_1223 = _mm256_unpacklo_epi16(src_reg_12, src_reg_23);
+
+    // Output
+    res_reg = mm256_madd_add_epi32(&src_reg_m1001, &src_reg_1223,
+                                   &kernel_reg_23, &kernel_reg_45);
+
+    // Round the words
+    res_reg = mm256_round_epi32(&res_reg, &reg_round, CONV8_ROUNDING_BITS);
+
+    // Combine to get the result
+    res_reg = _mm256_packus_epi32(res_reg, res_reg);
+    res_reg = _mm256_min_epi16(res_reg, reg_max);
+
+    // Save the result
+    mm256_storeu2_epi64((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride),
+                        &res_reg);
+
+    // Update the source by two rows
+    src_ptr += src_stride_unrolled;
+    dst_ptr += dst_stride_unrolled;
+
+    src_reg_m1001 = src_reg_1223;
+    src_reg_1 = src_reg_3;
+  }
+}
+
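For reference while reading the unpacklo/madd pairs above, this is the scalar
computation of the 4-tap vertical path. Illustrative sketch only, not part of the patch;
it assumes the caller provides one row of history above src, as the AVX2 code arranges:

#include <stddef.h>
#include <stdint.h>

// Scalar reference for the 4-tap vertical path: output row r, column c uses
// kernel taps 2..5 applied to rows r-1 .. r+2. The unpacklo/unpackhi pairs in
// the AVX2 code interleave (row r-1, row r) and (row r+1, row r+2) so that a
// madd with (k2, k3) and (k4, k5) yields the two partial sums per column.
static void highbd_filter4_v_ref(const uint16_t *src, ptrdiff_t src_stride,
                                 uint16_t *dst, ptrdiff_t dst_stride, int w,
                                 int h, const int16_t *kernel, int bd) {
  const int max = (1 << bd) - 1;
  int r, c;
  for (r = 0; r < h; ++r) {
    for (c = 0; c < w; ++c) {
      int sum = kernel[2] * src[(r - 1) * src_stride + c] +
                kernel[3] * src[r * src_stride + c] +
                kernel[4] * src[(r + 1) * src_stride + c] +
                kernel[5] * src[(r + 2) * src_stride + c];
      sum = (sum + (1 << 6)) >> 7;
      if (sum < 0) sum = 0;
      if (sum > max) sum = max;
      dst[r * dst_stride + c] = (uint16_t)sum;
    }
  }
}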
+void vpx_highbd_filter_block1d8_v4_avx2(const uint16_t *src_ptr,
+                                        ptrdiff_t src_stride, uint16_t *dst_ptr,
+                                        ptrdiff_t dst_stride, uint32_t height,
+                                        const int16_t *kernel, int bd) {
+  // We will load two rows of pixels and rearrange them into the form
+  // ... s[1,0] s[0,0] s[0,0] s[-1,0]
+  // so that we can call multiply and add with the kernel to get partial
+  // output. Then we can call add with another row to get the output.
+
+  // Register for source s[-1:3, :]
+  __m256i src_reg_1, src_reg_2, src_reg_3;
+  // Interleaved rows of the source. lo is first half, hi second
+  __m256i src_reg_m10, src_reg_01, src_reg_12, src_reg_23;
+  __m256i src_reg_m1001_lo, src_reg_m1001_hi, src_reg_1223_lo, src_reg_1223_hi;
+
+  __m128i kernel_reg_128;  // Kernel
+  __m256i kernel_reg, kernel_reg_23, kernel_reg_45;  // Segments of kernel
+
+  // Result after multiply and add
+  __m256i res_reg, res_reg_lo, res_reg_hi;
+
+  const __m256i reg_round =
+      _mm256_set1_epi32(CONV8_ROUNDING_NUM);  // Used for rounding
+  const __m256i reg_max = _mm256_set1_epi16((1 << bd) - 1);
+  const ptrdiff_t src_stride_unrolled = src_stride << 1;
+  const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
+  int h;
+
+  // We only need to go num_taps/2 - 1 = 1 row above the source, so we move
+  // 3 - (num_taps/2 - 1) = 4 - num_taps/2 = 2 back down
+  src_ptr += src_stride_unrolled;
+
+  // Load Kernel
+  kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel);
+  kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128);
+  kernel_reg_23 = _mm256_shuffle_epi32(kernel_reg, 0x55);
+  kernel_reg_45 = _mm256_shuffle_epi32(kernel_reg, 0xaa);
+
+  // Row -1 to row 0
+  src_reg_m10 = mm256_loadu2_si128((const __m128i *)src_ptr,
+                                   (const __m128i *)(src_ptr + src_stride));
+
+  // Row 0 to row 1
+  src_reg_1 = _mm256_castsi128_si256(
+      _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2)));
+  src_reg_01 = _mm256_permute2x128_si256(src_reg_m10, src_reg_1, 0x21);
+
+  // First three rows
+  src_reg_m1001_lo = _mm256_unpacklo_epi16(src_reg_m10, src_reg_01);
+  src_reg_m1001_hi = _mm256_unpackhi_epi16(src_reg_m10, src_reg_01);
+
+  for (h = height; h > 1; h -= 2) {
+    src_reg_2 = _mm256_castsi128_si256(
+        _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 3)));
+
+    src_reg_12 = _mm256_inserti128_si256(src_reg_1,
+                                         _mm256_castsi256_si128(src_reg_2), 1);
+
+    src_reg_3 = _mm256_castsi128_si256(
+        _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 4)));
+
+    src_reg_23 = _mm256_inserti128_si256(src_reg_2,
+                                         _mm256_castsi256_si128(src_reg_3), 1);
+
+    // Last three rows
+    src_reg_1223_lo = _mm256_unpacklo_epi16(src_reg_12, src_reg_23);
+    src_reg_1223_hi = _mm256_unpackhi_epi16(src_reg_12, src_reg_23);
+
+    // Output from first half
+    res_reg_lo = mm256_madd_add_epi32(&src_reg_m1001_lo, &src_reg_1223_lo,
+                                      &kernel_reg_23, &kernel_reg_45);
+
+    // Output from second half
+    res_reg_hi = mm256_madd_add_epi32(&src_reg_m1001_hi, &src_reg_1223_hi,
+                                      &kernel_reg_23, &kernel_reg_45);
+
+    // Round the words
+    res_reg_lo =
+        mm256_round_epi32(&res_reg_lo, &reg_round, CONV8_ROUNDING_BITS);
+    res_reg_hi =
+        mm256_round_epi32(&res_reg_hi, &reg_round, CONV8_ROUNDING_BITS);
+
+    // Combine to get the result
+    res_reg = _mm256_packus_epi32(res_reg_lo, res_reg_hi);
+    res_reg = _mm256_min_epi16(res_reg, reg_max);
+
+    // Save the result
+    mm256_store2_si128((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride),
+                       &res_reg);
+
+    // Update the source by two rows
+    src_ptr += src_stride_unrolled;
+    dst_ptr += dst_stride_unrolled;
+
+    src_reg_m1001_lo = src_reg_1223_lo;
+    src_reg_m1001_hi = src_reg_1223_hi;
+    src_reg_1 = src_reg_3;
+  }
+}
+
+void vpx_highbd_filter_block1d16_v4_avx2(const uint16_t *src_ptr,
+                                         ptrdiff_t src_stride,
+                                         uint16_t *dst_ptr,
+                                         ptrdiff_t dst_stride, uint32_t height,
+                                         const int16_t *kernel, int bd) {
+  vpx_highbd_filter_block1d8_v4_avx2(src_ptr, src_stride, dst_ptr, dst_stride,
+                                     height, kernel, bd);
+  vpx_highbd_filter_block1d8_v4_avx2(src_ptr + 8, src_stride, dst_ptr + 8,
+                                     dst_stride, height, kernel, bd);
+}
+
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_sse2;
+
 #define vpx_highbd_filter_block1d4_h8_avx2 vpx_highbd_filter_block1d4_h8_sse2
 #define vpx_highbd_filter_block1d4_h2_avx2 vpx_highbd_filter_block1d4_h2_sse2
 #define vpx_highbd_filter_block1d4_v8_avx2 vpx_highbd_filter_block1d4_v8_sse2
 #define vpx_highbd_filter_block1d4_v2_avx2 vpx_highbd_filter_block1d4_v2_sse2
+#define vpx_highbd_filter_block1d16_v4_avg_avx2 \
+  vpx_highbd_filter_block1d16_v8_avg_avx2
+#define vpx_highbd_filter_block1d16_h4_avg_avx2 \
+  vpx_highbd_filter_block1d16_h8_avg_avx2
+#define vpx_highbd_filter_block1d8_v4_avg_avx2 \
+  vpx_highbd_filter_block1d8_v8_avg_avx2
+#define vpx_highbd_filter_block1d8_h4_avg_avx2 \
+  vpx_highbd_filter_block1d8_h8_avg_avx2
+#define vpx_highbd_filter_block1d4_v4_avg_avx2 \
+  vpx_highbd_filter_block1d4_v8_avg_avx2
+#define vpx_highbd_filter_block1d4_h4_avg_avx2 \
+  vpx_highbd_filter_block1d4_h8_avg_avx2
 
 HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2);
 HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , avx2);
 HIGH_FUN_CONV_2D(, avx2);
 
-void vpx_highbd_filter_block1d4_h8_avg_sse2(const uint16_t *, ptrdiff_t,
-                                            uint16_t *, ptrdiff_t, uint32_t,
-                                            const int16_t *, int);
-void vpx_highbd_filter_block1d4_h2_avg_sse2(const uint16_t *, ptrdiff_t,
-                                            uint16_t *, ptrdiff_t, uint32_t,
-                                            const int16_t *, int);
-void vpx_highbd_filter_block1d4_v8_avg_sse2(const uint16_t *, ptrdiff_t,
-                                            uint16_t *, ptrdiff_t, uint32_t,
-                                            const int16_t *, int);
-void vpx_highbd_filter_block1d4_v2_avg_sse2(const uint16_t *, ptrdiff_t,
-                                            uint16_t *, ptrdiff_t, uint32_t,
-                                            const int16_t *, int);
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_avg_sse2;
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_avg_sse2;
+
 #define vpx_highbd_filter_block1d4_h8_avg_avx2 \
   vpx_highbd_filter_block1d4_h8_avg_sse2
 #define vpx_highbd_filter_block1d4_h2_avg_avx2 \
diff --git a/vpx_dsp/x86/vpx_asm_stubs.c b/vpx_dsp/x86/vpx_asm_stubs.c
index 80c7654d5..12194a6fa 100644
--- a/vpx_dsp/x86/vpx_asm_stubs.c
+++ b/vpx_dsp/x86/vpx_asm_stubs.c
@@ -104,6 +104,25 @@ highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h8_avg_sse2;
 highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_avg_sse2;
 highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_avg_sse2;
 
+#define vpx_highbd_filter_block1d16_v4_sse2 vpx_highbd_filter_block1d16_v8_sse2
+#define vpx_highbd_filter_block1d16_h4_sse2 vpx_highbd_filter_block1d16_h8_sse2
+#define vpx_highbd_filter_block1d8_v4_sse2 vpx_highbd_filter_block1d8_v8_sse2
+#define vpx_highbd_filter_block1d8_h4_sse2 vpx_highbd_filter_block1d8_h8_sse2
+#define vpx_highbd_filter_block1d4_v4_sse2 vpx_highbd_filter_block1d4_v8_sse2
+#define vpx_highbd_filter_block1d4_h4_sse2 vpx_highbd_filter_block1d4_h8_sse2
+#define vpx_highbd_filter_block1d16_v4_avg_sse2 \
+  vpx_highbd_filter_block1d16_v8_avg_sse2
+#define vpx_highbd_filter_block1d16_h4_avg_sse2 \
+  vpx_highbd_filter_block1d16_h8_avg_sse2
+#define vpx_highbd_filter_block1d8_v4_avg_sse2 \
+  vpx_highbd_filter_block1d8_v8_avg_sse2
+#define vpx_highbd_filter_block1d8_h4_avg_sse2 \
+  vpx_highbd_filter_block1d8_h8_avg_sse2
+#define vpx_highbd_filter_block1d4_v4_avg_sse2 \
+  vpx_highbd_filter_block1d4_v8_avg_sse2
+#define vpx_highbd_filter_block1d4_h4_avg_sse2 \
+  vpx_highbd_filter_block1d4_h8_avg_sse2
+
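The *_4_* to *_8_* fallbacks above work because the 4-tap path is only taken when taps
0, 1, 6 and 7 of the 8-entry kernel are zero, so running such a kernel through an 8-tap
filter merely adds multiplications by zero. A small illustrative check, not part of the
patch (src needs three pixels of left context):

#include <assert.h>
#include <stdint.h>

// For a kernel with filter[0] == filter[1] == filter[6] == filter[7] == 0, the
// 8-tap sum over src[-3..4] equals the 4-tap sum over src[-1..2], which is why
// the 4-tap entry points may safely alias the 8-tap implementations.
static void check_4tap_as_8tap(const int16_t *filter, const uint16_t *src) {
  int k, sum8 = 0, sum4 = 0;
  for (k = 0; k < 8; ++k) sum8 += filter[k] * src[k - 3];
  for (k = 2; k < 6; ++k) sum4 += filter[k] * src[k - 3];
  assert(sum8 == sum4);
}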
 highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v2_sse2;
 highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h2_sse2;
 highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v2_sse2;
-- 
2.40.0