From 6657ab8571b0a363f0e3ffd46348794651c92d44 Mon Sep 17 00:00:00 2001 From: chiyotsai Date: Tue, 23 Oct 2018 12:42:21 -0700 Subject: [PATCH] Add AVX2 support for 4-tap interpolation filter. Performance: | 4X4 | 8X8 |16X16|64X64| 2 DIM|1.491|1.902|1.772|1.479| HORZ|1.145|1.521|1.757|1.497| VERT|1.176|1.614|1.707|1.467| Each number in the chart above is 8-tap function time / 4-tap function time. The framerate tested on jets.y4m for 100 frames on speed 1 increased from 3.72 fps to 3.91 fps (about 5% increase). Change-Id: Ic0ad275cf32fafeefd0a89811badd8adff2134a0 --- vpx_dsp/x86/convolve_avx2.h | 41 ++ vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c | 578 +++++++++++++++++++++- 2 files changed, 613 insertions(+), 6 deletions(-) diff --git a/vpx_dsp/x86/convolve_avx2.h b/vpx_dsp/x86/convolve_avx2.h index 343af9fd0..e9fc9c06a 100644 --- a/vpx_dsp/x86/convolve_avx2.h +++ b/vpx_dsp/x86/convolve_avx2.h @@ -100,6 +100,47 @@ static INLINE __m128i convolve8_8_avx2(const __m256i *const s, return sum1; } +static INLINE __m256i mm256_loadu2_si128(const void *lo, const void *hi) { + const __m256i tmp = + _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)lo)); + return _mm256_inserti128_si256(tmp, _mm_loadu_si128((const __m128i *)hi), 1); +} + +static INLINE __m256i mm256_loadu2_epi64(const void *lo, const void *hi) { + const __m256i tmp = + _mm256_castsi128_si256(_mm_loadl_epi64((const __m128i *)lo)); + return _mm256_inserti128_si256(tmp, _mm_loadl_epi64((const __m128i *)hi), 1); +} + +static INLINE void mm256_store2_si128(__m128i *const dst_ptr_1, + __m128i *const dst_ptr_2, + const __m256i *const src) { + _mm_store_si128(dst_ptr_1, _mm256_castsi256_si128(*src)); + _mm_store_si128(dst_ptr_2, _mm256_extractf128_si256(*src, 1)); +} + +static INLINE void mm256_storeu2_epi64(__m128i *const dst_ptr_1, + __m128i *const dst_ptr_2, + const __m256i *const src) { + _mm_storel_epi64(dst_ptr_1, _mm256_castsi256_si128(*src)); + _mm_storel_epi64(dst_ptr_2, _mm256_extractf128_si256(*src, 1)); +} + +static INLINE void mm256_storeu2_epi32(__m128i *const dst_ptr_1, + __m128i *const dst_ptr_2, + const __m256i *const src) { + *((uint32_t *)(dst_ptr_1)) = _mm_cvtsi128_si32(_mm256_castsi256_si128(*src)); + *((uint32_t *)(dst_ptr_2)) = + _mm_cvtsi128_si32(_mm256_extractf128_si256(*src, 1)); +} + +static INLINE __m256i mm256_round_epi16(const __m256i *const src, + const __m256i *const half_depth, + const int depth) { + const __m256i nearest_src = _mm256_adds_epi16(*src, *half_depth); + return _mm256_srai_epi16(nearest_src, depth); +} + #undef MM256_BROADCASTSI128_SI256 #endif // VPX_VPX_DSP_X86_CONVOLVE_AVX2_H_ diff --git a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c index 426b82592..0ccf89694 100644 --- a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c +++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c @@ -9,10 +9,12 @@ */ #include +#include #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/x86/convolve.h" #include "vpx_dsp/x86/convolve_avx2.h" +#include "vpx_dsp/x86/convolve_sse2.h" #include "vpx_ports/mem.h" // filters for 16_h8 @@ -326,6 +328,576 @@ static void vpx_filter_block1d16_v8_avg_avx2( height, filter, 1); } +void vpx_filter_block1d16_h4_avx2(const uint8_t *src_ptr, ptrdiff_t src_stride, + uint8_t *dst_ptr, ptrdiff_t dst_stride, + uint32_t height, const int16_t *kernel) { + // We will cast the kernel from 16-bit words to 8-bit words, and then extract + // the middle four elements of the kernel into two registers in the form + // ... k[3] k[2] k[3] k[2] + // ... k[5] k[4] k[5] k[4] + // Then we shuffle the source into + // ... s[1] s[0] s[0] s[-1] + // ... s[3] s[2] s[2] s[1] + // Calling multiply and add gives us half of the sum. Calling add gives us + // first half of the output. Repeat again to get the second half of the + // output. Finally we shuffle again to combine the two outputs. + // Since avx2 allows us to use 256-bit buffer, we can do this two rows at a + // time. + + __m128i kernel_reg; // Kernel + __m256i kernel_reg_256, kernel_reg_23, + kernel_reg_45; // Segments of the kernel used + const __m256i reg_32 = _mm256_set1_epi16(32); // Used for rounding + const ptrdiff_t unrolled_src_stride = src_stride << 1; + const ptrdiff_t unrolled_dst_stride = dst_stride << 1; + int h; + + __m256i src_reg, src_reg_shift_0, src_reg_shift_2; + __m256i dst_first, dst_second; + __m256i tmp_0, tmp_1; + __m256i idx_shift_0 = + _mm256_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 0, 1, 1, + 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8); + __m256i idx_shift_2 = + _mm256_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 2, 3, 3, + 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10); + + // Start one pixel before as we need tap/2 - 1 = 1 sample from the past + src_ptr -= 1; + + // Load Kernel + kernel_reg = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg = _mm_srai_epi16(kernel_reg, 1); + kernel_reg = _mm_packs_epi16(kernel_reg, kernel_reg); + kernel_reg_256 = _mm256_broadcastsi128_si256(kernel_reg); + kernel_reg_23 = + _mm256_shuffle_epi8(kernel_reg_256, _mm256_set1_epi16(0x0302u)); + kernel_reg_45 = + _mm256_shuffle_epi8(kernel_reg_256, _mm256_set1_epi16(0x0504u)); + + for (h = height; h >= 2; h -= 2) { + // Load the source + src_reg = mm256_loadu2_si128(src_ptr, src_ptr + src_stride); + src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0); + src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2); + + // Partial result for first half + tmp_0 = _mm256_maddubs_epi16(src_reg_shift_0, kernel_reg_23); + tmp_1 = _mm256_maddubs_epi16(src_reg_shift_2, kernel_reg_45); + dst_first = _mm256_adds_epi16(tmp_0, tmp_1); + + // Do again to get the second half of dst + // Load the source + src_reg = mm256_loadu2_si128(src_ptr + 8, src_ptr + src_stride + 8); + src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0); + src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2); + + // Partial result for second half + tmp_0 = _mm256_maddubs_epi16(src_reg_shift_0, kernel_reg_23); + tmp_1 = _mm256_maddubs_epi16(src_reg_shift_2, kernel_reg_45); + dst_second = _mm256_adds_epi16(tmp_0, tmp_1); + + // Round each result + dst_first = mm256_round_epi16(&dst_first, ®_32, 6); + dst_second = mm256_round_epi16(&dst_second, ®_32, 6); + + // Finally combine to get the final dst + dst_first = _mm256_packus_epi16(dst_first, dst_second); + mm256_store2_si128((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride), + &dst_first); + + src_ptr += unrolled_src_stride; + dst_ptr += unrolled_dst_stride; + } + + // Repeat for the last row if needed + if (h > 0) { + src_reg = _mm256_loadu_si256((const __m256i *)src_ptr); + // Reorder into 2 1 1 2 + src_reg = _mm256_permute4x64_epi64(src_reg, 0x94); + + src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0); + src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2); + + tmp_0 = _mm256_maddubs_epi16(src_reg_shift_0, kernel_reg_23); + tmp_1 = _mm256_maddubs_epi16(src_reg_shift_2, kernel_reg_45); + dst_first = _mm256_adds_epi16(tmp_0, tmp_1); + + dst_first = mm256_round_epi16(&dst_first, ®_32, 6); + + dst_first = _mm256_packus_epi16(dst_first, dst_first); + dst_first = _mm256_permute4x64_epi64(dst_first, 0x8); + + _mm_store_si128((__m128i *)dst_ptr, _mm256_castsi256_si128(dst_first)); + } +} + +void vpx_filter_block1d16_v4_avx2(const uint8_t *src_ptr, ptrdiff_t src_stride, + uint8_t *dst_ptr, ptrdiff_t dst_stride, + uint32_t height, const int16_t *kernel) { + // We will load two rows of pixels as 8-bit words, rearrange them into the + // form + // ... s[1,0] s[0,0] s[0,0] s[-1,0] + // so that we can call multiply and add with the kernel partial output. Then + // we can call add with another row to get the output. + + // Register for source s[-1:3, :] + __m256i src_reg_1, src_reg_2, src_reg_3; + // Interleaved rows of the source. lo is first half, hi second + __m256i src_reg_m10, src_reg_01, src_reg_12, src_reg_23; + __m256i src_reg_m1001_lo, src_reg_m1001_hi, src_reg_1223_lo, src_reg_1223_hi; + + __m128i kernel_reg; // Kernel + __m256i kernel_reg_256, kernel_reg_23, + kernel_reg_45; // Segments of the kernel used + + // Result after multiply and add + __m256i res_reg_m1001_lo, res_reg_1223_lo, res_reg_m1001_hi, res_reg_1223_hi; + __m256i res_reg, res_reg_lo, res_reg_hi; + + const __m256i reg_32 = _mm256_set1_epi16(32); // Used for rounding + + // We will compute the result two rows at a time + const ptrdiff_t src_stride_unrolled = src_stride << 1; + const ptrdiff_t dst_stride_unrolled = dst_stride << 1; + int h; + + // We only need to go num_taps/2 - 1 row above the souce, so we move + // 3 - (num_taps/2 - 1) = 4 - num_taps/2 = 2 back down + src_ptr += src_stride_unrolled; + + // Load Kernel + kernel_reg = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg = _mm_srai_epi16(kernel_reg, 1); + kernel_reg = _mm_packs_epi16(kernel_reg, kernel_reg); + kernel_reg_256 = _mm256_broadcastsi128_si256(kernel_reg); + kernel_reg_23 = + _mm256_shuffle_epi8(kernel_reg_256, _mm256_set1_epi16(0x0302u)); + kernel_reg_45 = + _mm256_shuffle_epi8(kernel_reg_256, _mm256_set1_epi16(0x0504u)); + + // Row -1 to row 0 + src_reg_m10 = mm256_loadu2_si128((const __m128i *)src_ptr, + (const __m128i *)(src_ptr + src_stride)); + + // Row 0 to row 1 + src_reg_1 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2))); + src_reg_01 = _mm256_permute2x128_si256(src_reg_m10, src_reg_1, 0x21); + + // First three rows + src_reg_m1001_lo = _mm256_unpacklo_epi8(src_reg_m10, src_reg_01); + src_reg_m1001_hi = _mm256_unpackhi_epi8(src_reg_m10, src_reg_01); + + for (h = height; h > 1; h -= 2) { + src_reg_2 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 3))); + + src_reg_12 = _mm256_inserti128_si256(src_reg_1, + _mm256_castsi256_si128(src_reg_2), 1); + + src_reg_3 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 4))); + + src_reg_23 = _mm256_inserti128_si256(src_reg_2, + _mm256_castsi256_si128(src_reg_3), 1); + + // Last three rows + src_reg_1223_lo = _mm256_unpacklo_epi8(src_reg_12, src_reg_23); + src_reg_1223_hi = _mm256_unpackhi_epi8(src_reg_12, src_reg_23); + + // Output from first half + res_reg_m1001_lo = _mm256_maddubs_epi16(src_reg_m1001_lo, kernel_reg_23); + res_reg_1223_lo = _mm256_maddubs_epi16(src_reg_1223_lo, kernel_reg_45); + res_reg_lo = _mm256_adds_epi16(res_reg_m1001_lo, res_reg_1223_lo); + + // Output from second half + res_reg_m1001_hi = _mm256_maddubs_epi16(src_reg_m1001_hi, kernel_reg_23); + res_reg_1223_hi = _mm256_maddubs_epi16(src_reg_1223_hi, kernel_reg_45); + res_reg_hi = _mm256_adds_epi16(res_reg_m1001_hi, res_reg_1223_hi); + + // Round the words + res_reg_lo = mm256_round_epi16(&res_reg_lo, ®_32, 6); + res_reg_hi = mm256_round_epi16(&res_reg_hi, ®_32, 6); + + // Combine to get the result + res_reg = _mm256_packus_epi16(res_reg_lo, res_reg_hi); + + // Save the result + mm256_store2_si128((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride), + &res_reg); + + // Update the source by two rows + src_ptr += src_stride_unrolled; + dst_ptr += dst_stride_unrolled; + + src_reg_m1001_lo = src_reg_1223_lo; + src_reg_m1001_hi = src_reg_1223_hi; + src_reg_1 = src_reg_3; + } +} + +void vpx_filter_block1d8_h4_avx2(const uint8_t *src_ptr, ptrdiff_t src_stride, + uint8_t *dst_ptr, ptrdiff_t dst_stride, + uint32_t height, const int16_t *kernel) { + // We will cast the kernel from 16-bit words to 8-bit words, and then extract + // the middle four elements of the kernel into two registers in the form + // ... k[3] k[2] k[3] k[2] + // ... k[5] k[4] k[5] k[4] + // Then we shuffle the source into + // ... s[1] s[0] s[0] s[-1] + // ... s[3] s[2] s[2] s[1] + // Calling multiply and add gives us half of the sum. Calling add gives us + // first half of the output. Repeat again to get the second half of the + // output. Finally we shuffle again to combine the two outputs. + // Since avx2 allows us to use 256-bit buffer, we can do this two rows at a + // time. + + __m128i kernel_reg_128; // Kernel + __m256i kernel_reg, kernel_reg_23, + kernel_reg_45; // Segments of the kernel used + const __m256i reg_32 = _mm256_set1_epi16(32); // Used for rounding + const ptrdiff_t unrolled_src_stride = src_stride << 1; + const ptrdiff_t unrolled_dst_stride = dst_stride << 1; + int h; + + __m256i src_reg, src_reg_shift_0, src_reg_shift_2; + __m256i dst_reg; + __m256i tmp_0, tmp_1; + __m256i idx_shift_0 = + _mm256_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 0, 1, 1, + 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8); + __m256i idx_shift_2 = + _mm256_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 2, 3, 3, + 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10); + + // Start one pixel before as we need tap/2 - 1 = 1 sample from the past + src_ptr -= 1; + + // Load Kernel + kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg_128 = _mm_srai_epi16(kernel_reg_128, 1); + kernel_reg_128 = _mm_packs_epi16(kernel_reg_128, kernel_reg_128); + kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128); + kernel_reg_23 = _mm256_shuffle_epi8(kernel_reg, _mm256_set1_epi16(0x0302u)); + kernel_reg_45 = _mm256_shuffle_epi8(kernel_reg, _mm256_set1_epi16(0x0504u)); + + for (h = height; h >= 2; h -= 2) { + // Load the source + src_reg = mm256_loadu2_si128(src_ptr, src_ptr + src_stride); + src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0); + src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2); + + // Get the output + tmp_0 = _mm256_maddubs_epi16(src_reg_shift_0, kernel_reg_23); + tmp_1 = _mm256_maddubs_epi16(src_reg_shift_2, kernel_reg_45); + dst_reg = _mm256_adds_epi16(tmp_0, tmp_1); + + // Round the result + dst_reg = mm256_round_epi16(&dst_reg, ®_32, 6); + + // Finally combine to get the final dst + dst_reg = _mm256_packus_epi16(dst_reg, dst_reg); + mm256_storeu2_epi64((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride), + &dst_reg); + + src_ptr += unrolled_src_stride; + dst_ptr += unrolled_dst_stride; + } + + // Repeat for the last row if needed + if (h > 0) { + __m128i src_reg = _mm_loadu_si128((const __m128i *)src_ptr); + __m128i dst_reg; + const __m128i reg_32 = _mm_set1_epi16(32); // Used for rounding + __m128i tmp_0, tmp_1; + + __m128i src_reg_shift_0 = + _mm_shuffle_epi8(src_reg, _mm256_castsi256_si128(idx_shift_0)); + __m128i src_reg_shift_2 = + _mm_shuffle_epi8(src_reg, _mm256_castsi256_si128(idx_shift_2)); + + tmp_0 = _mm_maddubs_epi16(src_reg_shift_0, + _mm256_castsi256_si128(kernel_reg_23)); + tmp_1 = _mm_maddubs_epi16(src_reg_shift_2, + _mm256_castsi256_si128(kernel_reg_45)); + dst_reg = _mm_adds_epi16(tmp_0, tmp_1); + + dst_reg = round_epi16_sse2(&dst_reg, ®_32, 6); + + dst_reg = _mm_packus_epi16(dst_reg, _mm_setzero_si128()); + + _mm_storel_epi64((__m128i *)dst_ptr, dst_reg); + } +} + +void vpx_filter_block1d8_v4_avx2(const uint8_t *src_ptr, ptrdiff_t src_stride, + uint8_t *dst_ptr, ptrdiff_t dst_stride, + uint32_t height, const int16_t *kernel) { + // We will load two rows of pixels as 8-bit words, rearrange them into the + // form + // ... s[1,0] s[0,0] s[0,0] s[-1,0] + // so that we can call multiply and add with the kernel partial output. Then + // we can call add with another row to get the output. + + // Register for source s[-1:3, :] + __m256i src_reg_1, src_reg_2, src_reg_3; + // Interleaved rows of the source. lo is first half, hi second + __m256i src_reg_m10, src_reg_01, src_reg_12, src_reg_23; + __m256i src_reg_m1001, src_reg_1223; + + __m128i kernel_reg_128; // Kernel + __m256i kernel_reg, kernel_reg_23, + kernel_reg_45; // Segments of the kernel used + + // Result after multiply and add + __m256i res_reg_m1001, res_reg_1223; + __m256i res_reg; + + const __m256i reg_32 = _mm256_set1_epi16(32); // Used for rounding + + // We will compute the result two rows at a time + const ptrdiff_t src_stride_unrolled = src_stride << 1; + const ptrdiff_t dst_stride_unrolled = dst_stride << 1; + int h; + + // We only need to go num_taps/2 - 1 row above the souce, so we move + // 3 - (num_taps/2 - 1) = 4 - num_taps/2 = 2 back down + src_ptr += src_stride_unrolled; + + // Load Kernel + kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg_128 = _mm_srai_epi16(kernel_reg_128, 1); + kernel_reg_128 = _mm_packs_epi16(kernel_reg_128, kernel_reg_128); + kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128); + kernel_reg_23 = _mm256_shuffle_epi8(kernel_reg, _mm256_set1_epi16(0x0302u)); + kernel_reg_45 = _mm256_shuffle_epi8(kernel_reg, _mm256_set1_epi16(0x0504u)); + + // Row -1 to row 0 + src_reg_m10 = mm256_loadu2_epi64((const __m128i *)src_ptr, + (const __m128i *)(src_ptr + src_stride)); + + // Row 0 to row 1 + src_reg_1 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2))); + src_reg_01 = _mm256_permute2x128_si256(src_reg_m10, src_reg_1, 0x21); + + // First three rows + src_reg_m1001 = _mm256_unpacklo_epi8(src_reg_m10, src_reg_01); + + for (h = height; h > 1; h -= 2) { + src_reg_2 = _mm256_castsi128_si256( + _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 3))); + + src_reg_12 = _mm256_inserti128_si256(src_reg_1, + _mm256_castsi256_si128(src_reg_2), 1); + + src_reg_3 = _mm256_castsi128_si256( + _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 4))); + + src_reg_23 = _mm256_inserti128_si256(src_reg_2, + _mm256_castsi256_si128(src_reg_3), 1); + + // Last three rows + src_reg_1223 = _mm256_unpacklo_epi8(src_reg_12, src_reg_23); + + // Output + res_reg_m1001 = _mm256_maddubs_epi16(src_reg_m1001, kernel_reg_23); + res_reg_1223 = _mm256_maddubs_epi16(src_reg_1223, kernel_reg_45); + res_reg = _mm256_adds_epi16(res_reg_m1001, res_reg_1223); + + // Round the words + res_reg = mm256_round_epi16(&res_reg, ®_32, 6); + + // Combine to get the result + res_reg = _mm256_packus_epi16(res_reg, res_reg); + + // Save the result + mm256_storeu2_epi64((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride), + &res_reg); + + // Update the source by two rows + src_ptr += src_stride_unrolled; + dst_ptr += dst_stride_unrolled; + + src_reg_m1001 = src_reg_1223; + src_reg_1 = src_reg_3; + } +} + +void vpx_filter_block1d4_h4_avx2(const uint8_t *src_ptr, ptrdiff_t src_stride, + uint8_t *dst_ptr, ptrdiff_t dst_stride, + uint32_t height, const int16_t *kernel) { + // We will cast the kernel from 16-bit words to 8-bit words, and then extract + // the middle four elements of the kernel into a single register in the form + // k[5:2] k[5:2] k[5:2] k[5:2] + // Then we shuffle the source into + // s[5:2] s[4:1] s[3:0] s[2:-1] + // Calling multiply and add gives us half of the sum next to each other. + // Calling horizontal add then gives us the output. + // Since avx2 has 256-bit register, we can do 2 rows at a time. + + __m128i kernel_reg_128; // Kernel + __m256i kernel_reg; + const __m256i reg_32 = _mm256_set1_epi16(32); // Used for rounding + int h; + const ptrdiff_t unrolled_src_stride = src_stride << 1; + const ptrdiff_t unrolled_dst_stride = dst_stride << 1; + + __m256i src_reg, src_reg_shuf; + __m256i dst; + __m256i shuf_idx = + _mm256_setr_epi8(0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 0, 1, 2, + 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6); + + // Start one pixel before as we need tap/2 - 1 = 1 sample from the past + src_ptr -= 1; + + // Load Kernel + kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg_128 = _mm_srai_epi16(kernel_reg_128, 1); + kernel_reg_128 = _mm_packs_epi16(kernel_reg_128, kernel_reg_128); + kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128); + kernel_reg = _mm256_shuffle_epi8(kernel_reg, _mm256_set1_epi32(0x05040302u)); + + for (h = height; h > 1; h -= 2) { + // Load the source + src_reg = mm256_loadu2_epi64((const __m128i *)src_ptr, + (const __m128i *)(src_ptr + src_stride)); + src_reg_shuf = _mm256_shuffle_epi8(src_reg, shuf_idx); + + // Get the result + dst = _mm256_maddubs_epi16(src_reg_shuf, kernel_reg); + dst = _mm256_hadds_epi16(dst, _mm256_setzero_si256()); + + // Round result + dst = mm256_round_epi16(&dst, ®_32, 6); + + // Pack to 8-bits + dst = _mm256_packus_epi16(dst, _mm256_setzero_si256()); + + // Save + mm256_storeu2_epi32((__m128i *const)dst_ptr, + (__m128i *const)(dst_ptr + dst_stride), &dst); + + src_ptr += unrolled_src_stride; + dst_ptr += unrolled_dst_stride; + } + + if (h > 0) { + // Load the source + const __m128i reg_32 = _mm_set1_epi16(32); // Used for rounding + __m128i src_reg = _mm_loadl_epi64((const __m128i *)src_ptr); + __m128i src_reg_shuf = + _mm_shuffle_epi8(src_reg, _mm256_castsi256_si128(shuf_idx)); + + // Get the result + __m128i dst = + _mm_maddubs_epi16(src_reg_shuf, _mm256_castsi256_si128(kernel_reg)); + dst = _mm_hadds_epi16(dst, _mm_setzero_si128()); + + // Round result + dst = round_epi16_sse2(&dst, ®_32, 6); + + // Pack to 8-bits + dst = _mm_packus_epi16(dst, _mm_setzero_si128()); + *((uint32_t *)(dst_ptr)) = _mm_cvtsi128_si32(dst); + } +} + +void vpx_filter_block1d4_v4_avx2(const uint8_t *src_ptr, ptrdiff_t src_stride, + uint8_t *dst_ptr, ptrdiff_t dst_stride, + uint32_t height, const int16_t *kernel) { + // We will load two rows of pixels as 8-bit words, rearrange them into the + // form + // ... s[3,0] s[2,0] s[1,0] s[0,0] s[2,0] s[1,0] s[0,0] s[-1,0] + // so that we can call multiply and add with the kernel to get partial output. + // Calling horizontal add then gives us the completely output + + // Register for source s[-1:3, :] + __m256i src_reg_1, src_reg_2, src_reg_3; + // Interleaved rows of the source. lo is first half, hi second + __m256i src_reg_m10, src_reg_01, src_reg_12, src_reg_23; + __m256i src_reg_m1001, src_reg_1223, src_reg_m1012_1023; + + __m128i kernel_reg_128; // Kernel + __m256i kernel_reg; + + // Result after multiply and add + __m256i res_reg; + + const __m256i reg_32 = _mm256_set1_epi16(32); // Used for rounding + + // We will compute the result two rows at a time + const ptrdiff_t src_stride_unrolled = src_stride << 1; + const ptrdiff_t dst_stride_unrolled = dst_stride << 1; + int h; + + // We only need to go num_taps/2 - 1 row above the souce, so we move + // 3 - (num_taps/2 - 1) = 4 - num_taps/2 = 2 back down + src_ptr += src_stride_unrolled; + + // Load Kernel + kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg_128 = _mm_srai_epi16(kernel_reg_128, 1); + kernel_reg_128 = _mm_packs_epi16(kernel_reg_128, kernel_reg_128); + kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128); + kernel_reg = _mm256_shuffle_epi8(kernel_reg, _mm256_set1_epi32(0x05040302u)); + + // Row -1 to row 0 + src_reg_m10 = mm256_loadu2_si128((const __m128i *)src_ptr, + (const __m128i *)(src_ptr + src_stride)); + + // Row 0 to row 1 + src_reg_1 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2))); + src_reg_01 = _mm256_permute2x128_si256(src_reg_m10, src_reg_1, 0x21); + + // First three rows + src_reg_m1001 = _mm256_unpacklo_epi8(src_reg_m10, src_reg_01); + + for (h = height; h > 1; h -= 2) { + src_reg_2 = _mm256_castsi128_si256( + _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 3))); + + src_reg_12 = _mm256_inserti128_si256(src_reg_1, + _mm256_castsi256_si128(src_reg_2), 1); + + src_reg_3 = _mm256_castsi128_si256( + _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 4))); + + src_reg_23 = _mm256_inserti128_si256(src_reg_2, + _mm256_castsi256_si128(src_reg_3), 1); + + // Last three rows + src_reg_1223 = _mm256_unpacklo_epi8(src_reg_12, src_reg_23); + + // Combine all the rows + src_reg_m1012_1023 = _mm256_unpacklo_epi16(src_reg_m1001, src_reg_1223); + + // Output + res_reg = _mm256_maddubs_epi16(src_reg_m1012_1023, kernel_reg); + res_reg = _mm256_hadds_epi16(res_reg, _mm256_setzero_si256()); + + // Round the words + res_reg = mm256_round_epi16(&res_reg, ®_32, 6); + + // Combine to get the result + res_reg = _mm256_packus_epi16(res_reg, res_reg); + + // Save the result + mm256_storeu2_epi32((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride), + &res_reg); + + // Update the source by two rows + src_ptr += src_stride_unrolled; + dst_ptr += dst_stride_unrolled; + + src_reg_m1001 = src_reg_1223; + src_reg_1 = src_reg_3; + } +} + #if HAVE_AVX2 && HAVE_SSSE3 filter8_1dfunction vpx_filter_block1d4_v8_ssse3; #if ARCH_X86_64 @@ -377,12 +949,6 @@ filter8_1dfunction vpx_filter_block1d4_h2_avg_ssse3; #define vpx_filter_block1d4_v2_avg_avx2 vpx_filter_block1d4_v2_avg_ssse3 #define vpx_filter_block1d4_h2_avg_avx2 vpx_filter_block1d4_h2_avg_ssse3 -#define vpx_filter_block1d16_v4_avx2 vpx_filter_block1d16_v8_avx2 -#define vpx_filter_block1d16_h4_avx2 vpx_filter_block1d16_h8_avx2 -#define vpx_filter_block1d8_v4_avx2 vpx_filter_block1d8_v8_avx2 -#define vpx_filter_block1d8_h4_avx2 vpx_filter_block1d8_h8_avx2 -#define vpx_filter_block1d4_v4_avx2 vpx_filter_block1d4_v8_avx2 -#define vpx_filter_block1d4_h4_avx2 vpx_filter_block1d4_h8_avx2 #define vpx_filter_block1d16_v4_avg_avx2 vpx_filter_block1d16_v8_avg_avx2 #define vpx_filter_block1d16_h4_avg_avx2 vpx_filter_block1d16_h8_avg_avx2 #define vpx_filter_block1d8_v4_avg_avx2 vpx_filter_block1d8_v8_avg_avx2 -- 2.40.0