From 01df00ec0f4db0ec633811e38452d601a7c8174c Mon Sep 17 00:00:00 2001
From: chiyotsai
Date: Wed, 17 Oct 2018 14:52:26 -0700
Subject: [PATCH] Add SSSE3 support for 4-tap interpolation filter

Performance:
     | 4X4 | 8X8 |16X16|64X64|
2 DIM|1.526|1.827|1.844|1.906|
 HORZ|1.336|1.795|1.886|1.654|
 VERT|1.443|1.539|2.139|2.190|

The ratio is SSSE3 8-tap time / SSSE3 4-tap time.

Change-Id: I01ed2ab494428256e918875774a459afecc5ec6a
---
 vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c  |  26 +-
 vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c | 504 ++++++++++++++++++++-
 2 files changed, 496 insertions(+), 34 deletions(-)

diff --git a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
index 809f2e1f2..c5b2a67b9 100644
--- a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
+++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
@@ -376,19 +376,19 @@ filter8_1dfunction vpx_filter_block1d4_h2_avg_ssse3;
 #define vpx_filter_block1d8_h2_avg_avx2 vpx_filter_block1d8_h2_avg_ssse3
 #define vpx_filter_block1d4_v2_avg_avx2 vpx_filter_block1d4_v2_avg_ssse3
 #define vpx_filter_block1d4_h2_avg_avx2 vpx_filter_block1d4_h2_avg_ssse3
-#if HAVE_SSE2
-filter8_1dfunction vpx_filter_block1d16_v4_sse2;
-filter8_1dfunction vpx_filter_block1d16_h4_sse2;
-filter8_1dfunction vpx_filter_block1d8_v4_sse2;
-filter8_1dfunction vpx_filter_block1d8_h4_sse2;
-filter8_1dfunction vpx_filter_block1d4_v4_sse2;
-filter8_1dfunction vpx_filter_block1d4_h4_sse2;
-#define vpx_filter_block1d16_v4_avx2 vpx_filter_block1d16_v4_sse2
-#define vpx_filter_block1d16_h4_avx2 vpx_filter_block1d16_h4_sse2
-#define vpx_filter_block1d8_v4_avx2 vpx_filter_block1d8_v4_sse2
-#define vpx_filter_block1d8_h4_avx2 vpx_filter_block1d8_h4_sse2
-#define vpx_filter_block1d4_v4_avx2 vpx_filter_block1d4_v4_sse2
-#define vpx_filter_block1d4_h4_avx2 vpx_filter_block1d4_h4_sse2
+#if HAVE_SSSE3
+filter8_1dfunction vpx_filter_block1d16_v4_ssse3;
+filter8_1dfunction vpx_filter_block1d16_h4_ssse3;
+filter8_1dfunction vpx_filter_block1d8_v4_ssse3;
+filter8_1dfunction vpx_filter_block1d8_h4_ssse3;
+filter8_1dfunction vpx_filter_block1d4_v4_ssse3;
+filter8_1dfunction vpx_filter_block1d4_h4_ssse3;
+#define vpx_filter_block1d16_v4_avx2 vpx_filter_block1d16_v4_ssse3
+#define vpx_filter_block1d16_h4_avx2 vpx_filter_block1d16_h4_ssse3
+#define vpx_filter_block1d8_v4_avx2 vpx_filter_block1d8_v4_ssse3
+#define vpx_filter_block1d8_h4_avx2 vpx_filter_block1d8_h4_ssse3
+#define vpx_filter_block1d4_v4_avx2 vpx_filter_block1d4_v4_ssse3
+#define vpx_filter_block1d4_h4_avx2 vpx_filter_block1d4_h4_ssse3
 #else
 #define vpx_filter_block1d16_v4_avx2 vpx_filter_block1d16_v8_avx2
 #define vpx_filter_block1d16_h4_avx2 vpx_filter_block1d16_h8_avx2
diff --git a/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c b/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
index 3a19aed2e..9e5b73047 100644
--- a/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
+++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
@@ -16,6 +16,7 @@
 #include "vpx_dsp/vpx_filter.h"
 #include "vpx_dsp/x86/convolve.h"
 #include "vpx_dsp/x86/convolve_ssse3.h"
+#include "vpx_dsp/x86/convolve_sse2.h"
 #include "vpx_dsp/x86/mem_sse2.h"
 #include "vpx_dsp/x86/transpose_sse2.h"
 #include "vpx_mem/vpx_mem.h"
@@ -185,6 +186,488 @@ void vpx_filter_block1d8_v8_intrin_ssse3(
   }
 }
 
+void vpx_filter_block1d16_h4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_stride,
+                                   uint8_t *dst_ptr, ptrdiff_t dst_stride,
+                                   uint32_t height, const int16_t *kernel) {
+  // We will cast the kernel from 16-bit words to 8-bit words, and then extract
+  // the middle four elements of the kernel into two registers in the form
+  // ... k[3] k[2] k[3] k[2]
+  // ... k[5] k[4] k[5] k[4]
+  // Then we shuffle the source into
+  // ... s[1] s[0] s[0] s[-1]
+  // ... s[3] s[2] s[2] s[1]
+  // Calling multiply and add gives us half of the sum. Calling add gives us
+  // first half of the output. Repeat again to get the second half of the
+  // output. Finally we shuffle again to combine the two outputs.
+
+  __m128i kernel_reg;                         // Kernel
+  __m128i kernel_reg_23, kernel_reg_45;       // Segments of the kernel used
+  const __m128i reg_32 = _mm_set1_epi16(32);  // Used for rounding
+  int h;
+
+  __m128i src_reg, src_reg_shift_0, src_reg_shift_2;
+  __m128i dst_first, dst_second;
+  __m128i tmp_0, tmp_1;
+  __m128i idx_shift_0 =
+      _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8);
+  __m128i idx_shift_2 =
+      _mm_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10);
+
+  // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
+  src_ptr -= 1;
+
+  // Load Kernel
+  kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
+  kernel_reg = _mm_srai_epi16(kernel_reg, 1);
+  kernel_reg = _mm_packs_epi16(kernel_reg, kernel_reg);
+  kernel_reg_23 = _mm_shuffle_epi8(kernel_reg, _mm_set1_epi16(0x0302u));
+  kernel_reg_45 = _mm_shuffle_epi8(kernel_reg, _mm_set1_epi16(0x0504u));
+
+  for (h = height; h > 0; --h) {
+    // Load the source
+    src_reg = _mm_loadu_si128((const __m128i *)src_ptr);
+    src_reg_shift_0 = _mm_shuffle_epi8(src_reg, idx_shift_0);
+    src_reg_shift_2 = _mm_shuffle_epi8(src_reg, idx_shift_2);
+
+    // Partial result for first half
+    tmp_0 = _mm_maddubs_epi16(src_reg_shift_0, kernel_reg_23);
+    tmp_1 = _mm_maddubs_epi16(src_reg_shift_2, kernel_reg_45);
+    dst_first = _mm_adds_epi16(tmp_0, tmp_1);
+
+    // Do again to get the second half of dst
+    // Load the source
+    src_reg = _mm_loadu_si128((const __m128i *)(src_ptr + 8));
+    src_reg_shift_0 = _mm_shuffle_epi8(src_reg, idx_shift_0);
+    src_reg_shift_2 = _mm_shuffle_epi8(src_reg, idx_shift_2);
+
+    // Partial result for second half
+    tmp_0 = _mm_maddubs_epi16(src_reg_shift_0, kernel_reg_23);
+    tmp_1 = _mm_maddubs_epi16(src_reg_shift_2, kernel_reg_45);
+    dst_second = _mm_adds_epi16(tmp_0, tmp_1);
+
+    // Round each result
+    dst_first = round_epi16_sse2(&dst_first, &reg_32, 6);
+    dst_second = round_epi16_sse2(&dst_second, &reg_32, 6);
+
+    // Finally combine to get the final dst
+    dst_first = _mm_packus_epi16(dst_first, dst_second);
+    _mm_store_si128((__m128i *)dst_ptr, dst_first);
+
+    src_ptr += src_stride;
+    dst_ptr += dst_stride;
+  }
+}
+
+void vpx_filter_block1d16_v4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_stride,
+                                   uint8_t *dst_ptr, ptrdiff_t dst_stride,
+                                   uint32_t height, const int16_t *kernel) {
+  // We will load two rows of pixels as 8-bit words, rearrange them into the
+  // form
+  // ... s[0,1] s[-1,1] s[0,0] s[-1,0]
+  // ... s[0,9] s[-1,9] s[0,8] s[-1,8]
+  // so that we can call multiply and add with the kernel to get 16-bit words of
+  // the form
+  // ... s[0,1]k[3]+s[-1,1]k[2] s[0,0]k[3]+s[-1,0]k[2]
+  // Finally, we can add multiple rows together to get the desired output.
+
+  // Register for source s[-1:3, :]
+  __m128i src_reg_m1, src_reg_0, src_reg_1, src_reg_2, src_reg_3;
+  // Interleaved rows of the source. lo is first half, hi second
+  __m128i src_reg_m10_lo, src_reg_m10_hi, src_reg_01_lo, src_reg_01_hi;
+  __m128i src_reg_12_lo, src_reg_12_hi, src_reg_23_lo, src_reg_23_hi;
+
+  __m128i kernel_reg;                    // Kernel
+  __m128i kernel_reg_23, kernel_reg_45;  // Segments of the kernel used
+
+  // Result after multiply and add
+  __m128i res_reg_m10_lo, res_reg_01_lo, res_reg_12_lo, res_reg_23_lo;
+  __m128i res_reg_m10_hi, res_reg_01_hi, res_reg_12_hi, res_reg_23_hi;
+  __m128i res_reg_m1012, res_reg_0123;
+  __m128i res_reg_m1012_lo, res_reg_0123_lo, res_reg_m1012_hi, res_reg_0123_hi;
+
+  const __m128i reg_32 = _mm_set1_epi16(32);  // Used for rounding
+
+  // We will compute the result two rows at a time
+  const ptrdiff_t src_stride_unrolled = src_stride << 1;
+  const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
+  int h;
+
+  // We only need to go num_taps/2 - 1 row above the source, so we move
+  // 3 - (num_taps/2 - 1) = 4 - num_taps/2 = 2 back down
+  src_ptr += src_stride_unrolled;
+
+  // Load Kernel
+  kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
+  kernel_reg = _mm_srai_epi16(kernel_reg, 1);
+  kernel_reg = _mm_packs_epi16(kernel_reg, kernel_reg);
+  kernel_reg_23 = _mm_shuffle_epi8(kernel_reg, _mm_set1_epi16(0x0302u));
+  kernel_reg_45 = _mm_shuffle_epi8(kernel_reg, _mm_set1_epi16(0x0504u));
+
+  // First shuffle the data
+  src_reg_m1 = _mm_loadu_si128((const __m128i *)src_ptr);
+  src_reg_0 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride));
+  src_reg_m10_lo = _mm_unpacklo_epi8(src_reg_m1, src_reg_0);
+  src_reg_m10_hi = _mm_unpackhi_epi8(src_reg_m1, src_reg_0);
+
+  // More shuffling
+  src_reg_1 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2));
+  src_reg_01_lo = _mm_unpacklo_epi8(src_reg_0, src_reg_1);
+  src_reg_01_hi = _mm_unpackhi_epi8(src_reg_0, src_reg_1);
+
+  for (h = height; h > 1; h -= 2) {
+    src_reg_2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 3));
+
+    src_reg_12_lo = _mm_unpacklo_epi8(src_reg_1, src_reg_2);
+    src_reg_12_hi = _mm_unpackhi_epi8(src_reg_1, src_reg_2);
+
+    src_reg_3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 4));
+
+    src_reg_23_lo = _mm_unpacklo_epi8(src_reg_2, src_reg_3);
+    src_reg_23_hi = _mm_unpackhi_epi8(src_reg_2, src_reg_3);
+
+    // Partial output from first half
+    res_reg_m10_lo = _mm_maddubs_epi16(src_reg_m10_lo, kernel_reg_23);
+    res_reg_01_lo = _mm_maddubs_epi16(src_reg_01_lo, kernel_reg_23);
+
+    res_reg_12_lo = _mm_maddubs_epi16(src_reg_12_lo, kernel_reg_45);
+    res_reg_23_lo = _mm_maddubs_epi16(src_reg_23_lo, kernel_reg_45);
+
+    // Add to get first half of the results
+    res_reg_m1012_lo = _mm_adds_epi16(res_reg_m10_lo, res_reg_12_lo);
+    res_reg_0123_lo = _mm_adds_epi16(res_reg_01_lo, res_reg_23_lo);
+
+    // Partial output for second half
+    res_reg_m10_hi = _mm_maddubs_epi16(src_reg_m10_hi, kernel_reg_23);
+    res_reg_01_hi = _mm_maddubs_epi16(src_reg_01_hi, kernel_reg_23);
+
+    res_reg_12_hi = _mm_maddubs_epi16(src_reg_12_hi, kernel_reg_45);
+    res_reg_23_hi = _mm_maddubs_epi16(src_reg_23_hi, kernel_reg_45);
+
+    // Second half of the results
+    res_reg_m1012_hi = _mm_adds_epi16(res_reg_m10_hi, res_reg_12_hi);
+    res_reg_0123_hi = _mm_adds_epi16(res_reg_01_hi, res_reg_23_hi);
+
+    // Round the words
+    res_reg_m1012_lo = round_epi16_sse2(&res_reg_m1012_lo, &reg_32, 6);
+    res_reg_0123_lo = round_epi16_sse2(&res_reg_0123_lo, &reg_32, 6);
+    res_reg_m1012_hi = round_epi16_sse2(&res_reg_m1012_hi, &reg_32, 6);
+    res_reg_0123_hi = round_epi16_sse2(&res_reg_0123_hi, &reg_32, 6);
+
+    // Combine to get the result
+    res_reg_m1012 = _mm_packus_epi16(res_reg_m1012_lo, res_reg_m1012_hi);
+    res_reg_0123 = _mm_packus_epi16(res_reg_0123_lo, res_reg_0123_hi);
+
+    _mm_store_si128((__m128i *)dst_ptr, res_reg_m1012);
+    _mm_store_si128((__m128i *)(dst_ptr + dst_stride), res_reg_0123);
+
+    // Update the source by two rows
+    src_ptr += src_stride_unrolled;
+    dst_ptr += dst_stride_unrolled;
+
+    src_reg_m10_lo = src_reg_12_lo;
+    src_reg_m10_hi = src_reg_12_hi;
+    src_reg_01_lo = src_reg_23_lo;
+    src_reg_01_hi = src_reg_23_hi;
+    src_reg_1 = src_reg_3;
+  }
+}
+
+void vpx_filter_block1d8_h4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_stride,
+                                  uint8_t *dst_ptr, ptrdiff_t dst_stride,
+                                  uint32_t height, const int16_t *kernel) {
+  // We will cast the kernel from 16-bit words to 8-bit words, and then extract
+  // the middle four elements of the kernel into two registers in the form
+  // ... k[3] k[2] k[3] k[2]
+  // ... k[5] k[4] k[5] k[4]
+  // Then we shuffle the source into
+  // ... s[1] s[0] s[0] s[-1]
+  // ... s[3] s[2] s[2] s[1]
+  // Calling multiply and add gives us half of the sum. Calling add gives us
+  // first half of the output. Repeat again to get the second half of the
+  // output. Finally we shuffle again to combine the two outputs.
+
+  __m128i kernel_reg;                         // Kernel
+  __m128i kernel_reg_23, kernel_reg_45;       // Segments of the kernel used
+  const __m128i reg_32 = _mm_set1_epi16(32);  // Used for rounding
+  int h;
+
+  __m128i src_reg, src_reg_shift_0, src_reg_shift_2;
+  __m128i dst_first;
+  __m128i tmp_0, tmp_1;
+  __m128i idx_shift_0 =
+      _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8);
+  __m128i idx_shift_2 =
+      _mm_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10);
+
+  // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
+  src_ptr -= 1;
+
+  // Load Kernel
+  kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
+  kernel_reg = _mm_srai_epi16(kernel_reg, 1);
+  kernel_reg = _mm_packs_epi16(kernel_reg, kernel_reg);
+  kernel_reg_23 = _mm_shuffle_epi8(kernel_reg, _mm_set1_epi16(0x0302u));
+  kernel_reg_45 = _mm_shuffle_epi8(kernel_reg, _mm_set1_epi16(0x0504u));
+
+  for (h = height; h > 0; --h) {
+    // Load the source
+    src_reg = _mm_loadu_si128((const __m128i *)src_ptr);
+    src_reg_shift_0 = _mm_shuffle_epi8(src_reg, idx_shift_0);
+    src_reg_shift_2 = _mm_shuffle_epi8(src_reg, idx_shift_2);
+
+    // Get the result
+    tmp_0 = _mm_maddubs_epi16(src_reg_shift_0, kernel_reg_23);
+    tmp_1 = _mm_maddubs_epi16(src_reg_shift_2, kernel_reg_45);
+    dst_first = _mm_adds_epi16(tmp_0, tmp_1);
+
+    // Round the result
+    dst_first = round_epi16_sse2(&dst_first, &reg_32, 6);
+
+    // Pack to 8-bits
+    dst_first = _mm_packus_epi16(dst_first, _mm_setzero_si128());
+    _mm_storel_epi64((__m128i *)dst_ptr, dst_first);
+
+    src_ptr += src_stride;
+    dst_ptr += dst_stride;
+  }
+}
+
+void vpx_filter_block1d8_v4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_stride,
+                                  uint8_t *dst_ptr, ptrdiff_t dst_stride,
+                                  uint32_t height, const int16_t *kernel) {
+  // We will load two rows of pixels as 8-bit words, rearrange them into the
+  // form
+  // ... s[0,1] s[-1,1] s[0,0] s[-1,0]
+  // so that we can call multiply and add with the kernel to get 16-bit words of
+  // the form
+  // ... s[0,1]k[3]+s[-1,1]k[2] s[0,0]k[3]+s[-1,0]k[2]
+  // Finally, we can add multiple rows together to get the desired output.
+
+  // Register for source s[-1:3, :]
+  __m128i src_reg_m1, src_reg_0, src_reg_1, src_reg_2, src_reg_3;
+  // Interleaved rows of the source.
+  __m128i src_reg_m10, src_reg_01;
+  __m128i src_reg_12, src_reg_23;
+
+  __m128i kernel_reg;                    // Kernel
+  __m128i kernel_reg_23, kernel_reg_45;  // Segments of the kernel used
+
+  // Result after multiply and add
+  __m128i res_reg_m10, res_reg_01, res_reg_12, res_reg_23;
+  __m128i res_reg_m1012, res_reg_0123;
+
+  const __m128i reg_32 = _mm_set1_epi16(32);  // Used for rounding
+
+  // We will compute the result two rows at a time
+  const ptrdiff_t src_stride_unrolled = src_stride << 1;
+  const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
+  int h;
+
+  // We only need to go num_taps/2 - 1 row above the source, so we move
+  // 3 - (num_taps/2 - 1) = 4 - num_taps/2 = 2 back down
+  src_ptr += src_stride_unrolled;
+
+  // Load Kernel
+  kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
+  kernel_reg = _mm_srai_epi16(kernel_reg, 1);
+  kernel_reg = _mm_packs_epi16(kernel_reg, kernel_reg);
+  kernel_reg_23 = _mm_shuffle_epi8(kernel_reg, _mm_set1_epi16(0x0302u));
+  kernel_reg_45 = _mm_shuffle_epi8(kernel_reg, _mm_set1_epi16(0x0504u));
+
+  // First shuffle the data
+  src_reg_m1 = _mm_loadl_epi64((const __m128i *)src_ptr);
+  src_reg_0 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride));
+  src_reg_m10 = _mm_unpacklo_epi8(src_reg_m1, src_reg_0);
+
+  // More shuffling
+  src_reg_1 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 2));
+  src_reg_01 = _mm_unpacklo_epi8(src_reg_0, src_reg_1);
+
+  for (h = height; h > 1; h -= 2) {
+    src_reg_2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 3));
+
+    src_reg_12 = _mm_unpacklo_epi8(src_reg_1, src_reg_2);
+
+    src_reg_3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 4));
+
+    src_reg_23 = _mm_unpacklo_epi8(src_reg_2, src_reg_3);
+
+    // Partial output
+    res_reg_m10 = _mm_maddubs_epi16(src_reg_m10, kernel_reg_23);
+    res_reg_01 = _mm_maddubs_epi16(src_reg_01, kernel_reg_23);
+
+    res_reg_12 = _mm_maddubs_epi16(src_reg_12, kernel_reg_45);
+    res_reg_23 = _mm_maddubs_epi16(src_reg_23, kernel_reg_45);
+
+    // Add to get entire output
+    res_reg_m1012 = _mm_adds_epi16(res_reg_m10, res_reg_12);
+    res_reg_0123 = _mm_adds_epi16(res_reg_01, res_reg_23);
+
+    // Round the words
+    res_reg_m1012 = round_epi16_sse2(&res_reg_m1012, &reg_32, 6);
+    res_reg_0123 = round_epi16_sse2(&res_reg_0123, &reg_32, 6);
+
+    // Pack from 16-bit to 8-bit
+    res_reg_m1012 = _mm_packus_epi16(res_reg_m1012, _mm_setzero_si128());
+    res_reg_0123 = _mm_packus_epi16(res_reg_0123, _mm_setzero_si128());
+
+    _mm_storel_epi64((__m128i *)dst_ptr, res_reg_m1012);
+    _mm_storel_epi64((__m128i *)(dst_ptr + dst_stride), res_reg_0123);
+
+    // Update the source by two rows
+    src_ptr += src_stride_unrolled;
+    dst_ptr += dst_stride_unrolled;
+
+    src_reg_m10 = src_reg_12;
+    src_reg_01 = src_reg_23;
+    src_reg_1 = src_reg_3;
+  }
+}
+
+void vpx_filter_block1d4_h4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_stride,
+                                  uint8_t *dst_ptr, ptrdiff_t dst_stride,
+                                  uint32_t height, const int16_t *kernel) {
+  // We will cast the kernel from 16-bit words to 8-bit words, and then extract
+  // the middle four elements of the kernel into a single register in the form
+  // k[5:2] k[5:2] k[5:2] k[5:2]
+  // Then we shuffle the source into
+  // s[5:2] s[4:1] s[3:0] s[2:-1]
+  // Calling multiply and add gives us half of the sum next to each other.
+  // Calling horizontal add then gives us the output.
+
+  __m128i kernel_reg;                         // Kernel
+  const __m128i reg_32 = _mm_set1_epi16(32);  // Used for rounding
+  int h;
+
+  __m128i src_reg, src_reg_shuf;
+  __m128i dst_first;
+  __m128i shuf_idx =
+      _mm_setr_epi8(0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6);
+
+  // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
+  src_ptr -= 1;
+
+  // Load Kernel
+  kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
+  kernel_reg = _mm_srai_epi16(kernel_reg, 1);
+  kernel_reg = _mm_packs_epi16(kernel_reg, kernel_reg);
+  kernel_reg = _mm_shuffle_epi8(kernel_reg, _mm_set1_epi32(0x05040302u));
+
+  for (h = height; h > 0; --h) {
+    // Load the source
+    src_reg = _mm_loadu_si128((const __m128i *)src_ptr);
+    src_reg_shuf = _mm_shuffle_epi8(src_reg, shuf_idx);
+
+    // Get the result
+    dst_first = _mm_maddubs_epi16(src_reg_shuf, kernel_reg);
+    dst_first = _mm_hadds_epi16(dst_first, _mm_setzero_si128());
+
+    // Round result
+    dst_first = round_epi16_sse2(&dst_first, &reg_32, 6);
+
+    // Pack to 8-bits
+    dst_first = _mm_packus_epi16(dst_first, _mm_setzero_si128());
+    *((uint32_t *)(dst_ptr)) = _mm_cvtsi128_si32(dst_first);
+
+    src_ptr += src_stride;
+    dst_ptr += dst_stride;
+  }
+}
+
+void vpx_filter_block1d4_v4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_stride,
+                                  uint8_t *dst_ptr, ptrdiff_t dst_stride,
+                                  uint32_t height, const int16_t *kernel) {
+  // We will load two rows of pixels as 8-bit words, rearrange them into the
+  // form
+  // ... s[2,0] s[1,0] s[0,0] s[-1,0]
+  // so that we can call multiply and add with the kernel partial output. Then
+  // we can call horizontal add to get the output.
+  // Finally, we can add multiple rows together to get the desired output.
+  // This is done two rows at a time
+
+  // Register for source s[-1:3, :]
+  __m128i src_reg_m1, src_reg_0, src_reg_1, src_reg_2, src_reg_3;
+  // Interleaved rows of the source.
+  __m128i src_reg_m10, src_reg_01;
+  __m128i src_reg_12, src_reg_23;
+  __m128i src_reg_m1001, src_reg_1223;
+  __m128i src_reg_m1012_1023_lo, src_reg_m1012_1023_hi;
+
+  __m128i kernel_reg;  // Kernel
+
+  // Result after multiply and add
+  __m128i reg_0, reg_1;
+
+  const __m128i reg_32 = _mm_set1_epi16(32);  // Used for rounding
+
+  // We will compute the result two rows at a time
+  const ptrdiff_t src_stride_unrolled = src_stride << 1;
+  const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
+  int h;
+
+  // We only need to go num_taps/2 - 1 row above the source, so we move
+  // 3 - (num_taps/2 - 1) = 4 - num_taps/2 = 2 back down
+  src_ptr += src_stride_unrolled;
+
+  // Load Kernel
+  kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
+  kernel_reg = _mm_srai_epi16(kernel_reg, 1);
+  kernel_reg = _mm_packs_epi16(kernel_reg, kernel_reg);
+  kernel_reg = _mm_shuffle_epi8(kernel_reg, _mm_set1_epi32(0x05040302u));
+
+  // First shuffle the data
+  src_reg_m1 = _mm_loadl_epi64((const __m128i *)src_ptr);
+  src_reg_0 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride));
+  src_reg_m10 = _mm_unpacklo_epi32(src_reg_m1, src_reg_0);
+
+  // More shuffling
+  src_reg_1 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 2));
+  src_reg_01 = _mm_unpacklo_epi32(src_reg_0, src_reg_1);
+
+  // Put three rows next to each other
+  src_reg_m1001 = _mm_unpacklo_epi8(src_reg_m10, src_reg_01);
+
+  for (h = height; h > 1; h -= 2) {
+    src_reg_2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 3));
+    src_reg_12 = _mm_unpacklo_epi32(src_reg_1, src_reg_2);
+
+    src_reg_3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 4));
+    src_reg_23 = _mm_unpacklo_epi32(src_reg_2, src_reg_3);
+
+    // Put three rows next to each other
+    src_reg_1223 = _mm_unpacklo_epi8(src_reg_12, src_reg_23);
+
+    // Put all four rows next to each other
+    src_reg_m1012_1023_lo = _mm_unpacklo_epi16(src_reg_m1001, src_reg_1223);
+    src_reg_m1012_1023_hi = _mm_unpackhi_epi16(src_reg_m1001, src_reg_1223);
+
+    // Get the results
+    reg_0 = _mm_maddubs_epi16(src_reg_m1012_1023_lo, kernel_reg);
+    reg_1 = _mm_maddubs_epi16(src_reg_m1012_1023_hi, kernel_reg);
+    reg_0 = _mm_hadds_epi16(reg_0, _mm_setzero_si128());
+    reg_1 = _mm_hadds_epi16(reg_1, _mm_setzero_si128());
+
+    // Round the words
+    reg_0 = round_epi16_sse2(&reg_0, &reg_32, 6);
+    reg_1 = round_epi16_sse2(&reg_1, &reg_32, 6);
+
+    // Pack from 16-bit to 8-bit and put them in the right order
+    reg_0 = _mm_packus_epi16(reg_0, reg_0);
+    reg_1 = _mm_packus_epi16(reg_1, reg_1);
+
+    // Save the result
+    *((uint32_t *)(dst_ptr)) = _mm_cvtsi128_si32(reg_0);
+    *((uint32_t *)(dst_ptr + dst_stride)) = _mm_cvtsi128_si32(reg_1);
+
+    // Update the source by two rows
+    src_ptr += src_stride_unrolled;
+    dst_ptr += dst_stride_unrolled;
+
+    src_reg_m1001 = src_reg_1223;
+    src_reg_1 = src_reg_3;
+  }
+}
+
 filter8_1dfunction vpx_filter_block1d16_v8_ssse3;
 filter8_1dfunction vpx_filter_block1d16_h8_ssse3;
 filter8_1dfunction vpx_filter_block1d8_v8_ssse3;
@@ -198,27 +681,6 @@ filter8_1dfunction vpx_filter_block1d8_h8_avg_ssse3;
 filter8_1dfunction vpx_filter_block1d4_v8_avg_ssse3;
 filter8_1dfunction vpx_filter_block1d4_h8_avg_ssse3;
 
-#if HAVE_SSE2
-filter8_1dfunction vpx_filter_block1d16_v4_sse2;
-filter8_1dfunction vpx_filter_block1d16_h4_sse2;
-filter8_1dfunction vpx_filter_block1d8_v4_sse2;
-filter8_1dfunction vpx_filter_block1d8_h4_sse2;
-filter8_1dfunction vpx_filter_block1d4_v4_sse2;
-filter8_1dfunction vpx_filter_block1d4_h4_sse2;
-#define vpx_filter_block1d16_v4_ssse3 vpx_filter_block1d16_v4_sse2
-#define vpx_filter_block1d16_h4_ssse3 vpx_filter_block1d16_h4_sse2
-#define vpx_filter_block1d8_v4_ssse3 vpx_filter_block1d8_v4_sse2
-#define vpx_filter_block1d8_h4_ssse3 vpx_filter_block1d8_h4_sse2
-#define vpx_filter_block1d4_v4_ssse3 vpx_filter_block1d4_v4_sse2
-#define vpx_filter_block1d4_h4_ssse3 vpx_filter_block1d4_h4_sse2
-#else
-#define vpx_filter_block1d16_v4_ssse3 vpx_filter_block1d16_v8_ssse3
-#define vpx_filter_block1d16_h4_ssse3 vpx_filter_block1d16_h8_ssse3
-#define vpx_filter_block1d8_v4_ssse3 vpx_filter_block1d8_v8_ssse3
-#define vpx_filter_block1d8_h4_ssse3 vpx_filter_block1d8_h8_ssse3
-#define vpx_filter_block1d4_v4_ssse3 vpx_filter_block1d4_v8_ssse3
-#define vpx_filter_block1d4_h4_ssse3 vpx_filter_block1d4_h8_ssse3
-#endif
 #define vpx_filter_block1d16_v4_avg_ssse3 vpx_filter_block1d16_v8_avg_ssse3
 #define vpx_filter_block1d16_h4_avg_ssse3 vpx_filter_block1d16_h8_avg_ssse3
 #define vpx_filter_block1d8_v4_avg_ssse3 vpx_filter_block1d8_v8_avg_ssse3
-- 
2.40.0