From 62830c53a644f5feaa49431b39c85093d2e387fc Mon Sep 17 00:00:00 2001
From: chiyotsai
Date: Tue, 16 Oct 2018 12:26:34 -0700
Subject: [PATCH] Refactor SSE2 code for 4-tap interpolation filter on width 16.

Some repeated code is refactored into inline functions. No performance
degradation is observed. These inline functions can be reused for width 8
and width 4.

Change-Id: Ibf08cc9ebd2dd47bd2a6c2bcc1616f9d4c252d4d
---
 vpx_dsp/vpx_dsp.mk                        |   2 +
 vpx_dsp/x86/convolve_sse2.h               |  67 +++++++++++++
 vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c | 115 ++++++++--------------
 3 files changed, 109 insertions(+), 75 deletions(-)
 create mode 100644 vpx_dsp/x86/convolve_sse2.h

diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk
index fe20bab03..08ab9a128 100644
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -89,6 +89,8 @@ DSP_SRCS-yes += vpx_filter.h
 DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64) += x86/convolve.h
 DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64) += x86/vpx_asm_stubs.c
+
+DSP_SRCS-$(HAVE_SSE2) += x86/convolve_sse2.h
 DSP_SRCS-$(HAVE_SSSE3) += x86/convolve_ssse3.h
 DSP_SRCS-$(HAVE_AVX2) += x86/convolve_avx2.h
 DSP_SRCS-$(HAVE_SSE2) += x86/vpx_subpixel_8t_sse2.asm
diff --git a/vpx_dsp/x86/convolve_sse2.h b/vpx_dsp/x86/convolve_sse2.h
new file mode 100644
index 000000000..d674cc495
--- /dev/null
+++ b/vpx_dsp/x86/convolve_sse2.h
@@ -0,0 +1,67 @@
+/*
+ *  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_X86_CONVOLVE_SSE2_H_
+#define VPX_VPX_DSP_X86_CONVOLVE_SSE2_H_
+
+#include <assert.h>
+#include <emmintrin.h>  // SSE2
+
+#include "./vpx_config.h"
+
+// Interprets the input register as 16-bit words 7 6 5 4 3 2 1 0, then
+// duplicates the 16-bit words at indices 2 and 3 to return 3 2 3 2 3 2 3 2.
+static INLINE __m128i extract_quarter_2_epi16_sse2(const __m128i *const reg) {
+  __m128i tmp = _mm_unpacklo_epi32(*reg, *reg);
+  return _mm_unpackhi_epi64(tmp, tmp);
+}
+
+// Interprets the input register as 16-bit words 7 6 5 4 3 2 1 0, then
+// duplicates the 16-bit words at indices 4 and 5 to return 5 4 5 4 5 4 5 4.
+static INLINE __m128i extract_quarter_3_epi16_sse2(const __m128i *const reg) {
+  __m128i tmp = _mm_unpackhi_epi32(*reg, *reg);
+  return _mm_unpacklo_epi64(tmp, tmp);
+}
+
+// Interprets src as 8-bit words, zero-extends each word to a 16-bit word,
+// then multiplies with ker and adds adjacent results to form 32-bit words.
+// Finally, adds the results from src_1 and src_2 together.
+static INLINE __m128i pad_multiply_add_add_epi8_sse2(
+    const __m128i *const src_1, const __m128i *const src_2,
+    const __m128i *const ker_1, const __m128i *const ker_2) {
+  const __m128i src_1_half = _mm_unpacklo_epi8(*src_1, _mm_setzero_si128());
+  const __m128i src_2_half = _mm_unpacklo_epi8(*src_2, _mm_setzero_si128());
+  const __m128i madd_1 = _mm_madd_epi16(src_1_half, *ker_1);
+  const __m128i madd_2 = _mm_madd_epi16(src_2_half, *ker_2);
+  return _mm_add_epi32(madd_1, madd_2);
+}
+
+static INLINE __m128i multiply_add_packs_epi16_sse2(const __m128i *const src_0,
+                                                    const __m128i *const src_1,
+                                                    const __m128i *const ker) {
+  const __m128i madd_1 = _mm_madd_epi16(*src_0, *ker);
+  const __m128i madd_2 = _mm_madd_epi16(*src_1, *ker);
+  return _mm_packs_epi32(madd_1, madd_2);
+}
+
+static INLINE __m128i combine_epi32_sse2(const __m128i *const src_1,
+                                         const __m128i *const src_2) {
+  const __m128i tmp_1 = _mm_unpacklo_epi32(*src_1, *src_2);
+  const __m128i tmp_2 = _mm_unpackhi_epi32(*src_1, *src_2);
+  return _mm_packs_epi32(tmp_1, tmp_2);
+}
+
+static INLINE __m128i round_epi16_sse2(const __m128i *const src,
+                                       const __m128i *const half_depth,
+                                       const int depth) {
+  const __m128i nearest_src = _mm_adds_epi16(*src, *half_depth);
+  return _mm_srai_epi16(nearest_src, depth);
+}
+
+#endif  // VPX_VPX_DSP_X86_CONVOLVE_SSE2_H_
diff --git a/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c b/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c
index 6238e0e7a..8b44c4989 100644
--- a/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c
+++ b/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c
@@ -13,6 +13,7 @@
 #include "./vpx_dsp_rtcd.h"
 #include "vpx/vpx_integer.h"
 #include "vpx_dsp/x86/convolve.h"
+#include "vpx_dsp/x86/convolve_sse2.h"
 #include "vpx_ports/mem.h"
 
 void vpx_filter_block1d16_h4_sse2(const uint8_t *src_ptr, ptrdiff_t src_stride,
@@ -26,8 +27,6 @@ void vpx_filter_block1d16_h4_sse2(const uint8_t *src_ptr, ptrdiff_t src_stride,
   __m128i src_reg, src_reg_shift_1, src_reg_shift_2, src_reg_shift_3;
   __m128i dst_first, dst_second;
   __m128i even, odd;
-  __m128i tmp_1, tmp_2;
-  __m128i madd_1, madd_2;
 
   // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
   src_ptr -= 1;
@@ -35,10 +34,8 @@ void vpx_filter_block1d16_h4_sse2(const uint8_t *src_ptr, ptrdiff_t src_stride,
   // Load Kernel
   kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
   kernel_reg = _mm_srai_epi16(kernel_reg, 1);
-  tmp_1 = _mm_unpacklo_epi32(kernel_reg, kernel_reg);
-  kernel_reg_23 = _mm_unpackhi_epi64(tmp_1, tmp_1);
-  tmp_2 = _mm_unpackhi_epi32(kernel_reg, kernel_reg);
-  kernel_reg_45 = _mm_unpacklo_epi64(tmp_2, tmp_2);
+  kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
+  kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg);
 
   for (h = height; h > 0; --h) {
     // We will load multiple shifted versions of the row and shuffle them into
@@ -57,23 +54,15 @@ void vpx_filter_block1d16_h4_sse2(const uint8_t *src_ptr, ptrdiff_t src_stride,
     src_reg_shift_3 = _mm_srli_si128(src_reg, 3);
 
     // Output 6 4 2 0
-    tmp_1 = _mm_unpacklo_epi8(src_reg, _mm_setzero_si128());
-    tmp_2 = _mm_unpacklo_epi8(src_reg_shift_2, _mm_setzero_si128());
-    madd_1 = _mm_madd_epi16(tmp_1, kernel_reg_23);
-    madd_2 = _mm_madd_epi16(tmp_2, kernel_reg_45);
-    even = _mm_add_epi32(madd_1, madd_2);
+    even = pad_multiply_add_add_epi8_sse2(&src_reg, &src_reg_shift_2,
+                                          &kernel_reg_23, &kernel_reg_45);
 
     // Output 7 5 3 1
-    tmp_1 = _mm_unpacklo_epi8(src_reg_shift_1, _mm_setzero_si128());
-    tmp_2 = _mm_unpacklo_epi8(src_reg_shift_3, _mm_setzero_si128());
-    madd_1 = _mm_madd_epi16(tmp_1, kernel_reg_23);
-    madd_2 = _mm_madd_epi16(tmp_2, kernel_reg_45);
-    odd = _mm_add_epi32(madd_1, madd_2);
+    odd = pad_multiply_add_add_epi8_sse2(&src_reg_shift_1, &src_reg_shift_3,
+                                         &kernel_reg_23, &kernel_reg_45);
 
     // Combine to get the first half of the dst
-    tmp_1 = _mm_unpacklo_epi32(even, odd);
-    tmp_2 = _mm_unpackhi_epi32(even, odd);
-    dst_first = _mm_packs_epi32(tmp_1, tmp_2);
+    dst_first = combine_epi32_sse2(&even, &odd);
 
     // Do again to get the second half of dst
     src_reg = _mm_loadu_si128((const __m128i *)(src_ptr + 8));
@@ -82,29 +71,19 @@ void vpx_filter_block1d16_h4_sse2(const uint8_t *src_ptr, ptrdiff_t src_stride,
     src_reg_shift_1 = _mm_srli_si128(src_reg, 1);
     src_reg_shift_2 = _mm_srli_si128(src_reg, 2);
     src_reg_shift_3 = _mm_srli_si128(src_reg, 3);
 
     // Output 14 12 10 8
-    tmp_1 = _mm_unpacklo_epi8(src_reg, _mm_setzero_si128());
-    tmp_2 = _mm_unpacklo_epi8(src_reg_shift_2, _mm_setzero_si128());
-    madd_1 = _mm_madd_epi16(tmp_1, kernel_reg_23);
-    madd_2 = _mm_madd_epi16(tmp_2, kernel_reg_45);
-    even = _mm_add_epi32(madd_1, madd_2);
+    even = pad_multiply_add_add_epi8_sse2(&src_reg, &src_reg_shift_2,
+                                          &kernel_reg_23, &kernel_reg_45);
 
     // Output 15 13 11 9
-    tmp_1 = _mm_unpacklo_epi8(src_reg_shift_1, _mm_setzero_si128());
-    tmp_2 = _mm_unpacklo_epi8(src_reg_shift_3, _mm_setzero_si128());
-    madd_1 = _mm_madd_epi16(tmp_1, kernel_reg_23);
-    madd_2 = _mm_madd_epi16(tmp_2, kernel_reg_45);
-    odd = _mm_add_epi32(madd_1, madd_2);
+    odd = pad_multiply_add_add_epi8_sse2(&src_reg_shift_1, &src_reg_shift_3,
+                                         &kernel_reg_23, &kernel_reg_45);
 
     // Combine to get the second half of the dst
-    tmp_1 = _mm_unpacklo_epi32(even, odd);
-    tmp_2 = _mm_unpackhi_epi32(even, odd);
-    dst_second = _mm_packs_epi32(tmp_1, tmp_2);
+    dst_second = combine_epi32_sse2(&even, &odd);
 
     // Round each result
-    dst_first = _mm_adds_epi16(dst_first, reg_32);
-    dst_first = _mm_srai_epi16(dst_first, 6);
-    dst_second = _mm_adds_epi16(dst_second, reg_32);
-    dst_second = _mm_srai_epi16(dst_second, 6);
+    dst_first = round_epi16_sse2(&dst_first, &reg_32, 6);
+    dst_second = round_epi16_sse2(&dst_second, &reg_32, 6);
 
     // Finally combine to get the final dst
     dst_first = _mm_packus_epi16(dst_first, dst_second);
@@ -143,7 +122,6 @@ void vpx_filter_block1d16_v4_sse2(const uint8_t *src_ptr, ptrdiff_t src_stride,
   __m128i res_reg_m1012_lo, res_reg_0123_lo, res_reg_m1012_hi, res_reg_0123_hi;
 
   const __m128i reg_32 = _mm_set1_epi16(32);  // Used for rounding
-  __m128i tmp_0, tmp_1;
 
   // We will compute the result two rows at a time
   const ptrdiff_t src_stride_unrolled = src_stride << 1;
@@ -157,13 +135,12 @@ void vpx_filter_block1d16_v4_sse2(const uint8_t *src_ptr, ptrdiff_t src_stride,
   // Load Kernel
   kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
   kernel_reg = _mm_srai_epi16(kernel_reg, 1);
-  tmp_0 = _mm_unpacklo_epi32(kernel_reg, kernel_reg);
-  kernel_reg_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
-  tmp_1 = _mm_unpackhi_epi32(kernel_reg, kernel_reg);
-  kernel_reg_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
+  kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
+  kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg);
 
   // We will load two rows of pixels as 8-bit words, rearrange them as 16-bit
-  // words, shuffle the data into the form
+  // words,
+  // shuffle the data into the form
   // ... s[0,1] s[-1,1] s[0,0] s[-1,0]
   // ... s[0,7] s[-1,7] s[0,6] s[-1,6]
   // ... s[0,9] s[-1,9] s[0,8] s[-1,8]
@@ -204,25 +181,21 @@ void vpx_filter_block1d16_v4_sse2(const uint8_t *src_ptr, ptrdiff_t src_stride,
     src_reg_23_hi = _mm_unpackhi_epi8(src_reg_2, src_reg_3);
 
     // Partial output from first half
-    tmp_0 = _mm_madd_epi16(src_reg_m10_lo_1, kernel_reg_23);
-    tmp_1 = _mm_madd_epi16(src_reg_m10_lo_2, kernel_reg_23);
-    res_reg_m10_lo = _mm_packs_epi32(tmp_0, tmp_1);
+    res_reg_m10_lo = multiply_add_packs_epi16_sse2(
+        &src_reg_m10_lo_1, &src_reg_m10_lo_2, &kernel_reg_23);
 
-    tmp_0 = _mm_madd_epi16(src_reg_01_lo_1, kernel_reg_23);
-    tmp_1 = _mm_madd_epi16(src_reg_01_lo_2, kernel_reg_23);
-    res_reg_01_lo = _mm_packs_epi32(tmp_0, tmp_1);
+    res_reg_01_lo = multiply_add_packs_epi16_sse2(
+        &src_reg_01_lo_1, &src_reg_01_lo_2, &kernel_reg_23);
 
     src_reg_12_lo_1 = _mm_unpacklo_epi8(src_reg_12_lo, _mm_setzero_si128());
     src_reg_12_lo_2 = _mm_unpackhi_epi8(src_reg_12_lo, _mm_setzero_si128());
-    tmp_0 = _mm_madd_epi16(src_reg_12_lo_1, kernel_reg_45);
-    tmp_1 = _mm_madd_epi16(src_reg_12_lo_2, kernel_reg_45);
-    res_reg_12_lo = _mm_packs_epi32(tmp_0, tmp_1);
+    res_reg_12_lo = multiply_add_packs_epi16_sse2(
+        &src_reg_12_lo_1, &src_reg_12_lo_2, &kernel_reg_45);
 
     src_reg_23_lo_1 = _mm_unpacklo_epi8(src_reg_23_lo, _mm_setzero_si128());
     src_reg_23_lo_2 = _mm_unpackhi_epi8(src_reg_23_lo, _mm_setzero_si128());
-    tmp_0 = _mm_madd_epi16(src_reg_23_lo_1, kernel_reg_45);
-    tmp_1 = _mm_madd_epi16(src_reg_23_lo_2, kernel_reg_45);
-    res_reg_23_lo = _mm_packs_epi32(tmp_0, tmp_1);
+    res_reg_23_lo = multiply_add_packs_epi16_sse2(
+        &src_reg_23_lo_1, &src_reg_23_lo_2, &kernel_reg_45);
 
     // Add to get first half of the results
     res_reg_m1012_lo = _mm_adds_epi16(res_reg_m10_lo, res_reg_12_lo);
@@ -230,39 +203,31 @@ void vpx_filter_block1d16_v4_sse2(const uint8_t *src_ptr, ptrdiff_t src_stride,
 
     // Now repeat everything again for the second half
     // Partial output for second half
-    tmp_0 = _mm_madd_epi16(src_reg_m10_hi_1, kernel_reg_23);
-    tmp_1 = _mm_madd_epi16(src_reg_m10_hi_2, kernel_reg_23);
-    res_reg_m10_hi = _mm_packs_epi32(tmp_0, tmp_1);
+    res_reg_m10_hi = multiply_add_packs_epi16_sse2(
+        &src_reg_m10_hi_1, &src_reg_m10_hi_2, &kernel_reg_23);
 
-    tmp_0 = _mm_madd_epi16(src_reg_01_hi_1, kernel_reg_23);
-    tmp_1 = _mm_madd_epi16(src_reg_01_hi_2, kernel_reg_23);
-    res_reg_01_hi = _mm_packs_epi32(tmp_0, tmp_1);
+    res_reg_01_hi = multiply_add_packs_epi16_sse2(
+        &src_reg_01_hi_1, &src_reg_01_hi_2, &kernel_reg_23);
 
     src_reg_12_hi_1 = _mm_unpacklo_epi8(src_reg_12_hi, _mm_setzero_si128());
     src_reg_12_hi_2 = _mm_unpackhi_epi8(src_reg_12_hi, _mm_setzero_si128());
-    tmp_0 = _mm_madd_epi16(src_reg_12_hi_1, kernel_reg_45);
-    tmp_1 = _mm_madd_epi16(src_reg_12_hi_2, kernel_reg_45);
-    res_reg_12_hi = _mm_packs_epi32(tmp_0, tmp_1);
+    res_reg_12_hi = multiply_add_packs_epi16_sse2(
+        &src_reg_12_hi_1, &src_reg_12_hi_2, &kernel_reg_45);
 
     src_reg_23_hi_1 = _mm_unpacklo_epi8(src_reg_23_hi, _mm_setzero_si128());
    src_reg_23_hi_2 = _mm_unpackhi_epi8(src_reg_23_hi, _mm_setzero_si128());
-    tmp_0 = _mm_madd_epi16(src_reg_23_hi_1, kernel_reg_45);
-    tmp_1 = _mm_madd_epi16(src_reg_23_hi_2, kernel_reg_45);
-    res_reg_23_hi = _mm_packs_epi32(tmp_0, tmp_1);
+    res_reg_23_hi = multiply_add_packs_epi16_sse2(
+        &src_reg_23_hi_1, &src_reg_23_hi_2, &kernel_reg_45);
 
-    // First half of the results
+    // Second half of the results
    res_reg_m1012_hi = _mm_adds_epi16(res_reg_m10_hi, res_reg_12_hi);
     res_reg_0123_hi = _mm_adds_epi16(res_reg_01_hi, res_reg_23_hi);
 
     // Round the words
-    res_reg_m1012_lo = _mm_adds_epi16(res_reg_m1012_lo, reg_32);
-    res_reg_0123_lo = _mm_adds_epi16(res_reg_0123_lo, reg_32);
-    res_reg_m1012_hi = _mm_adds_epi16(res_reg_m1012_hi, reg_32);
-    res_reg_0123_hi = _mm_adds_epi16(res_reg_0123_hi, reg_32);
-    res_reg_m1012_lo = _mm_srai_epi16(res_reg_m1012_lo, 6);
-    res_reg_0123_lo = _mm_srai_epi16(res_reg_0123_lo, 6);
-    res_reg_m1012_hi = _mm_srai_epi16(res_reg_m1012_hi, 6);
-    res_reg_0123_hi = _mm_srai_epi16(res_reg_0123_hi, 6);
+    res_reg_m1012_lo = round_epi16_sse2(&res_reg_m1012_lo, &reg_32, 6);
+    res_reg_0123_lo = round_epi16_sse2(&res_reg_0123_lo, &reg_32, 6);
+    res_reg_m1012_hi = round_epi16_sse2(&res_reg_m1012_hi, &reg_32, 6);
+    res_reg_0123_hi = round_epi16_sse2(&res_reg_0123_hi, &reg_32, 6);
 
     // Combine to get the result
     res_reg_m1012 = _mm_packus_epi16(res_reg_m1012_lo, res_reg_m1012_hi);
-- 
2.49.0
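
The new helpers are self-contained enough to sanity-check outside the library.
Below is a minimal standalone sketch, not part of the patch: it copies the
bodies of extract_quarter_2_epi16_sse2 and round_epi16_sse2 (with libvpx's
INLINE macro replaced by plain C99 inline) and prints their results for known
inputs. The file name demo.c is arbitrary; build with e.g. gcc -msse2 demo.c.

/* demo.c -- standalone sanity check for two of the helpers above.
 * Not part of the patch; INLINE is replaced by plain C99 inline. */
#include <emmintrin.h>  // SSE2
#include <stdio.h>

static inline __m128i extract_quarter_2_epi16_sse2(const __m128i *const reg) {
  __m128i tmp = _mm_unpacklo_epi32(*reg, *reg);
  return _mm_unpackhi_epi64(tmp, tmp);
}

static inline __m128i round_epi16_sse2(const __m128i *const src,
                                       const __m128i *const half_depth,
                                       const int depth) {
  const __m128i nearest_src = _mm_adds_epi16(*src, *half_depth);
  return _mm_srai_epi16(nearest_src, depth);
}

int main(void) {
  short out[8];

  // Register holds 16-bit words 7 6 5 4 3 2 1 0 (word 0 in the low lane);
  // quarter 2 duplicates words 3 and 2 to give 3 2 3 2 3 2 3 2.
  const __m128i words = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
  const __m128i q2 = extract_quarter_2_epi16_sse2(&words);
  _mm_storeu_si128((__m128i *)out, q2);
  printf("quarter 2 (low to high): %d %d %d %d %d %d %d %d\n", out[0], out[1],
         out[2], out[3], out[4], out[5], out[6], out[7]);  // 2 3 2 3 2 3 2 3

  // round_epi16_sse2 adds half_depth (saturating) and arithmetic-shifts right
  // by depth, i.e. round-to-nearest division by 1 << depth:
  // (100 + 32) >> 6 == 2 == round(100 / 64).
  const __m128i reg_32 = _mm_set1_epi16(32);
  const __m128i vals = _mm_set1_epi16(100);
  const __m128i rounded = round_epi16_sse2(&vals, &reg_32, 6);
  _mm_storeu_si128((__m128i *)out, rounded);
  printf("rounded: %d\n", out[0]);  // 2
  return 0;
}

The same pattern (copy one helper, feed it constants, print the lanes) extends
to pad_multiply_add_add_epi8_sse2 and multiply_add_packs_epi16_sse2 if the
word ordering or packing behavior ever needs to be re-verified.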