From 5a34e0eb89e16aafa2dafc73596078409fa40bbb Mon Sep 17 00:00:00 2001 From: Christian Duvivier Date: Tue, 14 Aug 2012 18:19:09 -0700 Subject: [PATCH] First partial snapshot of vectorized 8-tap filtering. About 3.5x faster, 30% overall encoder speedup. Rest of optimizations will come soon (see TODO section in filter_sse4.c). Change-Id: If18108048bfd5345fc942e8574e4c7f58e0e86e0 --- vp8/common/filter.c | 131 ++++++++----- vp8/common/rtcd_defs.sh | 15 ++ vp8/common/x86/filter_sse4.c | 361 +++++++++++++++++++++++++++++++++++ vp8/vp8_common.mk | 5 + 4 files changed, 467 insertions(+), 45 deletions(-) create mode 100644 vp8/common/x86/filter_sse4.c diff --git a/vp8/common/filter.c b/vp8/common/filter.c index 8b96c82a2..0bc88e5dd 100644 --- a/vp8/common/filter.c +++ b/vp8/common/filter.c @@ -12,6 +12,7 @@ #include #include "filter.h" #include "vpx_ports/mem.h" +#include "vpx_rtcd.h" DECLARE_ALIGNED(16, const short, vp8_bilinear_filters[SUBPEL_SHIFTS][2]) = { { 128, 0 }, @@ -511,13 +512,10 @@ static const unsigned int filter_max_width = 16; static void vp8_filter_block2d_8_c ( - const unsigned char *src_ptr, - const unsigned int src_stride, - const short *HFilter, - const short *VFilter, + const unsigned char *src_ptr, const unsigned int src_stride, + const short *HFilter, const short *VFilter, const filter_size_t filter_size, - unsigned char *dst_ptr, - unsigned int dst_stride + unsigned char *dst_ptr, unsigned int dst_stride ) { const unsigned int output_width = filter_size_to_wh[filter_size][0]; const unsigned int output_height = filter_size_to_wh[filter_size][1]; @@ -609,6 +607,50 @@ static void vp8_filter_block2d_8_c } } +void vp8_filter_block2d_4x4_8_c +( + const unsigned char *src_ptr, const unsigned int src_stride, + const short *HFilter_aligned16, const short *VFilter_aligned16, + unsigned char *dst_ptr, unsigned int dst_stride +) { + vp8_filter_block2d_8_c(src_ptr, src_stride, + HFilter_aligned16, VFilter_aligned16, + VPX_FILTER_4x4, dst_ptr, dst_stride); +} + 
+void vp8_filter_block2d_8x4_8_c +( + const unsigned char *src_ptr, const unsigned int src_stride, + const short *HFilter_aligned16, const short *VFilter_aligned16, + unsigned char *dst_ptr, unsigned int dst_stride +) { + vp8_filter_block2d_8_c(src_ptr, src_stride, + HFilter_aligned16, VFilter_aligned16, + VPX_FILTER_8x4, dst_ptr, dst_stride); +} + +void vp8_filter_block2d_8x8_8_c +( + const unsigned char *src_ptr, const unsigned int src_stride, + const short *HFilter_aligned16, const short *VFilter_aligned16, + unsigned char *dst_ptr, unsigned int dst_stride +) { + vp8_filter_block2d_8_c(src_ptr, src_stride, + HFilter_aligned16, VFilter_aligned16, + VPX_FILTER_8x8, dst_ptr, dst_stride); +} + +void vp8_filter_block2d_16x16_8_c +( + const unsigned char *src_ptr, const unsigned int src_stride, + const short *HFilter_aligned16, const short *VFilter_aligned16, + unsigned char *dst_ptr, unsigned int dst_stride +) { + vp8_filter_block2d_8_c(src_ptr, src_stride, + HFilter_aligned16, VFilter_aligned16, + VPX_FILTER_16x16, dst_ptr, dst_stride); +} + static void vp8_block2d_average_c ( unsigned char *src, @@ -629,7 +671,6 @@ static void vp8_block2d_average_c } } -#define vp8_filter_block2d_8 vp8_filter_block2d_8_c #define vp8_block2d_average vp8_block2d_average_c void vp8_eighttap_predict_c @@ -647,9 +688,9 @@ void vp8_eighttap_predict_c HFilter = vp8_sub_pel_filters_8[xoffset]; VFilter = vp8_sub_pel_filters_8[yoffset]; - vp8_filter_block2d_8(src_ptr, src_pixels_per_line, - HFilter, VFilter, VPX_FILTER_4x4, - dst_ptr, dst_pitch); + vp8_filter_block2d_4x4_8(src_ptr, src_pixels_per_line, + HFilter, VFilter, + dst_ptr, dst_pitch); } void vp8_eighttap_predict_avg4x4_c @@ -665,9 +706,9 @@ void vp8_eighttap_predict_avg4x4_c const short *VFilter = vp8_sub_pel_filters_8[yoffset]; unsigned char tmp[4 * 4]; - vp8_filter_block2d_8(src_ptr, src_pixels_per_line, - HFilter, VFilter, VPX_FILTER_4x4, - tmp, 4); + vp8_filter_block2d_4x4_8(src_ptr, src_pixels_per_line, + HFilter, VFilter, + 
tmp, 4); vp8_block2d_average(tmp, 4, dst_ptr, dst_pitch, VPX_FILTER_4x4); } @@ -686,9 +727,9 @@ void vp8_eighttap_predict_sharp_c HFilter = vp8_sub_pel_filters_8s[xoffset]; VFilter = vp8_sub_pel_filters_8s[yoffset]; - vp8_filter_block2d_8(src_ptr, src_pixels_per_line, - HFilter, VFilter, VPX_FILTER_4x4, - dst_ptr, dst_pitch); + vp8_filter_block2d_4x4_8(src_ptr, src_pixels_per_line, + HFilter, VFilter, + dst_ptr, dst_pitch); } void vp8_eighttap_predict_avg4x4_sharp_c @@ -704,9 +745,9 @@ void vp8_eighttap_predict_avg4x4_sharp_c const short *VFilter = vp8_sub_pel_filters_8s[yoffset]; unsigned char tmp[4 * 4]; - vp8_filter_block2d_8(src_ptr, src_pixels_per_line, - HFilter, VFilter, VPX_FILTER_4x4, - tmp, 4); + vp8_filter_block2d_4x4_8(src_ptr, src_pixels_per_line, + HFilter, VFilter, + tmp, 4); vp8_block2d_average(tmp, 4, dst_ptr, dst_pitch, VPX_FILTER_4x4); } @@ -722,9 +763,9 @@ void vp8_eighttap_predict8x8_c const short *HFilter = vp8_sub_pel_filters_8[xoffset]; const short *VFilter = vp8_sub_pel_filters_8[yoffset]; - vp8_filter_block2d_8(src_ptr, src_pixels_per_line, - HFilter, VFilter, VPX_FILTER_8x8, - dst_ptr, dst_pitch); + vp8_filter_block2d_8x8_8(src_ptr, src_pixels_per_line, + HFilter, VFilter, + dst_ptr, dst_pitch); } void vp8_eighttap_predict8x8_sharp_c @@ -739,9 +780,9 @@ void vp8_eighttap_predict8x8_sharp_c const short *HFilter = vp8_sub_pel_filters_8s[xoffset]; const short *VFilter = vp8_sub_pel_filters_8s[yoffset]; - vp8_filter_block2d_8(src_ptr, src_pixels_per_line, - HFilter, VFilter, VPX_FILTER_8x8, - dst_ptr, dst_pitch); + vp8_filter_block2d_8x8_8(src_ptr, src_pixels_per_line, + HFilter, VFilter, + dst_ptr, dst_pitch); } void vp8_eighttap_predict_avg8x8_c @@ -757,9 +798,9 @@ void vp8_eighttap_predict_avg8x8_c const short *HFilter = vp8_sub_pel_filters_8[xoffset]; const short *VFilter = vp8_sub_pel_filters_8[yoffset]; - vp8_filter_block2d_8(src_ptr, src_pixels_per_line, - HFilter, VFilter, VPX_FILTER_8x8, - tmp, 8); + vp8_filter_block2d_8x8_8(src_ptr, 
src_pixels_per_line, + HFilter, VFilter, + tmp, 8); vp8_block2d_average(tmp, 8, dst_ptr, dst_pitch, VPX_FILTER_8x8); } @@ -776,9 +817,9 @@ void vp8_eighttap_predict_avg8x8_sharp_c const short *HFilter = vp8_sub_pel_filters_8s[xoffset]; const short *VFilter = vp8_sub_pel_filters_8s[yoffset]; - vp8_filter_block2d_8(src_ptr, src_pixels_per_line, - HFilter, VFilter, VPX_FILTER_8x8, - tmp, 8); + vp8_filter_block2d_8x8_8(src_ptr, src_pixels_per_line, + HFilter, VFilter, + tmp, 8); vp8_block2d_average(tmp, 8, dst_ptr, dst_pitch, VPX_FILTER_8x8); } @@ -795,9 +836,9 @@ void vp8_eighttap_predict8x4_c const short *HFilter = vp8_sub_pel_filters_8[xoffset]; const short *VFilter = vp8_sub_pel_filters_8[yoffset]; - vp8_filter_block2d_8(src_ptr, src_pixels_per_line, - HFilter, VFilter, VPX_FILTER_8x4, - dst_ptr, dst_pitch); + vp8_filter_block2d_8x4_8(src_ptr, src_pixels_per_line, + HFilter, VFilter, + dst_ptr, dst_pitch); } void vp8_eighttap_predict8x4_sharp_c @@ -812,9 +853,9 @@ void vp8_eighttap_predict8x4_sharp_c const short *HFilter = vp8_sub_pel_filters_8s[xoffset]; const short *VFilter = vp8_sub_pel_filters_8s[yoffset]; - vp8_filter_block2d_8(src_ptr, src_pixels_per_line, - HFilter, VFilter, VPX_FILTER_8x4, - dst_ptr, dst_pitch); + vp8_filter_block2d_8x4_8(src_ptr, src_pixels_per_line, + HFilter, VFilter, + dst_ptr, dst_pitch); } void vp8_eighttap_predict16x16_c @@ -829,8 +870,8 @@ void vp8_eighttap_predict16x16_c const short *HFilter = vp8_sub_pel_filters_8[xoffset]; const short *VFilter = vp8_sub_pel_filters_8[yoffset]; - vp8_filter_block2d_8(src_ptr, src_pixels_per_line, - HFilter, VFilter, VPX_FILTER_16x16, + vp8_filter_block2d_16x16_8(src_ptr, src_pixels_per_line, + HFilter, VFilter, dst_ptr, dst_pitch); } @@ -846,8 +887,8 @@ void vp8_eighttap_predict16x16_sharp_c const short *HFilter = vp8_sub_pel_filters_8s[xoffset]; const short *VFilter = vp8_sub_pel_filters_8s[yoffset]; - vp8_filter_block2d_8(src_ptr, src_pixels_per_line, - HFilter, VFilter, VPX_FILTER_16x16, + 
vp8_filter_block2d_16x16_8(src_ptr, src_pixels_per_line, + HFilter, VFilter, dst_ptr, dst_pitch); } @@ -865,8 +906,8 @@ void vp8_eighttap_predict_avg16x16_c const short *HFilter = vp8_sub_pel_filters_8[xoffset]; const short *VFilter = vp8_sub_pel_filters_8[yoffset]; - vp8_filter_block2d_8(src_ptr, src_pixels_per_line, - HFilter, VFilter, VPX_FILTER_16x16, + vp8_filter_block2d_16x16_8(src_ptr, src_pixels_per_line, + HFilter, VFilter, tmp, 16); vp8_block2d_average(tmp, 16, dst_ptr, dst_pitch, VPX_FILTER_16x16); } @@ -884,8 +925,8 @@ void vp8_eighttap_predict_avg16x16_sharp_c const short *HFilter = vp8_sub_pel_filters_8s[xoffset]; const short *VFilter = vp8_sub_pel_filters_8s[yoffset]; - vp8_filter_block2d_8(src_ptr, src_pixels_per_line, - HFilter, VFilter, VPX_FILTER_16x16, + vp8_filter_block2d_16x16_8(src_ptr, src_pixels_per_line, + HFilter, VFilter, tmp, 16); vp8_block2d_average(tmp, 16, dst_ptr, dst_pitch, VPX_FILTER_16x16); } diff --git a/vp8/common/rtcd_defs.sh b/vp8/common/rtcd_defs.sh index 95148c86f..1cb5de311 100644 --- a/vp8/common/rtcd_defs.sh +++ b/vp8/common/rtcd_defs.sh @@ -4,3 +4,18 @@ struct blockd; EOF } forward_decls common_forward_decls + +prototype void vp8_filter_block2d_4x4_8 "const unsigned char *src_ptr, const unsigned int src_stride, const short *HFilter_aligned16, const short *VFilter_aligned16, unsigned char *dst_ptr, unsigned int dst_stride" +prototype void vp8_filter_block2d_8x4_8 "const unsigned char *src_ptr, const unsigned int src_stride, const short *HFilter_aligned16, const short *VFilter_aligned16, unsigned char *dst_ptr, unsigned int dst_stride" +prototype void vp8_filter_block2d_8x8_8 "const unsigned char *src_ptr, const unsigned int src_stride, const short *HFilter_aligned16, const short *VFilter_aligned16, unsigned char *dst_ptr, unsigned int dst_stride" +prototype void vp8_filter_block2d_16x16_8 "const unsigned char *src_ptr, const unsigned int src_stride, const short *HFilter_aligned16, const short *VFilter_aligned16, unsigned 
char *dst_ptr, unsigned int dst_stride" + +# At the very least, MSVC 2008 has a compiler bug exhibited by this code; code +# compiles warning free but a disassembly of generated code shows bugs. To be +# on the safe side, only enabled when compiled with 'gcc'. +if [ "$CONFIG_GCC" = "yes" ]; then + specialize vp8_filter_block2d_4x4_8 sse4_1 + specialize vp8_filter_block2d_8x4_8 sse4_1 + specialize vp8_filter_block2d_8x8_8 sse4_1 + specialize vp8_filter_block2d_16x16_8 sse4_1 +fi diff --git a/vp8/common/x86/filter_sse4.c b/vp8/common/x86/filter_sse4.c new file mode 100644 index 000000000..40e37d69c --- /dev/null +++ b/vp8/common/x86/filter_sse4.c @@ -0,0 +1,361 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include // for alignment checks +#include // SSE4.1 +#include "vp8/common/filter.h" +#include "vpx_ports/mem.h" // for DECLARE_ALIGNED +#include "vpx_rtcd.h" + +// TODO(cd): After cleanup, commit faster versions for non 4x4 size. This is +// just a quick partial snapshot so that others can already use some +// speedup. +// TODO(cd): Use vectorized 8 tap filtering code as speedup to pure C 6 tap +// filtering. +// TODO(cd): Reduce source size by using macros instead of current code +// duplication. +// TODO(cd): Add some comments, better variable naming. +// TODO(cd): Maybe use _mm_maddubs_epi16 if smaller filter coefficients (no sum +// of positive above 128), or have higher precision filter +// coefficients. 
+// TODO(cd): Remove use of _mm_extract_epi32 and _mm_extract_epi64, to not +// require SSE4.1 +// TODO(cd): Remove use of _mm_shuffle_epi8 to not require SSSE3 + +DECLARE_ALIGNED(16, static const unsigned char, mask0123_c[16]) = { + 0x00, 0x01, + 0x01, 0x02, + 0x02, 0x03, + 0x03, 0x04, + 0x02, 0x03, + 0x03, 0x04, + 0x04, 0x05, + 0x05, 0x06, +}; +DECLARE_ALIGNED(16, static const unsigned char, mask4567_c[16]) = { + 0x04, 0x05, + 0x05, 0x06, + 0x06, 0x07, + 0x07, 0x08, + 0x06, 0x07, + 0x07, 0x08, + 0x08, 0x09, + 0x09, 0x0A, +}; +DECLARE_ALIGNED(16, static const unsigned int, rounding_c[4]) = { + VP8_FILTER_WEIGHT >> 1, + VP8_FILTER_WEIGHT >> 1, + VP8_FILTER_WEIGHT >> 1, + VP8_FILTER_WEIGHT >> 1, +}; + +void vp8_filter_block2d_4x4_8_sse4_1 +( + const unsigned char *src_ptr, const unsigned int src_stride, + const short *HFilter_aligned16, const short *VFilter_aligned16, + unsigned char *dst_ptr, unsigned int dst_stride +) { + const unsigned int output_height_div4 = 1; + + DECLARE_ALIGNED(16, unsigned char, intermediate_buffer[4 * 12]); + const int kInterp_Extend = 4; + const unsigned int output_height = output_height_div4 * 4 - 1 + + 2 * kInterp_Extend; + + const __m128i zero = _mm_set1_epi16(0); + const __m128i mask0123 = _mm_load_si128((const __m128i *)mask0123_c); + const __m128i mask4567 = _mm_load_si128((const __m128i *)mask4567_c); + const __m128i rounding = _mm_load_si128((const __m128i *)rounding_c); + + unsigned int i; + // check alignment + assert(0 == ((long)HFilter_aligned16)%16); + assert(0 == ((long)VFilter_aligned16)%16); + + { + __m128i transpose3_0; + __m128i transpose3_1; + __m128i transpose3_2; + __m128i transpose3_3; + + // Horizontal pass (src -> intermediate). 
+ { + unsigned char *output_ptr = intermediate_buffer; + const __m128i HFilter = _mm_load_si128((const __m128i *)HFilter_aligned16); + // get first two columns filter coefficients + __m128i fil01 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(0, 0, 0, 0)); + __m128i fil23 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(1, 1, 1, 1)); + __m128i fil45 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(2, 2, 2, 2)); + __m128i fil67 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(3, 3, 3, 3)); + src_ptr -= (kInterp_Extend - 1) * src_stride + (kInterp_Extend - 1); + + for (i = 0; i < output_height; i++) { + //load pixels + __m128i src = _mm_loadu_si128((const __m128i *)src_ptr); + // extract the ones used for first column + __m128i src0123 = _mm_shuffle_epi8(src, mask0123); + __m128i src4567 = _mm_shuffle_epi8(src, mask4567); + __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero); + __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero); + __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero); + __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero); + // multiply accumulate them + __m128i mad01 = _mm_madd_epi16(src01_16, fil01); + __m128i mad23 = _mm_madd_epi16(src23_16, fil23); + __m128i mad45 = _mm_madd_epi16(src45_16, fil45); + __m128i mad67 = _mm_madd_epi16(src67_16, fil67); + __m128i mad0123 = _mm_add_epi32(mad01, mad23); + __m128i mad4567 = _mm_add_epi32(mad45, mad67); + __m128i mad_all = _mm_add_epi32(mad0123, mad4567); + mad_all = _mm_add_epi32(mad_all, rounding); + mad_all = _mm_srai_epi32(mad_all, VP8_FILTER_SHIFT); + mad_all = _mm_packs_epi32(mad_all, mad_all); + mad_all = _mm_packus_epi16(mad_all, mad_all); + *((unsigned int *)output_ptr) = _mm_extract_epi32(mad_all, 0); + //TODO(cd): look into Ronald's comment: + // future suggestion: use movd, not pextrd(0). + // + // Alternatively, you could unroll this loop somewhat to handle 2 + // or 4 lines at a time, so that the packs_epi32() and the + // packus_epi16() handle a full register worth of data. 
Then again, + // you might have to specialcase the last line since you have 11 + // lines to handle here, and you don't want to handle a 12th dummy + // line, so overall I'm not sure it's worth it. Use your best + // judgement. :-). + + // next row + src_ptr += src_stride; + output_ptr += 4; + } + } + + // Transpose result (intermediate -> transpose3_x) + { + const __m128i srcA = _mm_load_si128((__m128i *)(&intermediate_buffer[ 0])); + const __m128i srcB = _mm_load_si128((__m128i *)(&intermediate_buffer[16])); + const __m128i srcC = _mm_load_si128((__m128i *)(&intermediate_buffer[32])); + // 00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33 + // 40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73 + // 80 81 82 83 90 91 92 93 A0 A1 A2 A3 xx xx xx xx + const __m128i transpose0_0 = _mm_unpacklo_epi8(srcA, srcB); + const __m128i transpose0_1 = _mm_unpackhi_epi8(srcA, srcB); + const __m128i transpose0_2 = _mm_unpacklo_epi8(srcC, srcC); + const __m128i transpose0_3 = _mm_unpackhi_epi8(srcC, srcC); + // 00 40 01 41 02 42 03 43 10 50 11 51 12 52 13 53 + // 20 60 21 61 22 62 23 63 30 70 31 71 32 72 33 73 + // 80 xx 81 xx 82 xx 83 xx 90 xx 91 xx 92 xx 93 xx + // A0 xx A1 xx A2 xx A3 xx xx xx xx xx xx xx xx xx + const __m128i transpose1_0 = _mm_unpacklo_epi8(transpose0_0, transpose0_1); + const __m128i transpose1_1 = _mm_unpackhi_epi8(transpose0_0, transpose0_1); + const __m128i transpose1_2 = _mm_unpacklo_epi8(transpose0_2, transpose0_3); + const __m128i transpose1_3 = _mm_unpackhi_epi8(transpose0_2, transpose0_3); + // 00 20 40 60 01 21 41 61 02 22 42 62 03 23 43 63 + // 10 30 50 70 11 31 51 71 12 32 52 72 13 33 53 73 + // 80 A0 xx xx 81 A1 xx xx 82 A2 xx xx 83 A3 xx xx + // 90 xx xx xx 91 xx xx xx 92 xx xx xx 93 xx xx xx + const __m128i transpose2_0 = _mm_unpacklo_epi8(transpose1_0, transpose1_1); + const __m128i transpose2_1 = _mm_unpackhi_epi8(transpose1_0, transpose1_1); + const __m128i transpose2_2 = _mm_unpacklo_epi8(transpose1_2, transpose1_3); + const __m128i transpose2_3 
= _mm_unpackhi_epi8(transpose1_2, transpose1_3); + // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 + // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 + // 80 90 A0 xx xx xx xx xx 81 91 A1 xx xx xx xx xx + // 82 92 A2 xx xx xx xx xx 83 93 A3 xx xx xx xx xx + transpose3_0 = _mm_castps_si128( + _mm_shuffle_ps(_mm_castsi128_ps(transpose2_0), + _mm_castsi128_ps(transpose2_2), + _MM_SHUFFLE(1, 0, 1, 0))); + transpose3_1 = _mm_castps_si128( + _mm_shuffle_ps(_mm_castsi128_ps(transpose2_0), + _mm_castsi128_ps(transpose2_2), + _MM_SHUFFLE(3, 2, 3, 2))); + transpose3_2 = _mm_castps_si128( + _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1), + _mm_castsi128_ps(transpose2_3), + _MM_SHUFFLE(1, 0, 1, 0))); + transpose3_3 = _mm_castps_si128( + _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1), + _mm_castsi128_ps(transpose2_3), + _MM_SHUFFLE(3, 2, 3, 2))); + // 00 10 20 30 40 50 60 70 80 90 A0 xx xx xx xx xx + // 01 11 21 31 41 51 61 71 81 91 A1 xx xx xx xx xx + // 02 12 22 32 42 52 62 72 82 92 A2 xx xx xx xx xx + // 03 13 23 33 43 53 63 73 83 93 A3 xx xx xx xx xx + } + + // Vertical pass (transpose3_x -> dst). 
+ { + const __m128i VFilter = _mm_load_si128((const __m128i *)VFilter_aligned16); + // get first two columns filter coefficients + __m128i fil01 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(0, 0, 0, 0)); + __m128i fil23 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(1, 1, 1, 1)); + __m128i fil45 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(2, 2, 2, 2)); + __m128i fil67 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(3, 3, 3, 3)); + __m128i col0, col1, col2, col3; + { + //load pixels + __m128i src = transpose3_0; + // extract the ones used for first column + __m128i src0123 = _mm_shuffle_epi8(src, mask0123); + __m128i src4567 = _mm_shuffle_epi8(src, mask4567); + __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero); + __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero); + __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero); + __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero); + // multiply accumulate them + __m128i mad01 = _mm_madd_epi16(src01_16, fil01); + __m128i mad23 = _mm_madd_epi16(src23_16, fil23); + __m128i mad45 = _mm_madd_epi16(src45_16, fil45); + __m128i mad67 = _mm_madd_epi16(src67_16, fil67); + __m128i mad0123 = _mm_add_epi32(mad01, mad23); + __m128i mad4567 = _mm_add_epi32(mad45, mad67); + __m128i mad_all = _mm_add_epi32(mad0123, mad4567); + mad_all = _mm_add_epi32(mad_all, rounding); + mad_all = _mm_srai_epi32(mad_all, VP8_FILTER_SHIFT); + mad_all = _mm_packs_epi32(mad_all, mad_all); + col0 = _mm_packus_epi16(mad_all, mad_all); + } + { + //load pixels + __m128i src = transpose3_1; + // extract the ones used for first column + __m128i src0123 = _mm_shuffle_epi8(src, mask0123); + __m128i src4567 = _mm_shuffle_epi8(src, mask4567); + __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero); + __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero); + __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero); + __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero); + // multiply accumulate them + __m128i mad01 = _mm_madd_epi16(src01_16, fil01); + __m128i mad23 = _mm_madd_epi16(src23_16, 
fil23); + __m128i mad45 = _mm_madd_epi16(src45_16, fil45); + __m128i mad67 = _mm_madd_epi16(src67_16, fil67); + __m128i mad0123 = _mm_add_epi32(mad01, mad23); + __m128i mad4567 = _mm_add_epi32(mad45, mad67); + __m128i mad_all = _mm_add_epi32(mad0123, mad4567); + mad_all = _mm_add_epi32(mad_all, rounding); + mad_all = _mm_srai_epi32(mad_all, VP8_FILTER_SHIFT); + mad_all = _mm_packs_epi32(mad_all, mad_all); + col1 = _mm_packus_epi16(mad_all, mad_all); + } + { + //load pixels + __m128i src = transpose3_2; + // extract the ones used for first column + __m128i src0123 = _mm_shuffle_epi8(src, mask0123); + __m128i src4567 = _mm_shuffle_epi8(src, mask4567); + __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero); + __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero); + __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero); + __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero); + // multiply accumulate them + __m128i mad01 = _mm_madd_epi16(src01_16, fil01); + __m128i mad23 = _mm_madd_epi16(src23_16, fil23); + __m128i mad45 = _mm_madd_epi16(src45_16, fil45); + __m128i mad67 = _mm_madd_epi16(src67_16, fil67); + __m128i mad0123 = _mm_add_epi32(mad01, mad23); + __m128i mad4567 = _mm_add_epi32(mad45, mad67); + __m128i mad_all = _mm_add_epi32(mad0123, mad4567); + mad_all = _mm_add_epi32(mad_all, rounding); + mad_all = _mm_srai_epi32(mad_all, VP8_FILTER_SHIFT); + mad_all = _mm_packs_epi32(mad_all, mad_all); + col2 = _mm_packus_epi16(mad_all, mad_all); + } + { + //load pixels + __m128i src = transpose3_3; + // extract the ones used for first column + __m128i src0123 = _mm_shuffle_epi8(src, mask0123); + __m128i src4567 = _mm_shuffle_epi8(src, mask4567); + __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero); + __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero); + __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero); + __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero); + // multiply accumulate them + __m128i mad01 = _mm_madd_epi16(src01_16, fil01); + __m128i mad23 = 
_mm_madd_epi16(src23_16, fil23); + __m128i mad45 = _mm_madd_epi16(src45_16, fil45); + __m128i mad67 = _mm_madd_epi16(src67_16, fil67); + __m128i mad0123 = _mm_add_epi32(mad01, mad23); + __m128i mad4567 = _mm_add_epi32(mad45, mad67); + __m128i mad_all = _mm_add_epi32(mad0123, mad4567); + mad_all = _mm_add_epi32(mad_all, rounding); + mad_all = _mm_srai_epi32(mad_all, VP8_FILTER_SHIFT); + mad_all = _mm_packs_epi32(mad_all, mad_all); + col3 = _mm_packus_epi16(mad_all, mad_all); + } + { + __m128i col01 = _mm_unpacklo_epi8(col0, col1); + __m128i col23 = _mm_unpacklo_epi8(col2, col3); + __m128i col0123 = _mm_unpacklo_epi16(col01, col23); + //TODO(cd): look into Ronald's comment: + // Future suggestion: I believe here, too, you can merge the + // packs_epi32() and packus_epi16() for the 4 cols above, so that + // you get the data in a single register, and then use pshufb + // (shuffle_epi8()) instead of the unpacks here. Should be + // 2+3+2 instructions faster. + *((unsigned int *)&dst_ptr[dst_stride * 0]) = + _mm_extract_epi32(col0123, 0); + *((unsigned int *)&dst_ptr[dst_stride * 1]) = + _mm_extract_epi32(col0123, 1); + *((unsigned int *)&dst_ptr[dst_stride * 2]) = + _mm_extract_epi32(col0123, 2); + *((unsigned int *)&dst_ptr[dst_stride * 3]) = + _mm_extract_epi32(col0123, 3); + } + } + } +} + +void vp8_filter_block2d_8x4_8_sse4_1 +( + const unsigned char *src_ptr, const unsigned int src_stride, + const short *HFilter_aligned16, const short *VFilter_aligned16, + unsigned char *dst_ptr, unsigned int dst_stride +) { + int j; + for (j=0; j<8; j+=4) { + vp8_filter_block2d_4x4_8_sse4_1(src_ptr + j, src_stride, + HFilter_aligned16, VFilter_aligned16, + dst_ptr + j, dst_stride); + } +} + +void vp8_filter_block2d_8x8_8_sse4_1 +( + const unsigned char *src_ptr, const unsigned int src_stride, + const short *HFilter_aligned16, const short *VFilter_aligned16, + unsigned char *dst_ptr, unsigned int dst_stride +) { + int i, j; + for (i=0; i<8; i+=4) { + for (j=0; j<8; j+=4) { + 
vp8_filter_block2d_4x4_8_sse4_1(src_ptr + j + i*src_stride, src_stride, + HFilter_aligned16, VFilter_aligned16, + dst_ptr + j + i*dst_stride, dst_stride); + } + } +} + +void vp8_filter_block2d_16x16_8_sse4_1 +( + const unsigned char *src_ptr, const unsigned int src_stride, + const short *HFilter_aligned16, const short *VFilter_aligned16, + unsigned char *dst_ptr, unsigned int dst_stride +) { + int i, j; + for (i=0; i<16; i+=4) { + for (j=0; j<16; j+=4) { + vp8_filter_block2d_4x4_8_sse4_1(src_ptr + j + i*src_stride, src_stride, + HFilter_aligned16, VFilter_aligned16, + dst_ptr + j + i*dst_stride, dst_stride); + } + } +} diff --git a/vp8/vp8_common.mk b/vp8/vp8_common.mk index 8c9addc78..3595a851a 100644 --- a/vp8/vp8_common.mk +++ b/vp8/vp8_common.mk @@ -114,6 +114,11 @@ VP8_COMMON_SRCS-yes += common/maskingmv.c VP8_COMMON_SRCS-$(HAVE_SSE3) += common/x86/mask_sse3.asm endif +VP8_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/filter_sse4.c +ifeq ($(HAVE_SSE4_1),yes) +vp8/common/x86/filter_sse4.c.o: CFLAGS += -msse4 +endif + VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/arm_systemdependent.c VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/bilinearfilter_arm.c VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/bilinearfilter_arm.h -- 2.40.0