From 5caec339bea5e37ae8707f9d783feb2aa51cd111 Mon Sep 17 00:00:00 2001 From: Johann Date: Thu, 25 Oct 2018 12:23:03 -0700 Subject: [PATCH] vp8 bilinear: rewrite 4x4 ~20% faster than the MMX. Removes the last usage of vp8_bilinear_filters_x86_[48]. Change-Id: Iee976fab9655d0020440f26c4403ce50103af913 --- test/predict_test.cc | 8 +- vp8/common/rtcd_defs.pl | 2 +- vp8/common/x86/bilinear_filter_sse2.c | 93 ++++++++++++++++++++ vp8/common/x86/filter_x86.c | 29 ------ vp8/common/x86/filter_x86.h | 33 ------- vp8/common/x86/subpixel_mmx.asm | 121 -------------------------- vp8/common/x86/vp8_asm_stubs.c | 1 - vp8/vp8_common.mk | 2 - vpx_dsp/x86/mem_sse2.h | 11 +++ 9 files changed, 107 insertions(+), 193 deletions(-) delete mode 100644 vp8/common/x86/filter_x86.c delete mode 100644 vp8/common/x86/filter_x86.h diff --git a/test/predict_test.cc b/test/predict_test.cc index a8fcfc0b1..1c2f2d11d 100644 --- a/test/predict_test.cc +++ b/test/predict_test.cc @@ -379,17 +379,13 @@ INSTANTIATE_TEST_CASE_P( make_tuple(8, 4, &vp8_bilinear_predict8x4_neon), make_tuple(4, 4, &vp8_bilinear_predict4x4_neon))); #endif -#if HAVE_MMX -INSTANTIATE_TEST_CASE_P( - MMX, BilinearPredictTest, - ::testing::Values(make_tuple(4, 4, &vp8_bilinear_predict4x4_mmx))); -#endif #if HAVE_SSE2 INSTANTIATE_TEST_CASE_P( SSE2, BilinearPredictTest, ::testing::Values(make_tuple(16, 16, &vp8_bilinear_predict16x16_sse2), make_tuple(8, 8, &vp8_bilinear_predict8x8_sse2), - make_tuple(8, 4, &vp8_bilinear_predict8x4_sse2))); + make_tuple(8, 4, &vp8_bilinear_predict8x4_sse2), + make_tuple(4, 4, &vp8_bilinear_predict4x4_sse2))); #endif #if HAVE_SSSE3 INSTANTIATE_TEST_CASE_P( diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl index f67025767..3ab89a338 100644 --- a/vp8/common/rtcd_defs.pl +++ b/vp8/common/rtcd_defs.pl @@ -167,7 +167,7 @@ add_proto qw/void vp8_bilinear_predict8x4/, "unsigned char *src, int src_pitch, specialize qw/vp8_bilinear_predict8x4 sse2 neon msa/; add_proto qw/void 
vp8_bilinear_predict4x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"; -specialize qw/vp8_bilinear_predict4x4 mmx neon msa/; +specialize qw/vp8_bilinear_predict4x4 sse2 neon msa/; # # Encoder functions below this point. diff --git a/vp8/common/x86/bilinear_filter_sse2.c b/vp8/common/x86/bilinear_filter_sse2.c index 224c1b32a..14e10eca4 100644 --- a/vp8/common/x86/bilinear_filter_sse2.c +++ b/vp8/common/x86/bilinear_filter_sse2.c @@ -14,6 +14,7 @@ #include "./vp8_rtcd.h" #include "./vpx_config.h" #include "vp8/common/filter.h" +#include "vpx_dsp/x86/mem_sse2.h" #include "vpx_ports/mem.h" static INLINE void horizontal_16x16(uint8_t *src, const int stride, @@ -241,3 +242,95 @@ void vp8_bilinear_predict8x4_sse2(uint8_t *src_ptr, int src_pixels_per_line, vertical_8xN(FData, dst_ptr, dst_pitch, yoffset, 4); } + +static INLINE void horizontal_4x4(uint8_t *src, const int stride, uint16_t *dst, + const int xoffset) { /* first pass: writes 5 rows of 4 uint16_t to dst */ + int h; + const __m128i zero = _mm_setzero_si128(); + + if (xoffset == 0) { /* no horizontal blend: only widen 4 bytes per row */ + for (h = 0; h < 5; ++h) { + const __m128i a = load_unaligned_u32(src); + const __m128i a_u16 = _mm_unpacklo_epi8(a, zero); + _mm_storel_epi64((__m128i *)dst, a_u16); + src += stride; + dst += 4; + } + return; + } + + { + const __m128i round_factor = _mm_set1_epi16(1 << (VP8_FILTER_SHIFT - 1)); /* rounding term */ + const __m128i hfilter_0 = _mm_set1_epi16(vp8_bilinear_filters[xoffset][0]); + const __m128i hfilter_1 = _mm_set1_epi16(vp8_bilinear_filters[xoffset][1]); + + for (h = 0; h < 5; ++h) { + const __m128i a = load_unaligned_u32(src); + const __m128i b = load_unaligned_u32(src + 1); /* pixels one to the right */ + const __m128i a_u16 = _mm_unpacklo_epi8(a, zero); + const __m128i b_u16 = _mm_unpacklo_epi8(b, zero); + const __m128i a_filtered = _mm_mullo_epi16(a_u16, hfilter_0); + const __m128i b_filtered = _mm_mullo_epi16(b_u16, hfilter_1); + const __m128i sum = _mm_add_epi16(a_filtered, b_filtered); + const __m128i compensated = _mm_add_epi16(sum, round_factor); + const __m128i 
shifted = _mm_srai_epi16(compensated, VP8_FILTER_SHIFT); + _mm_storel_epi64((__m128i *)dst, shifted); + src += stride; + dst += 4; + } + } +} + +static INLINE void vertical_4x4(uint16_t *src, uint8_t *dst, const int stride, + const int yoffset) { /* second pass: src must be 16-byte aligned */ + int h; + + if (yoffset == 0) { + for (h = 0; h < 4; h += 2) { + const __m128i row = _mm_load_si128((__m128i *)src); /* two 4-wide rows */ + __m128i packed = _mm_packus_epi16(row, row); + store_unaligned_u32(dst, packed); + dst += stride; + packed = _mm_srli_si128(packed, 4); /* second row into the low dword */ + store_unaligned_u32(dst, packed); + dst += stride; + src += 8; + } + return; + } + + { + const __m128i round_factor = _mm_set1_epi16(1 << (VP8_FILTER_SHIFT - 1)); /* rounding term */ + const __m128i vfilter_0 = _mm_set1_epi16(vp8_bilinear_filters[yoffset][0]); + const __m128i vfilter_1 = _mm_set1_epi16(vp8_bilinear_filters[yoffset][1]); + + for (h = 0; h < 4; h += 2) { + const __m128i row_0 = _mm_load_si128((__m128i *)src); + const __m128i row_1 = _mm_loadu_si128((__m128i *)(src + 4)); /* the rows one below row_0's */ + const __m128i row_0_filtered = _mm_mullo_epi16(row_0, vfilter_0); + const __m128i row_1_filtered = _mm_mullo_epi16(row_1, vfilter_1); + const __m128i sum = _mm_add_epi16(row_0_filtered, row_1_filtered); + const __m128i compensated = _mm_add_epi16(sum, round_factor); + const __m128i shifted = _mm_srai_epi16(compensated, VP8_FILTER_SHIFT); + __m128i packed = _mm_packus_epi16(shifted, shifted); + store_unaligned_u32(dst, packed); + packed = _mm_srli_si128(packed, 4); /* second row into the low dword */ + dst += stride; + store_unaligned_u32(dst, packed); + dst += stride; + src += 8; + } + } +} + +void vp8_bilinear_predict4x4_sse2(uint8_t *src_ptr, int src_pixels_per_line, + int xoffset, int yoffset, uint8_t *dst_ptr, + int dst_pitch) { + DECLARE_ALIGNED(16, uint16_t, FData[4 * 5]); /* aligned for _mm_load_si128() in vertical_4x4() */ + + assert((xoffset | yoffset) != 0); /* at least one offset must be non-zero */ + + horizontal_4x4(src_ptr, src_pixels_per_line, FData, xoffset); + + vertical_4x4(FData, dst_ptr, dst_pitch, yoffset); +} diff --git a/vp8/common/x86/filter_x86.c b/vp8/common/x86/filter_x86.c deleted file mode 100644 index 
2405342f0..000000000 --- a/vp8/common/x86/filter_x86.c +++ /dev/null @@ -1,29 +0,0 @@ -/* - * Copyright (c) 2011 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "vp8/common/x86/filter_x86.h" - -DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_4[8][8]) = { - { 128, 128, 128, 128, 0, 0, 0, 0 }, { 112, 112, 112, 112, 16, 16, 16, 16 }, - { 96, 96, 96, 96, 32, 32, 32, 32 }, { 80, 80, 80, 80, 48, 48, 48, 48 }, - { 64, 64, 64, 64, 64, 64, 64, 64 }, { 48, 48, 48, 48, 80, 80, 80, 80 }, - { 32, 32, 32, 32, 96, 96, 96, 96 }, { 16, 16, 16, 16, 112, 112, 112, 112 } -}; - -DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_8[8][16]) = { - { 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0 }, - { 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16 }, - { 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32 }, - { 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48 }, - { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, - { 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80 }, - { 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96 }, - { 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112 } -}; diff --git a/vp8/common/x86/filter_x86.h b/vp8/common/x86/filter_x86.h deleted file mode 100644 index 570ff8666..000000000 --- a/vp8/common/x86/filter_x86.h +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Copyright (c) 2011 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. 
An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef VPX_VP8_COMMON_X86_FILTER_X86_H_ -#define VPX_VP8_COMMON_X86_FILTER_X86_H_ - -#include "vpx_ports/mem.h" - -#ifdef __cplusplus -extern "C" { -#endif - -/* x86 assembly specific copy of vp8/common/filter.c:vp8_bilinear_filters with - * duplicated values */ - -/* duplicated 4x */ -extern DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_4[8][8]); - -/* duplicated 8x */ -extern DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_8[8][16]); - -#ifdef __cplusplus -} // extern "C" -#endif - -#endif // VPX_VP8_COMMON_X86_FILTER_X86_H_ diff --git a/vp8/common/x86/subpixel_mmx.asm b/vp8/common/x86/subpixel_mmx.asm index 05320d58d..67bcd0cbd 100644 --- a/vp8/common/x86/subpixel_mmx.asm +++ b/vp8/common/x86/subpixel_mmx.asm @@ -10,8 +10,6 @@ %include "vpx_ports/x86_abi_support.asm" -extern sym(vp8_bilinear_filters_x86_8) - %define BLOCK_HEIGHT_WIDTH 4 %define vp8_filter_weight 128 @@ -205,125 +203,6 @@ sym(vp8_filter_block1dc_v6_mmx): ret -;void bilinear_predict4x4_mmx -;( -; unsigned char *src_ptr, -; int src_pixels_per_line, -; int xoffset, -; int yoffset, -; unsigned char *dst_ptr, -; int dst_pitch -;) -global sym(vp8_bilinear_predict4x4_mmx) PRIVATE -sym(vp8_bilinear_predict4x4_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset]; - ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset]; - - movsxd rax, dword ptr arg(2) ;xoffset - mov rdi, arg(4) ;dst_ptr ; - - lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))] - shl rax, 5 - - add rax, rcx ; HFilter - mov rsi, arg(0) ;src_ptr ; - - movsxd rdx, dword ptr arg(5) ;ldst_pitch - movq mm1, [rax] ; - - movq mm2, [rax+16] ; - movsxd rax, dword ptr arg(3) ;yoffset - - pxor mm0, mm0 ; - shl 
rax, 5 - - add rax, rcx - lea rcx, [rdi+rdx*4] ; - - movsxd rdx, dword ptr arg(1) ;src_pixels_per_line ; - - ; get the first horizontal line done ; - movd mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 - punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06 - - pmullw mm3, mm1 ; - movd mm5, [rsi+1] ; - - punpcklbw mm5, mm0 ; - pmullw mm5, mm2 ; - - paddw mm3, mm5 ; - paddw mm3, [GLOBAL(rd)] ; xmm3 += round value - - psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128 - - movq mm7, mm3 ; - packuswb mm7, mm0 ; - - add rsi, rdx ; next line -.next_row_4x4: - movd mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 - punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06 - - pmullw mm3, mm1 ; - movd mm5, [rsi+1] ; - - punpcklbw mm5, mm0 ; - pmullw mm5, mm2 ; - - paddw mm3, mm5 ; - - movq mm5, mm7 ; - punpcklbw mm5, mm0 ; - - pmullw mm5, [rax] ; - paddw mm3, [GLOBAL(rd)] ; xmm3 += round value - - psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128 - movq mm7, mm3 ; - - packuswb mm7, mm0 ; - - pmullw mm3, [rax+16] ; - paddw mm3, mm5 ; - - - paddw mm3, [GLOBAL(rd)] ; xmm3 += round value - psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128 - - packuswb mm3, mm0 - movd [rdi], mm3 ; store the results in the destination - -%if ABI_IS_32BIT - add rsi, rdx ; next line - add rdi, dword ptr arg(5) ;dst_pitch ; -%else - movsxd r8, dword ptr arg(5) ;dst_pitch ; - add rsi, rdx ; next line - add rdi, r8 -%endif - - cmp rdi, rcx ; - jne .next_row_4x4 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - - SECTION_RODATA align 16 rd: diff --git a/vp8/common/x86/vp8_asm_stubs.c b/vp8/common/x86/vp8_asm_stubs.c index de836f19d..7fb83c2d5 100644 --- a/vp8/common/x86/vp8_asm_stubs.c +++ b/vp8/common/x86/vp8_asm_stubs.c @@ -11,7 +11,6 @@ #include "vpx_config.h" #include "vp8_rtcd.h" #include "vpx_ports/mem.h" -#include "filter_x86.h" extern const short vp8_six_tap_x86[8][6 * 8]; diff --git a/vp8/vp8_common.mk b/vp8/vp8_common.mk index d2d5712a5..9f106a2c3 100644 --- 
a/vp8/vp8_common.mk +++ b/vp8/vp8_common.mk @@ -70,8 +70,6 @@ VP8_COMMON_SRCS-yes += common/vp8_entropymodedata.h VP8_COMMON_SRCS-yes += common/treecoder.c -VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/filter_x86.c -VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/filter_x86.h VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp8_asm_stubs.c VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/loopfilter_x86.c VP8_COMMON_SRCS-$(CONFIG_POSTPROC) += common/mfqe.c diff --git a/vpx_dsp/x86/mem_sse2.h b/vpx_dsp/x86/mem_sse2.h index 5209a0628..258ab38e6 100644 --- a/vpx_dsp/x86/mem_sse2.h +++ b/vpx_dsp/x86/mem_sse2.h @@ -26,6 +26,17 @@ static INLINE uint32_t loadu_uint32(const void *src) { return v; } +static INLINE __m128i load_unaligned_u32(const void *a) { /* reads 4 bytes at a (no alignment needed) into the low 32 bits of the result */ + uint32_t val; + memcpy(&val, a, sizeof(val)); + return _mm_cvtsi32_si128(val); +} + +static INLINE void store_unaligned_u32(void *const a, const __m128i v) { /* writes the low 32 bits of v to a (no alignment needed) */ + const uint32_t val = _mm_cvtsi128_si32(v); + memcpy(a, &val, sizeof(val)); +} + #define mm_storelu(dst, v) memcpy((dst), (const char *)&(v), 8) #define mm_storehu(dst, v) memcpy((dst), (const char *)&(v) + 8, 8) -- 2.40.0