From: Yunqing Wang Date: Wed, 29 Jan 2014 23:49:03 +0000 (-0800) Subject: Optimize bilinear sub-pixel filters in sse2 X-Git-Tag: v1.4.0~2521^2 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=2488cb34bc894cb5f5e0d5afaa85605c60e2f5ab;p=libvpx Optimize bilinear sub-pixel filters in sse2 Using bilinear filters could speed up the codec in real-time mode. This patch added sse2 optimizations of bilinear filters that operate on different-sized blocks. Tests showed that the real-time encoder was speeded up by 3%. Change-Id: If99a7ee4385fcc225c3ee7445d962d5752e57c3f --- diff --git a/vp9/common/x86/vp9_asm_stubs.c b/vp9/common/x86/vp9_asm_stubs.c index 8a2297feb..8ab5fb1bc 100644 --- a/vp9/common/x86/vp9_asm_stubs.c +++ b/vp9/common/x86/vp9_asm_stubs.c @@ -23,36 +23,63 @@ typedef void filter8_1dfunction ( const short *filter ); -#define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \ -void vp9_convolve8_##name##_##opt(const uint8_t *src, ptrdiff_t src_stride, \ - uint8_t *dst, ptrdiff_t dst_stride, \ - const int16_t *filter_x, int x_step_q4, \ - const int16_t *filter_y, int y_step_q4, \ - int w, int h) { \ +#define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt1, opt2) \ +void vp9_convolve8_##name##_##opt1(const uint8_t *src, ptrdiff_t src_stride, \ + uint8_t *dst, ptrdiff_t dst_stride, \ + const int16_t *filter_x, int x_step_q4, \ + const int16_t *filter_y, int y_step_q4, \ + int w, int h) { \ if (step_q4 == 16 && filter[3] != 128) { \ - while (w >= 16) { \ - vp9_filter_block1d16_##dir##8_##avg##opt(src_start, src_stride, \ - dst, dst_stride, \ - h, filter); \ - src += 16; \ - dst += 16; \ - w -= 16; \ - } \ - while (w >= 8) { \ - vp9_filter_block1d8_##dir##8_##avg##opt(src_start, src_stride, \ - dst, dst_stride, \ - h, filter); \ - src += 8; \ - dst += 8; \ - w -= 8; \ - } \ - while (w >= 4) { \ - vp9_filter_block1d4_##dir##8_##avg##opt(src_start, src_stride, \ - dst, dst_stride, \ - h, filter); \ - src += 4; \ - dst += 4; \ - w -= 4; \ + if (filter[0] || filter[1] || filter[2]) { \ + while (w >= 16) { \ + vp9_filter_block1d16_##dir##8_##avg##opt1(src_start, src_stride, \ + dst, dst_stride, \ + h, filter); \ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + while (w >= 8) { \ + vp9_filter_block1d8_##dir##8_##avg##opt1(src_start, src_stride, \ + dst, dst_stride, \ + h, filter); \ + src += 8; \ + dst += 8; \ + w -= 8; \ + } \ + while (w >= 4) { \ + vp9_filter_block1d4_##dir##8_##avg##opt1(src_start, src_stride, \ + dst, dst_stride, \ + h, filter); \ + src += 4; \ + dst += 4; \ + w -= 4; \ + } \ + } else { \ + while (w >= 16) { \ + vp9_filter_block1d16_##dir##2_##avg##opt2(src, src_stride, \ + dst, dst_stride, \ + h, filter); \ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + while (w >= 8) { \ + vp9_filter_block1d8_##dir##2_##avg##opt2(src, src_stride, \ + dst, dst_stride, \ + h, filter); \ + src += 8; \ + dst += 8; \ + w -= 8; \ + } \ + while (w >= 4) { \ + vp9_filter_block1d4_##dir##2_##avg##opt2(src, src_stride, \ + dst, dst_stride, \ + h, filter); \ + src += 4; \ + dst += 4; \ + w -= 4; \ + } \ } \ } \ if (w) { \ @@ -68,17 +95,27 @@ void vp9_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \ const int16_t *filter_x, int x_step_q4, \ const int16_t *filter_y, int y_step_q4, \ int w, int h) { \ - DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71); \ - \ assert(w <= 64); \ assert(h <= 64); \ if (x_step_q4 == 16 && y_step_q4 == 16) { \ - vp9_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64, \ - filter_x, 
x_step_q4, filter_y, y_step_q4, \ - w, h + 7); \ - vp9_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride, \ - filter_x, x_step_q4, filter_y, y_step_q4, \ - w, h); \ + if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 || \ + filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) { \ + DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71); \ + vp9_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64, \ + filter_x, x_step_q4, filter_y, y_step_q4, \ + w, h + 7); \ + vp9_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride, \ + filter_x, x_step_q4, filter_y, \ + y_step_q4, w, h); \ + } else { \ + DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 65); \ + vp9_convolve8_horiz_##opt(src, src_stride, fdata2, 64, \ + filter_x, x_step_q4, filter_y, y_step_q4, \ + w, h + 1); \ + vp9_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, \ + filter_x, x_step_q4, filter_y, \ + y_step_q4, w, h); \ + } \ } else { \ vp9_convolve8_##avg##c(src, src_stride, dst, dst_stride, \ filter_x, x_step_q4, filter_y, y_step_q4, w, h); \ @@ -99,6 +136,19 @@ filter8_1dfunction vp9_filter_block1d8_h8_avg_ssse3; filter8_1dfunction vp9_filter_block1d4_v8_avg_ssse3; filter8_1dfunction vp9_filter_block1d4_h8_avg_ssse3; +filter8_1dfunction vp9_filter_block1d16_v2_sse2; +filter8_1dfunction vp9_filter_block1d16_h2_sse2; +filter8_1dfunction vp9_filter_block1d8_v2_sse2; +filter8_1dfunction vp9_filter_block1d8_h2_sse2; +filter8_1dfunction vp9_filter_block1d4_v2_sse2; +filter8_1dfunction vp9_filter_block1d4_h2_sse2; +filter8_1dfunction vp9_filter_block1d16_v2_avg_sse2; +filter8_1dfunction vp9_filter_block1d16_h2_avg_sse2; +filter8_1dfunction vp9_filter_block1d8_v2_avg_sse2; +filter8_1dfunction vp9_filter_block1d8_h2_avg_sse2; +filter8_1dfunction vp9_filter_block1d4_v2_avg_sse2; +filter8_1dfunction vp9_filter_block1d4_h2_avg_sse2; + // void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, // const int16_t *filter_x, int x_step_q4, @@ -119,11 +169,11 @@ filter8_1dfunction vp9_filter_block1d4_h8_avg_ssse3; // const int16_t *filter_x, int x_step_q4, // const int16_t *filter_y, int y_step_q4, // int w, int h); -FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3); -FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3); -FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3); +FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3, sse2); +FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3, sse2); +FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3, sse2); FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, - ssse3); + ssse3, sse2); // void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, @@ -153,6 +203,19 @@ filter8_1dfunction vp9_filter_block1d8_h8_avg_sse2; filter8_1dfunction vp9_filter_block1d4_v8_avg_sse2; filter8_1dfunction vp9_filter_block1d4_h8_avg_sse2; +filter8_1dfunction vp9_filter_block1d16_v2_sse2; +filter8_1dfunction vp9_filter_block1d16_h2_sse2; +filter8_1dfunction vp9_filter_block1d8_v2_sse2; +filter8_1dfunction vp9_filter_block1d8_h2_sse2; +filter8_1dfunction vp9_filter_block1d4_v2_sse2; +filter8_1dfunction vp9_filter_block1d4_h2_sse2; +filter8_1dfunction vp9_filter_block1d16_v2_avg_sse2; +filter8_1dfunction vp9_filter_block1d16_h2_avg_sse2; +filter8_1dfunction vp9_filter_block1d8_v2_avg_sse2; +filter8_1dfunction vp9_filter_block1d8_h2_avg_sse2; 
+filter8_1dfunction vp9_filter_block1d4_v2_avg_sse2; +filter8_1dfunction vp9_filter_block1d4_h2_avg_sse2; + // void vp9_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, // const int16_t *filter_x, int x_step_q4, @@ -173,10 +236,11 @@ filter8_1dfunction vp9_filter_block1d4_h8_avg_sse2; // const int16_t *filter_x, int x_step_q4, // const int16_t *filter_y, int y_step_q4, // int w, int h); -FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2); -FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2); -FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2); -FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, sse2); +FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2, sse2); +FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2, sse2); +FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2, sse2); +FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, sse2, + sse2); // void vp9_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, diff --git a/vp9/common/x86/vp9_subpixel_bilinear_sse2.asm b/vp9/common/x86/vp9_subpixel_bilinear_sse2.asm new file mode 100644 index 000000000..d94ccf2e9 --- /dev/null +++ b/vp9/common/x86/vp9_subpixel_bilinear_sse2.asm @@ -0,0 +1,448 @@ +; +; Copyright (c) 2014 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + +%include "vpx_ports/x86_abi_support.asm" + +%macro GET_PARAM_4 0 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x0400040 + + movdqa xmm3, [rdx] ;load filters + pshuflw xmm4, xmm3, 11111111b ;k3 + psrldq xmm3, 8 + pshuflw xmm3, xmm3, 0b ;k4 + punpcklqdq xmm4, xmm3 ;k3k4 + + movq xmm3, rcx ;rounding + pshufd xmm3, xmm3, 0 + + pxor xmm2, xmm2 + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height +%endm + +%macro APPLY_FILTER_4 1 + + punpckldq xmm0, xmm1 ;two row in one register + punpcklbw xmm0, xmm2 ;unpack to word + pmullw xmm0, xmm4 ;multiply the filter factors + + movdqa xmm1, xmm0 + psrldq xmm1, 8 + paddsw xmm0, xmm1 + + paddsw xmm0, xmm3 ;rounding + psraw xmm0, 7 ;shift + packuswb xmm0, xmm0 ;pack to byte + +%if %1 + movd xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + + movd [rdi], xmm0 + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx +%endm + +%macro GET_PARAM 0 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x0400040 + + movdqa xmm7, [rdx] ;load filters + + pshuflw xmm6, xmm7, 11111111b ;k3 + pshufhw xmm7, xmm7, 0b ;k4 + punpcklwd xmm6, xmm6 + punpckhwd xmm7, xmm7 + + movq xmm4, rcx ;rounding + pshufd xmm4, xmm4, 0 + + pxor xmm5, xmm5 + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height +%endm + +%macro APPLY_FILTER_8 1 + punpcklbw xmm0, xmm5 + punpcklbw xmm1, xmm5 + + pmullw xmm0, xmm6 + pmullw xmm1, xmm7 + paddsw xmm0, xmm1 + paddsw xmm0, xmm4 ;rounding + psraw xmm0, 7 ;shift + packuswb xmm0, xmm0 ;pack back to byte +%if %1 + movq xmm1, [rdi] + pavgb xmm0, xmm1 
+%endif + movq [rdi], xmm0 ;store the result + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx +%endm + +%macro APPLY_FILTER_16 1 + punpcklbw xmm0, xmm5 + punpcklbw xmm1, xmm5 + punpckhbw xmm2, xmm5 + punpckhbw xmm3, xmm5 + + pmullw xmm0, xmm6 + pmullw xmm1, xmm7 + pmullw xmm2, xmm6 + pmullw xmm3, xmm7 + + paddsw xmm0, xmm1 + paddsw xmm2, xmm3 + + paddsw xmm0, xmm4 ;rounding + paddsw xmm2, xmm4 + psraw xmm0, 7 ;shift + psraw xmm2, 7 + packuswb xmm0, xmm2 ;pack back to byte +%if %1 + movdqu xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + movdqu [rdi], xmm0 ;store the result + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx +%endm + +global sym(vp9_filter_block1d4_v2_sse2) PRIVATE +sym(vp9_filter_block1d4_v2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + + GET_PARAM_4 +.loop: + movd xmm0, [rsi] ;load src + movd xmm1, [rsi + rax] + + APPLY_FILTER_4 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d8_v2_sse2) PRIVATE +sym(vp9_filter_block1d8_v2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movq xmm0, [rsi] ;0 + movq xmm1, [rsi + rax] ;1 + + APPLY_FILTER_8 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d16_v2_sse2) PRIVATE +sym(vp9_filter_block1d16_v2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;0 + movdqu xmm1, [rsi + rax] ;1 + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + + APPLY_FILTER_16 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d4_v2_avg_sse2) PRIVATE +sym(vp9_filter_block1d4_v2_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + + GET_PARAM_4 +.loop: + movd xmm0, [rsi] ;load src + movd xmm1, [rsi + rax] + + APPLY_FILTER_4 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d8_v2_avg_sse2) PRIVATE +sym(vp9_filter_block1d8_v2_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movq xmm0, [rsi] ;0 + movq xmm1, [rsi + rax] ;1 + + APPLY_FILTER_8 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d16_v2_avg_sse2) PRIVATE +sym(vp9_filter_block1d16_v2_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;0 + movdqu xmm1, [rsi + rax] ;1 + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + + APPLY_FILTER_16 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d4_h2_sse2) PRIVATE +sym(vp9_filter_block1d4_h2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + + GET_PARAM_4 +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 1 + + APPLY_FILTER_4 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d8_h2_sse2) PRIVATE +sym(vp9_filter_block1d8_h2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM 
+.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 1 + + APPLY_FILTER_8 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d16_h2_sse2) PRIVATE +sym(vp9_filter_block1d16_h2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqu xmm1, [rsi + 1] + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + + APPLY_FILTER_16 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d4_h2_avg_sse2) PRIVATE +sym(vp9_filter_block1d4_h2_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + + GET_PARAM_4 +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 1 + + APPLY_FILTER_4 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d8_h2_avg_sse2) PRIVATE +sym(vp9_filter_block1d8_h2_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 1 + + APPLY_FILTER_8 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d16_h2_avg_sse2) PRIVATE +sym(vp9_filter_block1d16_h2_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqu xmm1, [rsi + 1] + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + + APPLY_FILTER_16 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk index 78aabc7aa..d8c6eaaff 100644 --- a/vp9/vp9_common.mk +++ b/vp9/vp9_common.mk @@ -74,6 +74,7 @@ VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.h VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.c VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_loopfilter_mmx.asm VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_8t_sse2.asm +VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_bilinear_sse2.asm VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_ssse3.asm ifeq ($(CONFIG_VP9_POSTPROC),yes) VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_postproc_mmx.asm
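
Note (not part of the patch): the scalar arithmetic implemented by the new `*_h2_*`/`*_v2_*` routines can be read off the macros above — the two neighboring pixels are weighted by `filter[3]` and `filter[4]`, 64 (the `0x0400040` rounding constant) is added, the sum is shifted right by 7 (`psraw xmm0, 7`), and the result is clamped to a byte (`packuswb`). The sketch below is a hypothetical C reference for one row of the horizontal case; the function name and loop structure are illustrative only, not code from the patch. The `_v2_` variants read `src[x]` and `src[x + src_stride]` instead of adjacent columns, and the `_avg_` variants additionally average the result with the existing `dst` pixel (`pavgb`).

```c
#include <stdint.h>

/* Hypothetical scalar reference for the 2-tap (bilinear) horizontal filter.
 * filter points at an 8-tap VP9 kernel whose taps sum to 128; this path is
 * only taken when taps 0..2 are zero and filter[3] != 128, so only
 * filter[3] and filter[4] contribute. */
static void bilinear_horiz_row_c(const uint8_t *src, uint8_t *dst, int w,
                                 const int16_t *filter) {
  const int k3 = filter[3], k4 = filter[4];
  for (int x = 0; x < w; ++x) {
    int v = (src[x] * k3 + src[x + 1] * k4 + 64) >> 7; /* round then >> 7 */
    dst[x] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v)); /* packuswb clamp */
  }
}
```

Because only two taps are nonzero, this path also needs no 3-pixel border above/below the block, which is why the 2D wrapper in vp9_asm_stubs.c switches to the smaller `64 * 65` intermediate buffer and drops the `src - 3 * src_stride` / `fdata2 + 3 * 64` offsets used by the 8-tap path.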