From: Scott LaVarnway Date: Fri, 26 Oct 2012 00:24:50 +0000 (-0700) Subject: Faster 8t filtering X-Git-Tag: v1.3.0~1217^2~176^2 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=ce811f87c49d8dbd0f11895d1105310fe061bc5f;p=libvpx Faster 8t filtering Quickly modified the ssse3 sixtap filters to support eight taps. For the test clip used, a 23+% boost in decoder performance was seen. We can revisit later and improve further. Change-Id: I5f59860459e80d6fa23e6cc0fd91296a969f5240 --- diff --git a/vp8/common/filter.c b/vp8/common/filter.c index 0bc88e5dd..a70a981b6 100644 --- a/vp8/common/filter.c +++ b/vp8/common/filter.c @@ -901,8 +901,7 @@ void vp8_eighttap_predict_avg16x16_c unsigned char *dst_ptr, int dst_pitch ) { - unsigned char tmp[16 * 16]; - + DECLARE_ALIGNED_ARRAY(16, unsigned char, tmp, 16 * 16); const short *HFilter = vp8_sub_pel_filters_8[xoffset]; const short *VFilter = vp8_sub_pel_filters_8[yoffset]; @@ -921,7 +920,7 @@ void vp8_eighttap_predict_avg16x16_sharp_c unsigned char *dst_ptr, int dst_pitch ) { - unsigned char tmp[16 * 16]; + DECLARE_ALIGNED_ARRAY(16, unsigned char, tmp, 16 * 16); const short *HFilter = vp8_sub_pel_filters_8s[xoffset]; const short *VFilter = vp8_sub_pel_filters_8s[yoffset]; diff --git a/vp8/common/rtcd_defs.sh b/vp8/common/rtcd_defs.sh index d7e5b5b25..e45028284 100644 --- a/vp8/common/rtcd_defs.sh +++ b/vp8/common/rtcd_defs.sh @@ -22,9 +22,9 @@ prototype void vp8_filter_block2d_16x16_8 "const unsigned char *src_ptr, const u # on the safe side, only enabled when compiled with 'gcc'. 
if [ "$CONFIG_GCC" = "yes" ]; then specialize vp8_filter_block2d_4x4_8 sse4_1 sse2 - specialize vp8_filter_block2d_8x4_8 sse4_1 sse2 - specialize vp8_filter_block2d_8x8_8 sse4_1 sse2 - specialize vp8_filter_block2d_16x16_8 sse4_1 sse2 + specialize vp8_filter_block2d_8x4_8 ssse3 #sse4_1 sse2 + specialize vp8_filter_block2d_8x8_8 ssse3 #sse4_1 sse2 + specialize vp8_filter_block2d_16x16_8 ssse3 #sse4_1 sse2 fi diff --git a/vp8/common/x86/subpixel_8t_ssse3.asm b/vp8/common/x86/subpixel_8t_ssse3.asm new file mode 100644 index 000000000..8f8c7389e --- /dev/null +++ b/vp8/common/x86/subpixel_8t_ssse3.asm @@ -0,0 +1,550 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +;/************************************************************************************ +; Notes: the filter_block1d8/16_h8 and filter_block1d8/16_v8 routines below apply an 8 tap +; filter to the input pixels, horizontally (h8) or vertically (v8). Each call produces +; output_height rows that are 8 or 16 pixels wide; output_height does not have to be an +; even number. ONE row is calculated each iteration to take advantage of the 128-bit operations. 
+; +; This is an implementation of some of the SSE optimizations first seen in ffvp8 +; +;*************************************************************************************/ + +;void vp8_filter_block1d8_v8_ssse3 +;( +; unsigned char *src_ptr, +; unsigned int src_pitch, +; unsigned char *output_ptr, +; unsigned int out_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(vp8_filter_block1d8_v8_ssse3) +sym(vp8_filter_block1d8_v8_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16*5 + %define k0k1 [rsp + 16*0] + %define k2k3 [rsp + 16*1] + %define k4k5 [rsp + 16*2] + %define k6k7 [rsp + 16*3] + %define krd [rsp + 16*4] + + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x0400040 + + movdqa xmm4, [rdx] ;load filters + movd xmm5, rcx + packsswb xmm4, xmm4 + pshuflw xmm0, xmm4, 0b ;k0_k1 + pshuflw xmm1, xmm4, 01010101b ;k2_k3 + pshuflw xmm2, xmm4, 10101010b ;k4_k5 + pshuflw xmm3, xmm4, 11111111b ;k6_k7 + + punpcklqdq xmm0, xmm0 + punpcklqdq xmm1, xmm1 + punpcklqdq xmm2, xmm2 + punpcklqdq xmm3, xmm3 + + movdqa k0k1, xmm0 + movdqa k2k3, xmm1 + pshufd xmm5, xmm5, 0 + movdqa k4k5, xmm2 + movdqa k6k7, xmm3 + movdqa krd, xmm5 + + movsxd rdx, DWORD PTR arg(1) ;pixels_per_line + +%if ABI_IS_32BIT=0 + movsxd r8, DWORD PTR arg(3) ;out_pitch +%endif + mov rax, rsi + movsxd rcx, DWORD PTR arg(4) ;output_height + add rax, rdx + + lea rbx, [rdx + rdx*4] + add rbx, rdx ;pitch * 6 + +.vp8_filter_block1d8_v8_ssse3_loop: + movq xmm0, [rsi] ;A + movq xmm1, [rsi + rdx] ;B + movq xmm2, [rsi + rdx * 2] ;C + movq xmm3, [rax + rdx * 2] ;D + movq xmm4, [rsi + rdx * 4] ;E + movq xmm5, [rax + rdx * 4] ;F + + punpcklbw xmm0, xmm1 ;A B + punpcklbw xmm2, xmm3 ;C D + punpcklbw xmm4, xmm5 ;E F + + movq xmm6, [rsi + rbx] ;G + movq xmm7, [rax + rbx] ;H + + pmaddubsw xmm0, k0k1 + pmaddubsw xmm2, k2k3 + punpcklbw xmm6, xmm7 ;G H + 
pmaddubsw xmm4, k4k5 + pmaddubsw xmm6, k6k7 + + paddsw xmm0, xmm2 + paddsw xmm0, krd + paddsw xmm4, xmm6 + paddsw xmm0, xmm4 + + psraw xmm0, 7 + packuswb xmm0, xmm0 + + add rsi, rdx + add rax, rdx + + movq [rdi], xmm0 + +%if ABI_IS_32BIT + add rdi, DWORD PTR arg(3) ;out_pitch +%else + add rdi, r8 +%endif + dec rcx + jnz .vp8_filter_block1d8_v8_ssse3_loop + + add rsp, 16*5 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void vp8_filter_block1d16_v8_ssse3 +;( +; unsigned char *src_ptr, +; unsigned int src_pitch, +; unsigned char *output_ptr, +; unsigned int out_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(vp8_filter_block1d16_v8_ssse3) +sym(vp8_filter_block1d16_v8_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16*5 + %define k0k1 [rsp + 16*0] + %define k2k3 [rsp + 16*1] + %define k4k5 [rsp + 16*2] + %define k6k7 [rsp + 16*3] + %define krd [rsp + 16*4] + + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x0400040 + + movdqa xmm4, [rdx] ;load filters + movd xmm5, rcx + packsswb xmm4, xmm4 + pshuflw xmm0, xmm4, 0b ;k0_k1 + pshuflw xmm1, xmm4, 01010101b ;k2_k3 + pshuflw xmm2, xmm4, 10101010b ;k4_k5 + pshuflw xmm3, xmm4, 11111111b ;k6_k7 + + punpcklqdq xmm0, xmm0 + punpcklqdq xmm1, xmm1 + punpcklqdq xmm2, xmm2 + punpcklqdq xmm3, xmm3 + + movdqa k0k1, xmm0 + movdqa k2k3, xmm1 + pshufd xmm5, xmm5, 0 + movdqa k4k5, xmm2 + movdqa k6k7, xmm3 + movdqa krd, xmm5 + + movsxd rdx, DWORD PTR arg(1) ;pixels_per_line + +%if ABI_IS_32BIT=0 + movsxd r8, DWORD PTR arg(3) ;out_pitch +%endif + mov rax, rsi + movsxd rcx, DWORD PTR arg(4) ;output_height + add rax, rdx + + lea rbx, [rdx + rdx*4] + add rbx, rdx ;pitch * 6 + +.vp8_filter_block1d16_v8_ssse3_loop: + movq xmm0, [rsi] ;A + movq xmm1, [rsi + rdx] ;B + movq xmm2, [rsi + rdx * 2] ;C + movq xmm3, 
[rax + rdx * 2] ;D + movq xmm4, [rsi + rdx * 4] ;E + movq xmm5, [rax + rdx * 4] ;F + + punpcklbw xmm0, xmm1 ;A B + punpcklbw xmm2, xmm3 ;C D + punpcklbw xmm4, xmm5 ;E F + + movq xmm6, [rsi + rbx] ;G + movq xmm7, [rax + rbx] ;H + + pmaddubsw xmm0, k0k1 + pmaddubsw xmm2, k2k3 + punpcklbw xmm6, xmm7 ;G H + pmaddubsw xmm4, k4k5 + pmaddubsw xmm6, k6k7 + + paddsw xmm0, xmm2 + paddsw xmm0, krd + paddsw xmm4, xmm6 + paddsw xmm0, xmm4 + + psraw xmm0, 7 + packuswb xmm0, xmm0 + + movq [rdi], xmm0 + + movq xmm0, [rsi + 8] ;A + movq xmm1, [rsi + rdx + 8] ;B + movq xmm2, [rsi + rdx * 2 + 8] ;C + movq xmm3, [rax + rdx * 2 + 8] ;D + movq xmm4, [rsi + rdx * 4 + 8] ;E + movq xmm5, [rax + rdx * 4 + 8] ;F + + punpcklbw xmm0, xmm1 ;A B + punpcklbw xmm2, xmm3 ;C D + punpcklbw xmm4, xmm5 ;E F + + + movq xmm6, [rsi + rbx + 8] ;G + movq xmm7, [rax + rbx + 8] ;H + punpcklbw xmm6, xmm7 ;G H + + + pmaddubsw xmm0, k0k1 + pmaddubsw xmm2, k2k3 + pmaddubsw xmm4, k4k5 + pmaddubsw xmm6, k6k7 + + paddsw xmm0, xmm2 + paddsw xmm4, xmm6 + paddsw xmm0, krd + paddsw xmm0, xmm4 + + psraw xmm0, 7 + packuswb xmm0, xmm0 + + add rsi, rdx + add rax, rdx + + movq [rdi+8], xmm0 + +%if ABI_IS_32BIT + add rdi, DWORD PTR arg(3) ;out_pitch +%else + add rdi, r8 +%endif + dec rcx + jnz .vp8_filter_block1d16_v8_ssse3_loop + + add rsp, 16*5 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void vp8_filter_block1d8_h8_ssse3 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; unsigned int output_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(vp8_filter_block1d8_h8_ssse3) +sym(vp8_filter_block1d8_h8_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16*5 + %define k0k1 [rsp + 16*0] + %define k2k3 [rsp + 16*1] + %define k4k5 [rsp + 16*2] + %define k6k7 [rsp + 16*3] + %define krd [rsp + 16*4] 
+ + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x0400040 + + movdqa xmm4, [rdx] ;load filters + movd xmm5, rcx + packsswb xmm4, xmm4 + pshuflw xmm0, xmm4, 0b ;k0_k1 + pshuflw xmm1, xmm4, 01010101b ;k2_k3 + pshuflw xmm2, xmm4, 10101010b ;k4_k5 + pshuflw xmm3, xmm4, 11111111b ;k6_k7 + + punpcklqdq xmm0, xmm0 + punpcklqdq xmm1, xmm1 + punpcklqdq xmm2, xmm2 + punpcklqdq xmm3, xmm3 + + movdqa k0k1, xmm0 + movdqa k2k3, xmm1 + pshufd xmm5, xmm5, 0 + movdqa k4k5, xmm2 + movdqa k6k7, xmm3 +; movdqa krd, xmm5 + + movsxd rax, dword ptr arg(1) ;src_pixels_per_line + movsxd rdx, dword ptr arg(3) ;output_pitch + movsxd rcx, dword ptr arg(4) ;output_height + +.filter_block1d8_h8_rowloop_ssse3: + movq xmm0, [rsi - 3] ; -3 -2 -1 0 1 2 3 4 + +; movq xmm3, [rsi + 4] ; 4 5 6 7 8 9 10 11 + movq xmm3, [rsi + 5] ; 5 6 7 8 9 10 11 12 +;note: if we create a k0_k7 filter, we can save a pshufb +; punpcklbw xmm0, xmm3 ; -3 4 -2 5 -1 6 0 7 1 8 2 9 3 10 4 11 + punpcklqdq xmm0, xmm3 + + movdqa xmm1, xmm0 + pshufb xmm0, [GLOBAL(shuf_t0t1)] + pmaddubsw xmm0, k0k1 + + movdqa xmm2, xmm1 + pshufb xmm1, [GLOBAL(shuf_t2t3)] + pmaddubsw xmm1, k2k3 + + movdqa xmm4, xmm2 + pshufb xmm2, [GLOBAL(shuf_t4t5)] + pmaddubsw xmm2, k4k5 + + pshufb xmm4, [GLOBAL(shuf_t6t7)] + pmaddubsw xmm4, k6k7 + + paddsw xmm0, xmm1 + paddsw xmm0, xmm2 + paddsw xmm0, xmm5 + paddsw xmm0, xmm4 + psraw xmm0, 7 + packuswb xmm0, xmm0 + + lea rsi, [rsi + rax] + movq [rdi], xmm0 + + lea rdi, [rdi + rdx] + dec rcx + jnz .filter_block1d8_h8_rowloop_ssse3 + + add rsp, 16*5 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void vp8_filter_block1d16_h8_ssse3 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; unsigned int output_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(vp8_filter_block1d16_h8_ssse3) +sym(vp8_filter_block1d16_h8_ssse3): + push rbp + 
mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16*5 + %define k0k1 [rsp + 16*0] + %define k2k3 [rsp + 16*1] + %define k4k5 [rsp + 16*2] + %define k6k7 [rsp + 16*3] + %define krd [rsp + 16*4] + + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x0400040 + + movdqa xmm4, [rdx] ;load filters + movd xmm5, rcx + packsswb xmm4, xmm4 + pshuflw xmm0, xmm4, 0b ;k0_k1 + pshuflw xmm1, xmm4, 01010101b ;k2_k3 + pshuflw xmm2, xmm4, 10101010b ;k4_k5 + pshuflw xmm3, xmm4, 11111111b ;k6_k7 + + punpcklqdq xmm0, xmm0 + punpcklqdq xmm1, xmm1 + punpcklqdq xmm2, xmm2 + punpcklqdq xmm3, xmm3 + + movdqa k0k1, xmm0 + movdqa k2k3, xmm1 + pshufd xmm5, xmm5, 0 + movdqa k4k5, xmm2 + movdqa k6k7, xmm3 + movdqa krd, xmm5 + + movsxd rax, dword ptr arg(1) ;src_pixels_per_line + movsxd rdx, dword ptr arg(3) ;output_pitch + movsxd rcx, dword ptr arg(4) ;output_height + +.filter_block1d16_h8_rowloop_ssse3: + movq xmm0, [rsi - 3] ; -3 -2 -1 0 1 2 3 4 + +; movq xmm3, [rsi + 4] ; 4 5 6 7 8 9 10 11 + movq xmm3, [rsi + 5] ; 5 6 7 8 9 10 11 12 +;note: if we create a k0_k7 filter, we can save a pshufb +; punpcklbw xmm0, xmm3 ; -3 4 -2 5 -1 6 0 7 1 8 2 9 3 10 4 11 + punpcklqdq xmm0, xmm3 + + movdqa xmm1, xmm0 + pshufb xmm0, [GLOBAL(shuf_t0t1)] + pmaddubsw xmm0, k0k1 + + movdqa xmm2, xmm1 + pshufb xmm1, [GLOBAL(shuf_t2t3)] + pmaddubsw xmm1, k2k3 + + movdqa xmm4, xmm2 + pshufb xmm2, [GLOBAL(shuf_t4t5)] + pmaddubsw xmm2, k4k5 + + pshufb xmm4, [GLOBAL(shuf_t6t7)] + pmaddubsw xmm4, k6k7 + + paddsw xmm0, xmm1 + paddsw xmm0, xmm4 + paddsw xmm0, xmm2 + paddsw xmm0, krd + psraw xmm0, 7 + packuswb xmm0, xmm0 + + + movq xmm3, [rsi + 5] +; movq xmm7, [rsi + 12] + movq xmm7, [rsi + 13] +;note: same as above +; punpcklbw xmm3, xmm7 + punpcklqdq xmm3, xmm7 + + movdqa xmm1, xmm3 + pshufb xmm3, [GLOBAL(shuf_t0t1)] + pmaddubsw xmm3, k0k1 + + movdqa xmm2, xmm1 + pshufb xmm1, 
[GLOBAL(shuf_t2t3)] + pmaddubsw xmm1, k2k3 + + movdqa xmm4, xmm2 + pshufb xmm2, [GLOBAL(shuf_t4t5)] + pmaddubsw xmm2, k4k5 + + pshufb xmm4, [GLOBAL(shuf_t6t7)] + pmaddubsw xmm4, k6k7 + + paddsw xmm3, xmm1 + paddsw xmm3, xmm2 + paddsw xmm3, krd + paddsw xmm3, xmm4 + psraw xmm3, 7 + packuswb xmm3, xmm3 + punpcklqdq xmm0, xmm3 + + lea rsi, [rsi + rax] + movdqa [rdi], xmm0 + + lea rdi, [rdi + rdx] + dec rcx + jnz .filter_block1d16_h8_rowloop_ssse3 + + add rsp, 16*5 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +SECTION_RODATA +align 16 +shuf_t0t1: + db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 +align 16 +shuf_t2t3: + db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 +align 16 +shuf_t4t5: + db 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 +align 16 +shuf_t6t7: + db 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 diff --git a/vp8/common/x86/vp8_asm_stubs.c b/vp8/common/x86/vp8_asm_stubs.c index f39941258..4d9303666 100644 --- a/vp8/common/x86/vp8_asm_stubs.c +++ b/vp8/common/x86/vp8_asm_stubs.c @@ -525,7 +525,99 @@ void vp8_sixtap_predict4x4_ssse3 } else { vp8_filter_block1d4_v6_ssse3(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, dst_ptr, dst_pitch, 4, yoffset); } +} +void vp8_filter_block1d16_v8_ssse3(const unsigned char *src_ptr, + const unsigned int src_pitch, + unsigned char *output_ptr, + unsigned int out_pitch, + unsigned int output_height, + const short *filter); +void vp8_filter_block1d16_h8_ssse3(const unsigned char *src_ptr, + const unsigned int src_pitch, + unsigned char *output_ptr, + unsigned int out_pitch, + unsigned int output_height, + const short *filter); + +void vp8_filter_block2d_16x16_8_ssse3 +( + const unsigned char *src_ptr, const unsigned int src_stride, + const short *HFilter_aligned16, const short *VFilter_aligned16, + unsigned char *dst_ptr, unsigned int dst_stride +) { + if (HFilter_aligned16[3] !=128 && VFilter_aligned16[3] != 128) { + 
DECLARE_ALIGNED_ARRAY(16, unsigned char, FData2, 23 * 16); + vp8_filter_block1d16_h8_ssse3(src_ptr - (3 * src_stride), src_stride, + FData2, 16, 23, HFilter_aligned16); + vp8_filter_block1d16_v8_ssse3(FData2, 16, dst_ptr, dst_stride, 16, + VFilter_aligned16); + } else { + if (HFilter_aligned16[3] !=128) { + vp8_filter_block1d16_h8_ssse3(src_ptr, src_stride, dst_ptr, dst_stride, + 16, HFilter_aligned16); + } else { + vp8_filter_block1d16_v8_ssse3(src_ptr - (3 * src_stride), src_stride, + dst_ptr, dst_stride, 16, VFilter_aligned16); + } + } +} + +void vp8_filter_block1d8_v8_ssse3(const unsigned char *src_ptr, + const unsigned int src_pitch, + unsigned char *output_ptr, + unsigned int out_pitch, + unsigned int output_height, + const short *filter); +void vp8_filter_block1d8_h8_ssse3(const unsigned char *src_ptr, + const unsigned int src_pitch, + unsigned char *output_ptr, + unsigned int out_pitch, + unsigned int output_height, + const short *filter); +void vp8_filter_block2d_8x8_8_ssse3 +( + const unsigned char *src_ptr, const unsigned int src_stride, + const short *HFilter_aligned16, const short *VFilter_aligned16, + unsigned char *dst_ptr, unsigned int dst_stride +) { + if (HFilter_aligned16[3] !=128 && VFilter_aligned16[3] != 128) { + DECLARE_ALIGNED_ARRAY(16, unsigned char, FData2, 23 * 16); + vp8_filter_block1d8_h8_ssse3(src_ptr - (3 * src_stride), src_stride, + FData2, 16, 15, HFilter_aligned16); + vp8_filter_block1d8_v8_ssse3(FData2, 16, dst_ptr, dst_stride, 8, + VFilter_aligned16); + } else { + if (HFilter_aligned16[3] !=128) { + vp8_filter_block1d8_h8_ssse3(src_ptr, src_stride, dst_ptr, dst_stride, 8, + HFilter_aligned16); + } else { + vp8_filter_block1d8_v8_ssse3(src_ptr - (3 * src_stride), src_stride, + dst_ptr, dst_stride, 8, VFilter_aligned16); + } + } } +void vp8_filter_block2d_8x4_8_ssse3 +( + const unsigned char *src_ptr, const unsigned int src_stride, + const short *HFilter_aligned16, const short *VFilter_aligned16, + unsigned char *dst_ptr, unsigned 
int dst_stride +) { + if (HFilter_aligned16[3] !=128 && VFilter_aligned16[3] != 128) { + DECLARE_ALIGNED_ARRAY(16, unsigned char, FData2, 23 * 16); + vp8_filter_block1d8_h8_ssse3(src_ptr - (3 * src_stride), src_stride, + FData2, 16, 11, HFilter_aligned16); + vp8_filter_block1d8_v8_ssse3(FData2, 16, dst_ptr, dst_stride, 4, + VFilter_aligned16); + } else { + if (HFilter_aligned16[3] !=128) { + vp8_filter_block1d8_h8_ssse3(src_ptr, src_stride, dst_ptr, dst_stride, 4, + HFilter_aligned16); + } else { + vp8_filter_block1d8_v8_ssse3(src_ptr - (3 * src_stride), src_stride, + dst_ptr, dst_stride, 4, VFilter_aligned16); + } + } +} #endif diff --git a/vp8/vp8_common.mk b/vp8/vp8_common.mk index d6f31ed79..c2eeff1b3 100644 --- a/vp8/vp8_common.mk +++ b/vp8/vp8_common.mk @@ -100,6 +100,7 @@ VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/recon_wrapper_sse2.c VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/subpixel_sse2.asm VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/loopfilter_sse2.asm VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/iwalsh_sse2.asm +VP8_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/subpixel_8t_ssse3.asm VP8_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/subpixel_ssse3.asm ifeq ($(CONFIG_POSTPROC),yes) VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/postproc_mmx.asm