From ff4a71f4c20b7b2047d17872747aed6e98586b13 Mon Sep 17 00:00:00 2001 From: Scott LaVarnway Date: Mon, 1 Nov 2010 16:24:15 -0400 Subject: [PATCH] SSSE3 version of fast quantizer (test clip: tulip) For good quality mode with speed=1, this gave the encoder a small (2 - 3%) performance boost. Change-Id: I8a1d4269465944ac0819986c2f0be4b0a2ee0b35 --- vp8/encoder/x86/quantize_ssse3.asm | 114 +++++++++++++++++++++++++ vp8/encoder/x86/x86_csystemdependent.c | 22 +++++ vp8/vp8cx.mk | 1 + 3 files changed, 137 insertions(+) create mode 100755 vp8/encoder/x86/quantize_ssse3.asm diff --git a/vp8/encoder/x86/quantize_ssse3.asm b/vp8/encoder/x86/quantize_ssse3.asm new file mode 100755 index 000000000..2f33199e5 --- /dev/null +++ b/vp8/encoder/x86/quantize_ssse3.asm @@ -0,0 +1,114 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + + +;int vp8_fast_quantize_b_impl_ssse3(short *coeff_ptr +; short *qcoeff_ptr,short *dequant_ptr, +; short *round_ptr, +; short *quant_ptr, short *dqcoeff_ptr); +; +global sym(vp8_fast_quantize_b_impl_ssse3) +sym(vp8_fast_quantize_b_impl_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + mov rdx, arg(0) ;coeff_ptr + mov rdi, arg(3) ;round_ptr + mov rsi, arg(4) ;quant_ptr + + movdqa xmm0, [rdx] + movdqa xmm4, [rdx + 16] + + movdqa xmm2, [rdi] ;round lo + movdqa xmm3, [rdi + 16] ;round hi + + movdqa xmm1, xmm0 + movdqa xmm5, xmm4 + + psraw xmm0, 15 ;sign of z (aka sz) + psraw xmm4, 15 ;sign of z (aka sz) + + pabsw xmm1, xmm1 + pabsw xmm5, xmm5 + + paddw xmm1, xmm2 + paddw xmm5, xmm3 + + pmulhw xmm1, [rsi] + pmulhw xmm5, [rsi + 16] + + mov rdi, arg(1) ;qcoeff_ptr + mov rcx, arg(2) ;dequant_ptr + mov rsi, arg(5) ;dqcoeff_ptr + + pxor xmm1, xmm0 + pxor xmm5, xmm4 + psubw xmm1, xmm0 + psubw xmm5, xmm4 + + movdqa [rdi], xmm1 + movdqa [rdi + 16], xmm5 + + movdqa xmm2, [rcx] + movdqa xmm3, [rcx + 16] + + pxor xmm4, xmm4 + pmullw xmm2, xmm1 + pmullw xmm3, xmm5 + + pcmpeqw xmm1, xmm4 ;non zero mask + pcmpeqw xmm5, xmm4 ;non zero mask + packsswb xmm1, xmm5 + pshufb xmm1, [ GLOBAL(zz_shuf)] + + pmovmskb edx, xmm1 + +; xor ecx, ecx +; mov eax, -1 +;find_eob_loop: +; shr edx, 1 +; jc fq_skip +; mov eax, ecx +;fq_skip: +; inc ecx +; cmp ecx, 16 +; jne find_eob_loop + xor rdi, rdi + mov eax, -1 + xor dx, ax ;flip the bits for bsr + bsr eax, edx + + movdqa [rsi], xmm2 ;store dqcoeff + movdqa [rsi + 16], xmm3 ;store dqcoeff + + sub edi, edx ;check for all zeros in bit mask + sar edi, 31 ;0 or -1 + add eax, 1 + and eax, edi ;if the bit mask was all zero, + ;then eob = 0 + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + +SECTION_RODATA +align 16 +zz_shuf: + db 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15 diff --git a/vp8/encoder/x86/x86_csystemdependent.c b/vp8/encoder/x86/x86_csystemdependent.c index ed0e71ed0..fb1b37ccb 100644 --- a/vp8/encoder/x86/x86_csystemdependent.c +++ b/vp8/encoder/x86/x86_csystemdependent.c @@ -179,6 +179,25 @@ void vp8_subtract_b_sse2(BLOCK *be, BLOCKD *bd, int pitch) #endif +#if HAVE_SSSE3 +int vp8_fast_quantize_b_impl_ssse3(short *coeff_ptr, + short *qcoeff_ptr, short *dequant_ptr, + short *round_ptr, + short *quant_ptr, short *dqcoeff_ptr); +void vp8_fast_quantize_b_ssse3(BLOCK *b, BLOCKD *d) +{ + d->eob = vp8_fast_quantize_b_impl_ssse3( + b->coeff, + d->qcoeff, + d->dequant, + b->round, + b->quant, + d->dqcoeff + ); +} +#endif + + void vp8_arch_x86_encoder_init(VP8_COMP *cpi) { #if CONFIG_RUNTIME_CPU_DETECT @@ -333,6 +352,9 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi) { cpi->rtcd.variance.sad16x16x3 = vp8_sad16x16x3_ssse3; cpi->rtcd.variance.sad16x8x3 = vp8_sad16x8x3_ssse3; + + cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_ssse3; + } #endif diff --git a/vp8/vp8cx.mk b/vp8/vp8cx.mk index f693f865b..683d785e6 100644 --- a/vp8/vp8cx.mk +++ b/vp8/vp8cx.mk @@ -109,6 +109,7 @@ VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/quantize_sse2.asm VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/subtract_sse2.asm VP8_CX_SRCS-$(HAVE_SSE3) += encoder/x86/sad_sse3.asm VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/sad_ssse3.asm +VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/quantize_ssse3.asm VP8_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/sad_sse4.asm VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/quantize_mmx.asm VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/encodeopt.asm -- 2.40.0