From c8defcfdeea614a780af9a2405f59c60cab876ad Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" Date: Mon, 1 Jul 2013 11:36:33 -0700 Subject: [PATCH] Update quantize SSSE3 SIMD to cover 32x32 transform case also. Encode time of bus (speed 0) 50 frames @ 1500kbps goes from 2min14.4 to 2min10.1, i.e. a 3.2% reduction in overall encode time. Change-Id: I3699580e74ec26c7d24e03681bc47ba25ee1ee87 --- vp9/common/vp9_rtcd_defs.sh | 3 ++ vp9/encoder/vp9_quantize.c | 31 ++++++++++--------- vp9/encoder/x86/vp9_quantize_ssse3.asm | 41 +++++++++++++++++++++++--- 3 files changed, 55 insertions(+), 20 deletions(-) diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh index e7cefa57c..330c60f6d 100644 --- a/vp9/common/vp9_rtcd_defs.sh +++ b/vp9/common/vp9_rtcd_defs.sh @@ -569,6 +569,9 @@ specialize vp9_subtract_block sse2 prototype void vp9_quantize_b "int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, int16_t *zbin_ptr, int16_t *round_ptr, int16_t *quant_ptr, int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan" specialize vp9_quantize_b $ssse3_x86_64 +prototype void vp9_quantize_b_32x32 "int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, int16_t *zbin_ptr, int16_t *round_ptr, int16_t *quant_ptr, int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan" +specialize vp9_quantize_b_32x32 $ssse3_x86_64 + # # Structured Similarity (SSIM) # diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c index 6f2e13a0e..862923fd4 100644 --- a/vp9/encoder/vp9_quantize.c +++ b/vp9/encoder/vp9_quantize.c @@ -85,18 +85,19 @@ void vp9_quantize_b_c(int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, } // This function works well for large transform size. 
-static void quantize_sparse(int16_t *coeff_ptr, intptr_t n_coeffs, +void vp9_quantize_b_32x32_c(int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, int16_t *zbin_ptr, int16_t *round_ptr, int16_t *quant_ptr, int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, - int *idx_arr) { + const int16_t *iscan) { int i, rc, eob; int zbins[2], nzbins[2], zbin; int x, y, z, sz; int idx = 0; + int idx_arr[1024]; vpx_memset(qcoeff_ptr, 0, n_coeffs*sizeof(int16_t)); vpx_memset(dqcoeff_ptr, 0, n_coeffs*sizeof(int16_t)); @@ -179,20 +180,18 @@ void vp9_quantize(MACROBLOCK *mb, int plane, int block, int n_coeffs, // Call different quantization for different transform size. if (n_coeffs >= 1024) { // Save index of picked coefficient in pre-scan pass. - int idx_arr[1024]; - - quantize_sparse(BLOCK_OFFSET(mb->plane[plane].coeff, block, 16), - n_coeffs, mb->skip_block, - mb->plane[plane].zbin, - mb->plane[plane].round, - mb->plane[plane].quant, - mb->plane[plane].quant_shift, - BLOCK_OFFSET(xd->plane[plane].qcoeff, block, 16), - BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16), - xd->plane[plane].dequant, - mb->plane[plane].zbin_extra, - &xd->plane[plane].eobs[block], - scan, idx_arr); + vp9_quantize_b_32x32(BLOCK_OFFSET(mb->plane[plane].coeff, block, 16), + n_coeffs, mb->skip_block, + mb->plane[plane].zbin, + mb->plane[plane].round, + mb->plane[plane].quant, + mb->plane[plane].quant_shift, + BLOCK_OFFSET(xd->plane[plane].qcoeff, block, 16), + BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16), + xd->plane[plane].dequant, + mb->plane[plane].zbin_extra, + &xd->plane[plane].eobs[block], + scan, iscan); } else { vp9_quantize_b(BLOCK_OFFSET(mb->plane[plane].coeff, block, 16), diff --git a/vp9/encoder/x86/vp9_quantize_ssse3.asm b/vp9/encoder/x86/vp9_quantize_ssse3.asm index 665bafacb..b666abbd9 100644 --- a/vp9/encoder/x86/vp9_quantize_ssse3.asm +++ b/vp9/encoder/x86/vp9_quantize_ssse3.asm @@ 
-15,10 +15,10 @@ pw_1: times 8 dw 1 SECTION .text -INIT_XMM ssse3 -cglobal quantize_b, 0, 6, 15, coeff, ncoeff, skip, zbin, round, quant, \ - shift, qcoeff, dqcoeff, dequant, zbin_oq, \ - eob, scan, iscan +%macro QUANTIZE_FN 1 +cglobal quantize_%1, 0, 6, 15, coeff, ncoeff, skip, zbin, round, quant, \ + shift, qcoeff, dqcoeff, dequant, zbin_oq, \ + eob, scan, iscan cmp dword skipm, 0 jne .blank @@ -57,6 +57,10 @@ cglobal quantize_b, 0, 6, 15, coeff, ncoeff, skip, zbin, round, quant, \ mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i] pabsw m6, m9 ; m6 = abs(m9) pabsw m11, m10 ; m11 = abs(m10) +%ifidn %1, b_32x32 + paddw m6, m6 + paddw m11, m11 +%endif pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin punpckhqdq m0, m0 pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin @@ -77,9 +81,19 @@ cglobal quantize_b, 0, 6, 15, coeff, ncoeff, skip, zbin, round, quant, \ pand m13, m12 mova [qcoeffq+ncoeffq*2+ 0], m8 mova [qcoeffq+ncoeffq*2+16], m13 +%ifidn %1, b_32x32 + pabsw m8, m8 + pabsw m13, m13 +%endif pmullw m8, m3 ; dqc[i] = qc[i] * q punpckhqdq m3, m3 pmullw m13, m3 ; dqc[i] = qc[i] * q +%ifidn %1, b_32x32 + psrlw m8, 1 + psrlw m13, 1 + psignw m8, m9 + psignw m13, m10 +%endif mova [dqcoeffq+ncoeffq*2+ 0], m8 mova [dqcoeffq+ncoeffq*2+16], m13 pcmpeqw m8, m5 ; m8 = c[i] == 0 @@ -99,6 +113,10 @@ cglobal quantize_b, 0, 6, 15, coeff, ncoeff, skip, zbin, round, quant, \ mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i] pabsw m6, m9 ; m6 = abs(m9) pabsw m11, m10 ; m11 = abs(m10) +%ifidn %1, b_32x32 + paddw m6, m6 + paddw m11, m11 +%endif pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin paddw m6, m1 ; m6 += round @@ -115,8 +133,18 @@ cglobal quantize_b, 0, 6, 15, coeff, ncoeff, skip, zbin, round, quant, \ pand m13, m12 mova [qcoeffq+ncoeffq*2+ 0], m14 mova [qcoeffq+ncoeffq*2+16], m13 +%ifidn %1, b_32x32 + pabsw m14, m14 + pabsw m13, m13 +%endif pmullw m14, m3 ; dqc[i] = qc[i] * q pmullw m13, m3 ; dqc[i] = qc[i] * q +%ifidn %1, b_32x32 + psrlw m14, 1 + psrlw m13, 1 + psignw 
m14, m9 + psignw m13, m10 +%endif mova [dqcoeffq+ncoeffq*2+ 0], m14 mova [dqcoeffq+ncoeffq*2+16], m13 pcmpeqw m14, m5 ; m14 = c[i] == 0 @@ -163,3 +191,8 @@ cglobal quantize_b, 0, 6, 15, coeff, ncoeff, skip, zbin, round, quant, \ jl .blank_loop mov word [eobq], 0 RET +%endmacro + +INIT_XMM ssse3 +QUANTIZE_FN b +QUANTIZE_FN b_32x32 -- 2.40.0