From 37c68efee2daa98d8abc9853d2210477d5c0104f Mon Sep 17 00:00:00 2001 From: Julia Robson Date: Tue, 6 Oct 2015 12:11:14 +0100 Subject: [PATCH] SSSE3 optimisation for quantize in high bit depth When configured with high bit detpth enabled, the 8bit quantize function stopped using optimised code. This made 8bit content decode slowly. This commit re-enables the SSSE3 optimisations. Change-Id: I194b505dd3f4c494e5c5e53e020f5d94534b16b5 --- vpx_dsp/vpx_dsp_rtcd_defs.pl | 4 +- vpx_dsp/x86/quantize_ssse3_x86_64.asm | 136 +++++++++++++++++++++++++- 2 files changed, 137 insertions(+), 3 deletions(-) diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 7f9b882e7..6e071ee51 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -849,10 +849,10 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { if ((vpx_config("CONFIG_VP9_ENCODER") eq "yes") || (vpx_config("CONFIG_VP10_ENCODER") eq "yes")) { if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; - specialize qw/vpx_quantize_b sse2/; + specialize qw/vpx_quantize_b sse2/, "$ssse3_x86_64_x86inc"; add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; - specialize qw/vpx_quantize_b_32x32/; + specialize qw/vpx_quantize_b_32x32/, "$ssse3_x86_64_x86inc"; add_proto qw/void vpx_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vpx_highbd_quantize_b sse2/; diff --git a/vpx_dsp/x86/quantize_ssse3_x86_64.asm b/vpx_dsp/x86/quantize_ssse3_x86_64.asm index 3784d9d2e..2f3cadde2 100644 --- a/vpx_dsp/x86/quantize_ssse3_x86_64.asm +++ b/vpx_dsp/x86/quantize_ssse3_x86_64.asm @@ -53,15 +53,31 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ %endif pxor m5, m5 ; m5 = dedicated zero DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, eob +%if CONFIG_VP9_HIGHBITDEPTH + lea coeffq, [ coeffq+ncoeffq*4] + lea qcoeffq, [ qcoeffq+ncoeffq*4] + lea dqcoeffq, [dqcoeffq+ncoeffq*4] +%else lea coeffq, [ coeffq+ncoeffq*2] - lea iscanq, [ iscanq+ncoeffq*2] lea qcoeffq, [ qcoeffq+ncoeffq*2] lea dqcoeffq, [dqcoeffq+ncoeffq*2] +%endif + lea iscanq, [ iscanq+ncoeffq*2] neg ncoeffq ; get DC and first 15 AC coeffs +%if CONFIG_VP9_HIGHBITDEPTH + ; coeff stored as 32bit numbers & require 16bit numbers + mova m9, [ coeffq+ncoeffq*4+ 0] + mova m6, [ coeffq+ncoeffq*4+16] + mova m10, [ coeffq+ncoeffq*4+32] + mova m11, [ coeffq+ncoeffq*4+48] + packssdw m9, m6 ; m9 = c[i] + packssdw m10, m11 ; m10 = c[i] +%else mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i] mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i] +%endif pabsw m6, m9 ; m6 = abs(m9) pabsw m11, m10 ; m11 = abs(m10) pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin @@ -82,8 +98,28 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ psignw m13, m10 ; m13 = reinsert sign pand m8, m7 pand m13, m12 +%if CONFIG_VP9_HIGHBITDEPTH + ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff + mova m11, m8 + mova m6, m8 + pcmpgtw m5, m8 + punpcklwd m11, m5 + punpckhwd m6, m5 + mova [qcoeffq+ncoeffq*4+ 0], m11 + mova [qcoeffq+ncoeffq*4+16], m6 + pxor m5, m5 + mova m11, m13 + mova m6, m13 + pcmpgtw m5, m13 + punpcklwd m11, m5 + punpckhwd m6, m5 + mova [qcoeffq+ncoeffq*4+32], m11 + mova [qcoeffq+ncoeffq*4+48], m6 + pxor m5, m5 ; reset m5 to zero register +%else mova [qcoeffq+ncoeffq*2+ 0], m8 mova [qcoeffq+ncoeffq*2+16], m13 +%endif %ifidn %1, b_32x32 pabsw m8, m8 pabsw m13, m13 @@ -97,8 +133,28 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ psignw m8, m9 psignw m13, m10 %endif +%if CONFIG_VP9_HIGHBITDEPTH + ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff + mova m11, m8 + mova m6, m8 + pcmpgtw m5, m8 + punpcklwd m11, m5 + punpckhwd m6, m5 + mova [dqcoeffq+ncoeffq*4+ 0], m11 + mova [dqcoeffq+ncoeffq*4+16], m6 + pxor m5, m5 + mova m11, m13 + mova m6, m13 + pcmpgtw m5, m13 + punpcklwd m11, m5 + punpckhwd m6, m5 + mova [dqcoeffq+ncoeffq*4+32], m11 + mova [dqcoeffq+ncoeffq*4+48], m6 + pxor m5, m5 ; reset m5 to zero register +%else mova [dqcoeffq+ncoeffq*2+ 0], m8 mova [dqcoeffq+ncoeffq*2+16], m13 +%endif pcmpeqw m8, m5 ; m8 = c[i] == 0 pcmpeqw m13, m5 ; m13 = c[i] == 0 mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i] @@ -112,8 +168,18 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ jz .accumulate_eob .ac_only_loop: +%if CONFIG_VP9_HIGHBITDEPTH + ; pack coeff from 32bit to 16bit array + mova m9, [ coeffq+ncoeffq*4+ 0] + mova m6, [ coeffq+ncoeffq*4+16] + mova m10, [ coeffq+ncoeffq*4+32] + mova m11, [ coeffq+ncoeffq*4+48] + packssdw m9, m6 ; m9 = c[i] + packssdw m10, m11 ; m10 = c[i] +%else mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i] mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i] +%endif pabsw m6, m9 ; m6 = abs(m9) pabsw m11, m10 ; m11 = abs(m10) pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin @@ -136,8 +202,29 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ psignw m13, m10 ; m13 = reinsert sign pand m14, m7 pand m13, m12 +%if CONFIG_VP9_HIGHBITDEPTH + ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff + pxor m11, m11 + mova m11, m14 + mova m6, m14 + pcmpgtw m5, m14 + punpcklwd m11, m5 + punpckhwd m6, m5 + mova [qcoeffq+ncoeffq*4+ 0], m11 + mova [qcoeffq+ncoeffq*4+16], m6 + pxor m5, m5 + mova m11, m13 + mova m6, m13 + pcmpgtw m5, m13 + punpcklwd m11, m5 + punpckhwd m6, m5 + mova [qcoeffq+ncoeffq*4+32], m11 + mova [qcoeffq+ncoeffq*4+48], m6 + pxor m5, m5 ; reset m5 to zero register +%else mova [qcoeffq+ncoeffq*2+ 0], m14 mova [qcoeffq+ncoeffq*2+16], m13 +%endif %ifidn %1, b_32x32 pabsw m14, m14 pabsw m13, m13 @@ -150,8 +237,28 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ psignw m14, m9 psignw m13, m10 %endif +%if CONFIG_VP9_HIGHBITDEPTH + ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff + mova m11, m14 + mova m6, m14 + pcmpgtw m5, m14 + punpcklwd m11, m5 + punpckhwd m6, m5 + mova [dqcoeffq+ncoeffq*4+ 0], m11 + mova [dqcoeffq+ncoeffq*4+16], m6 + pxor m5, m5 + mova m11, m13 + mova m6, m13 + pcmpgtw m5, m13 + punpcklwd m11, m5 + punpckhwd m6, m5 + mova [dqcoeffq+ncoeffq*4+32], m11 + mova [dqcoeffq+ncoeffq*4+48], m6 + pxor m5, m5 +%else mova [dqcoeffq+ncoeffq*2+ 0], m14 mova [dqcoeffq+ncoeffq*2+16], m13 +%endif pcmpeqw m14, m5 ; m14 = c[i] == 0 pcmpeqw m13, m5 ; m13 = c[i] == 0 mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i] @@ -168,10 +275,21 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ %ifidn %1, b_32x32 jmp .accumulate_eob .skip_iter: +%if CONFIG_VP9_HIGHBITDEPTH + mova [qcoeffq+ncoeffq*4+ 0], m5 + mova [qcoeffq+ncoeffq*4+16], m5 + mova [qcoeffq+ncoeffq*4+32], m5 + mova [qcoeffq+ncoeffq*4+48], m5 + mova [dqcoeffq+ncoeffq*4+ 0], m5 + mova [dqcoeffq+ncoeffq*4+16], m5 + mova [dqcoeffq+ncoeffq*4+32], m5 + mova [dqcoeffq+ncoeffq*4+48], m5 +%else mova [qcoeffq+ncoeffq*2+ 0], m5 mova [qcoeffq+ncoeffq*2+16], m5 mova [dqcoeffq+ncoeffq*2+ 0], m5 mova [dqcoeffq+ncoeffq*2+16], m5 +%endif add ncoeffq, mmsize jl .ac_only_loop %endif @@ -196,15 +314,31 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ mov r2, qcoeffmp mov r3, eobmp DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob +%if CONFIG_VP9_HIGHBITDEPTH + lea dqcoeffq, [dqcoeffq+ncoeffq*4] + lea qcoeffq, [ qcoeffq+ncoeffq*4] +%else lea dqcoeffq, [dqcoeffq+ncoeffq*2] lea qcoeffq, [ qcoeffq+ncoeffq*2] +%endif neg ncoeffq pxor m7, m7 .blank_loop: +%if CONFIG_VP9_HIGHBITDEPTH + mova [dqcoeffq+ncoeffq*4+ 0], m7 + mova [dqcoeffq+ncoeffq*4+16], m7 + mova [dqcoeffq+ncoeffq*4+32], m7 + mova [dqcoeffq+ncoeffq*4+48], m7 + mova [qcoeffq+ncoeffq*4+ 0], m7 + mova [qcoeffq+ncoeffq*4+16], m7 + mova [qcoeffq+ncoeffq*4+32], m7 + mova [qcoeffq+ncoeffq*4+48], m7 +%else mova [dqcoeffq+ncoeffq*2+ 0], m7 mova [dqcoeffq+ncoeffq*2+16], m7 mova [qcoeffq+ncoeffq*2+ 0], m7 mova [qcoeffq+ncoeffq*2+16], m7 +%endif add ncoeffq, mmsize jl .blank_loop mov word [eobq], 0 -- 2.40.0