From: Johann Date: Thu, 16 Feb 2017 03:01:38 +0000 (-0800) Subject: quantize_fp highbd ssse3: use tran_low_t for coeff X-Git-Tag: v1.7.0~704^2~1 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=4682130b60dd27ec5a03d13a20e8249156fcd250;p=libvpx quantize_fp highbd ssse3: use tran_low_t for coeff Change-Id: Iebade0efc0efbb0a80a0f3adbef4962e3a2f25e8 --- diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index ddc85ded5..5ba98a278 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -136,7 +136,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vp9_block_error_fp sse2/; add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; - specialize qw/vp9_quantize_fp neon sse2/; + specialize qw/vp9_quantize_fp neon sse2/, "$ssse3_x86_64"; add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; diff --git a/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm b/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm index ec61c0c3a..c9f0cbce4 100644 --- a/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm +++ b/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm @@ -11,6 +11,7 @@ %define private_prefix vp9 %include "third_party/x86inc/x86inc.asm" +%include "vpx_dsp/x86/bitdepth_conversion_sse2.asm" SECTION_RODATA pw_1: times 8 dw 1 @@ -48,15 +49,15 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ %endif pxor m5, m5 
; m5 = dedicated zero - lea coeffq, [ coeffq+ncoeffq*2] - lea r5q, [ r5q+ncoeffq*2] - lea r3q, [ r3q+ncoeffq*2] - lea r4q, [r4q+ncoeffq*2] + INCREMENT_ELEMENTS_TRAN_LOW coeffq, ncoeffq + lea r5q, [r5q+ncoeffq*2] + INCREMENT_ELEMENTS_TRAN_LOW r3q, ncoeffq + INCREMENT_ELEMENTS_TRAN_LOW r4q, ncoeffq neg ncoeffq ; get DC and first 15 AC coeffs - mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i] - mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i] + LOAD_TRAN_LOW 9, coeffq, ncoeffq ; m9 = c[i] + LOAD_TRAN_LOW 10, coeffq, ncoeffq + 8 ; m10 = c[i] pabsw m6, m9 ; m6 = abs(m9) pabsw m11, m10 ; m11 = abs(m10) pcmpeqw m7, m7 @@ -69,8 +70,8 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ pmulhw m13, m11, m2 ; m13 = m11*q>>16 psignw m8, m9 ; m8 = reinsert sign psignw m13, m10 ; m13 = reinsert sign - mova [r3q+ncoeffq*2+ 0], m8 - mova [r3q+ncoeffq*2+16], m13 + STORE_TRAN_LOW 8, r3q, ncoeffq, 6, 11, 12 + STORE_TRAN_LOW 13, r3q, ncoeffq + 8, 6, 11, 12 %ifidn %1, fp_32x32 pabsw m8, m8 pabsw m13, m13 @@ -87,8 +88,8 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ %else psrlw m0, m3, 1 %endif - mova [r4q+ncoeffq*2+ 0], m8 - mova [r4q+ncoeffq*2+16], m13 + STORE_TRAN_LOW 8, r4q, ncoeffq, 6, 11, 12 + STORE_TRAN_LOW 13, r4q, ncoeffq + 8, 6, 11, 12 pcmpeqw m8, m5 ; m8 = c[i] == 0 pcmpeqw m13, m5 ; m13 = c[i] == 0 mova m6, [ r5q+ncoeffq*2+ 0] ; m6 = scan[i] @@ -102,8 +103,8 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ jz .accumulate_eob .ac_only_loop: - mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i] - mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i] + LOAD_TRAN_LOW 9, coeffq, ncoeffq ; m9 = c[i] + LOAD_TRAN_LOW 10, coeffq, ncoeffq + 8 ; m10 = c[i] pabsw m6, m9 ; m6 = abs(m9) pabsw m11, m10 ; m11 = abs(m10) @@ -123,8 +124,8 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ pmulhw m13, m11, m2 ; m13 = m11*q>>16 psignw m14, m9 ; m14 = reinsert sign psignw m13, m10 ; m13 = reinsert sign - 
mova [r3q+ncoeffq*2+ 0], m14 - mova [r3q+ncoeffq*2+16], m13 + STORE_TRAN_LOW 14, r3q, ncoeffq, 6, 11, 12 + STORE_TRAN_LOW 13, r3q, ncoeffq + 8, 6, 11, 12 %ifidn %1, fp_32x32 pabsw m14, m14 pabsw m13, m13 @@ -137,8 +138,8 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ psignw m14, m9 psignw m13, m10 %endif - mova [r4q+ncoeffq*2+ 0], m14 - mova [r4q+ncoeffq*2+16], m13 + STORE_TRAN_LOW 14, r4q, ncoeffq, 6, 11, 12 + STORE_TRAN_LOW 13, r4q, ncoeffq + 8, 6, 11, 12 pcmpeqw m14, m5 ; m14 = c[i] == 0 pcmpeqw m13, m5 ; m13 = c[i] == 0 mova m6, [ r5q+ncoeffq*2+ 0] ; m6 = scan[i] @@ -154,10 +155,10 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ jmp .accumulate_eob .skip_iter: - mova [r3q+ncoeffq*2+ 0], m5 - mova [r3q+ncoeffq*2+16], m5 - mova [r4q+ncoeffq*2+ 0], m5 - mova [r4q+ncoeffq*2+16], m5 + STORE_ZERO_TRAN_LOW 5, r3q, ncoeffq + STORE_ZERO_TRAN_LOW 5, r3q, ncoeffq + 8 + STORE_ZERO_TRAN_LOW 5, r4q, ncoeffq + STORE_ZERO_TRAN_LOW 5, r4q, ncoeffq + 8 add ncoeffq, mmsize jl .ac_only_loop @@ -186,10 +187,10 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ neg ncoeffq pxor m7, m7 .blank_loop: - mova [r0q+ncoeffq*2+ 0], m7 - mova [r0q+ncoeffq*2+16], m7 - mova [r2q+ncoeffq*2+ 0], m7 - mova [r2q+ncoeffq*2+16], m7 + STORE_ZERO_TRAN_LOW 7, r0q, ncoeffq + STORE_ZERO_TRAN_LOW 7, r0q, ncoeffq + 8 + STORE_ZERO_TRAN_LOW 7, r2q, ncoeffq + STORE_ZERO_TRAN_LOW 7, r2q, ncoeffq + 8 add ncoeffq, mmsize jl .blank_loop mov word [r3q], 0 diff --git a/vpx_dsp/x86/bitdepth_conversion_sse2.asm b/vpx_dsp/x86/bitdepth_conversion_sse2.asm index 2bcbc0ac1..aacf71f7a 100644 --- a/vpx_dsp/x86/bitdepth_conversion_sse2.asm +++ b/vpx_dsp/x86/bitdepth_conversion_sse2.asm @@ -38,29 +38,53 @@ ; the values down to 16 bits. 
%macro LOAD_TRAN_LOW 3
 %if CONFIG_VP9_HIGHBITDEPTH
-  mova      m%1, [%2 + %3 * 4]
-  packssdw  m%1, [%2 + %3 * 4 + 16]
+  mova      m%1, [%2 + (%3) * 4]
+  packssdw  m%1, [%2 + (%3) * 4 + 16]
 %else
-  mova      m%1, [%2 + %3 * 2]
+  mova      m%1, [%2 + (%3) * 2]
 %endif
 %endmacro
 
 ; Store m%1 to %2 + %3.
 ; %3 is the offset in elements, not bytes.
+; With 5 arguments m%1 is clobbered in the high bit depth configuration.
+; With 6 arguments m%1 is preserved; m%6 is used as a third scratch register.
 ; If tran_low_t is 16 bits (low bit depth configuration) then store the value
 ; directly. If tran_low_t is 32 bits (high bit depth configuration) then sign
 ; extend the values first.
 ; Uses m%4-m%6 as scratch registers for high bit depth.
-%macro STORE_TRAN_LOW 5
+%macro STORE_TRAN_LOW 5-6
 %if CONFIG_VP9_HIGHBITDEPTH
   pxor      m%4, m%4
   mova      m%5, m%1
+  %if %0 == 6
+  mova      m%6, m%1
+  %endif
   pcmpgtw   m%4, m%1
   punpcklwd m%5, m%4
+  %if %0 == 5
   punpckhwd m%1, m%4
-  mova      [%2 + %3 * 4 + 0], m%5
-  mova      [%2 + %3 * 4 + 16], m%1
+  %else
+  punpckhwd m%6, m%4
+  %endif
+  mova      [%2 + (%3) * 4 + 0], m%5
+  %if %0 == 5
+  mova      [%2 + (%3) * 4 + 16], m%1
+  %else
+  mova      [%2 + (%3) * 4 + 16], m%6
+  %endif
 %else
-  mova      [%2 + %3 * 2], m%1
+  mova      [%2 + (%3) * 2], m%1
+%endif
+%endmacro
+
+; Store m%1, which the caller must already have zeroed, to %2 + %3.
+; %3 is the offset in elements, not bytes.
+%macro STORE_ZERO_TRAN_LOW 3
+%if CONFIG_VP9_HIGHBITDEPTH
+  mova      [%2 + (%3) * 4 + 0], m%1
+  mova      [%2 + (%3) * 4 + 16], m%1
+%else
+  mova      [%2 + (%3) * 2], m%1
 %endif
 %endmacro