if ((vpx_config("CONFIG_VP9_ENCODER") eq "yes") || (vpx_config("CONFIG_VP10_ENCODER") eq "yes")) {
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vpx_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
- specialize qw/vpx_quantize_b sse2/;
+ specialize qw/vpx_quantize_b sse2/, "$ssse3_x86_64_x86inc";
add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
- specialize qw/vpx_quantize_b_32x32/;
+ specialize qw/vpx_quantize_b_32x32/, "$ssse3_x86_64_x86inc";
add_proto qw/void vpx_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vpx_highbd_quantize_b sse2/;
%endif
pxor m5, m5 ; m5 = dedicated zero
DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, eob
+%if CONFIG_VP9_HIGHBITDEPTH
+ lea coeffq, [ coeffq+ncoeffq*4] ; high bitdepth: coeffs are 32bit, so scale by 4 to reach the end of the array
+ lea qcoeffq, [ qcoeffq+ncoeffq*4] ; end of qcoeff (32bit elements)
+ lea dqcoeffq, [dqcoeffq+ncoeffq*4] ; end of dqcoeff (32bit elements)
+%else
lea coeffq, [ coeffq+ncoeffq*2] ; 16bit elements: scale by 2 to reach the end of the array
- lea iscanq, [ iscanq+ncoeffq*2]
lea qcoeffq, [ qcoeffq+ncoeffq*2]
lea dqcoeffq, [dqcoeffq+ncoeffq*2]
+%endif
+ lea iscanq, [ iscanq+ncoeffq*2] ; iscan keeps a 2-byte stride in both configurations
neg ncoeffq ; negative index: arrays are addressed as end + ncoeff*size
; get DC and first 15 AC coeffs
+%if CONFIG_VP9_HIGHBITDEPTH
+ ; coeffs are stored as 32bit numbers but the math below works on 16bit words
+ mova m9, [ coeffq+ncoeffq*4+ 0] ; coeffs 0..3 (dwords)
+ mova m6, [ coeffq+ncoeffq*4+16] ; coeffs 4..7 (dwords)
+ mova m10, [ coeffq+ncoeffq*4+32] ; coeffs 8..11 (dwords)
+ mova m11, [ coeffq+ncoeffq*4+48] ; coeffs 12..15 (dwords)
+ packssdw m9, m6 ; m9 = c[i], narrowed to words with signed saturation
+ packssdw m10, m11 ; m10 = c[i], narrowed to words with signed saturation
+%else
mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i]
mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i]
+%endif
pabsw m6, m9 ; m6 = abs(m9)
pabsw m11, m10 ; m11 = abs(m10)
pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin
psignw m13, m10 ; m13 = reinsert sign
pand m8, m7
pand m13, m12
+%if CONFIG_VP9_HIGHBITDEPTH
+ ; sign-extend the 16bit results to 32bit and store them in the array pointed to by qcoeff
+ mova m11, m8
+ mova m6, m8
+ pcmpgtw m5, m8 ; m5 = (0 > m8): all-ones in negative lanes (relies on m5 being zero here)
+ punpcklwd m11, m5 ; low 4 words interleaved with their sign words -> 4 dwords
+ punpckhwd m6, m5 ; high 4 words interleaved with their sign words -> 4 dwords
+ mova [qcoeffq+ncoeffq*4+ 0], m11
+ mova [qcoeffq+ncoeffq*4+16], m6
+ pxor m5, m5 ; clear m5 before building the next sign mask
+ mova m11, m13
+ mova m6, m13
+ pcmpgtw m5, m13 ; m5 = (0 > m13): sign mask for the second 8 words
+ punpcklwd m11, m5
+ punpckhwd m6, m5
+ mova [qcoeffq+ncoeffq*4+32], m11
+ mova [qcoeffq+ncoeffq*4+48], m6
+ pxor m5, m5 ; reset m5 to zero register
+%else
mova [qcoeffq+ncoeffq*2+ 0], m8 ; 16bit build: store the words directly
mova [qcoeffq+ncoeffq*2+16], m13
+%endif
%ifidn %1, b_32x32
pabsw m8, m8
pabsw m13, m13
psignw m8, m9
psignw m13, m10
%endif
+%if CONFIG_VP9_HIGHBITDEPTH
+ ; sign-extend the 16bit results to 32bit and store them in the array pointed to by dqcoeff
+ mova m11, m8
+ mova m6, m8
+ pcmpgtw m5, m8 ; m5 = (0 > m8): all-ones in negative lanes (relies on m5 being zero here)
+ punpcklwd m11, m5 ; low 4 words interleaved with their sign words -> 4 dwords
+ punpckhwd m6, m5 ; high 4 words interleaved with their sign words -> 4 dwords
+ mova [dqcoeffq+ncoeffq*4+ 0], m11
+ mova [dqcoeffq+ncoeffq*4+16], m6
+ pxor m5, m5 ; clear m5 before building the next sign mask
+ mova m11, m13
+ mova m6, m13
+ pcmpgtw m5, m13 ; m5 = (0 > m13): sign mask for the second 8 words
+ punpcklwd m11, m5
+ punpckhwd m6, m5
+ mova [dqcoeffq+ncoeffq*4+32], m11
+ mova [dqcoeffq+ncoeffq*4+48], m6
+ pxor m5, m5 ; reset m5 to zero register
+%else
mova [dqcoeffq+ncoeffq*2+ 0], m8 ; 16bit build: store the words directly
mova [dqcoeffq+ncoeffq*2+16], m13
+%endif
pcmpeqw m8, m5 ; m8 = c[i] == 0
pcmpeqw m13, m5 ; m13 = c[i] == 0
mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = iscan[i]
jz .accumulate_eob
.ac_only_loop:
+%if CONFIG_VP9_HIGHBITDEPTH
+ ; pack coeff from 32bit values down to 16bit words
+ mova m9, [ coeffq+ncoeffq*4+ 0] ; coeffs 0..3 of this group (dwords)
+ mova m6, [ coeffq+ncoeffq*4+16] ; coeffs 4..7 (dwords)
+ mova m10, [ coeffq+ncoeffq*4+32] ; coeffs 8..11 (dwords)
+ mova m11, [ coeffq+ncoeffq*4+48] ; coeffs 12..15 (dwords)
+ packssdw m9, m6 ; m9 = c[i], narrowed to words with signed saturation
+ packssdw m10, m11 ; m10 = c[i], narrowed to words with signed saturation
+%else
mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i]
mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i]
+%endif
pabsw m6, m9 ; m6 = abs(m9)
pabsw m11, m10 ; m11 = abs(m10)
pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin
psignw m13, m10 ; m13 = reinsert sign
pand m14, m7
pand m13, m12
+%if CONFIG_VP9_HIGHBITDEPTH
+ ; sign-extend the 16bit results to 32bit and store them in the array pointed to by qcoeff
+ mova m11, m14 ; m11 is fully overwritten here, so no prior clearing is needed
+ mova m6, m14
+ pcmpgtw m5, m14 ; m5 = (0 > m14): all-ones in negative lanes (relies on m5 being zero here)
+ punpcklwd m11, m5 ; low 4 words interleaved with their sign words -> 4 dwords
+ punpckhwd m6, m5 ; high 4 words interleaved with their sign words -> 4 dwords
+ mova [qcoeffq+ncoeffq*4+ 0], m11
+ mova [qcoeffq+ncoeffq*4+16], m6
+ pxor m5, m5 ; clear m5 before building the next sign mask
+ mova m11, m13
+ mova m6, m13
+ pcmpgtw m5, m13 ; m5 = (0 > m13): sign mask for the second 8 words
+ punpcklwd m11, m5
+ punpckhwd m6, m5
+ mova [qcoeffq+ncoeffq*4+32], m11
+ mova [qcoeffq+ncoeffq*4+48], m6
+ pxor m5, m5 ; reset m5 to zero register
+%else
mova [qcoeffq+ncoeffq*2+ 0], m14 ; 16bit build: store the words directly
mova [qcoeffq+ncoeffq*2+16], m13
+%endif
%ifidn %1, b_32x32
pabsw m14, m14
pabsw m13, m13
psignw m14, m9
psignw m13, m10
%endif
+%if CONFIG_VP9_HIGHBITDEPTH
+ ; sign-extend the 16bit results to 32bit and store them in the array pointed to by dqcoeff
+ mova m11, m14
+ mova m6, m14
+ pcmpgtw m5, m14 ; m5 = (0 > m14): all-ones in negative lanes (relies on m5 being zero here)
+ punpcklwd m11, m5 ; low 4 words interleaved with their sign words -> 4 dwords
+ punpckhwd m6, m5 ; high 4 words interleaved with their sign words -> 4 dwords
+ mova [dqcoeffq+ncoeffq*4+ 0], m11
+ mova [dqcoeffq+ncoeffq*4+16], m6
+ pxor m5, m5 ; clear m5 before building the next sign mask
+ mova m11, m13
+ mova m6, m13
+ pcmpgtw m5, m13 ; m5 = (0 > m13): sign mask for the second 8 words
+ punpcklwd m11, m5
+ punpckhwd m6, m5
+ mova [dqcoeffq+ncoeffq*4+32], m11
+ mova [dqcoeffq+ncoeffq*4+48], m6
+ pxor m5, m5 ; reset m5 to zero register
+%else
mova [dqcoeffq+ncoeffq*2+ 0], m14 ; 16bit build: store the words directly
mova [dqcoeffq+ncoeffq*2+16], m13
+%endif
pcmpeqw m14, m5 ; m14 = c[i] == 0
pcmpeqw m13, m5 ; m13 = c[i] == 0
mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = iscan[i]
%ifidn %1, b_32x32
jmp .accumulate_eob
.skip_iter:
+%if CONFIG_VP9_HIGHBITDEPTH
+ mova [qcoeffq+ncoeffq*4+ 0], m5 ; m5 is the zero register: clear this group of 32bit qcoeff outputs
+ mova [qcoeffq+ncoeffq*4+16], m5
+ mova [qcoeffq+ncoeffq*4+32], m5
+ mova [qcoeffq+ncoeffq*4+48], m5
+ mova [dqcoeffq+ncoeffq*4+ 0], m5 ; and the matching 32bit dqcoeff outputs
+ mova [dqcoeffq+ncoeffq*4+16], m5
+ mova [dqcoeffq+ncoeffq*4+32], m5
+ mova [dqcoeffq+ncoeffq*4+48], m5
+%else
mova [qcoeffq+ncoeffq*2+ 0], m5 ; m5 is the zero register: clear this group of 16bit qcoeff outputs
mova [qcoeffq+ncoeffq*2+16], m5
mova [dqcoeffq+ncoeffq*2+ 0], m5 ; and the matching 16bit dqcoeff outputs
mova [dqcoeffq+ncoeffq*2+16], m5
+%endif
add ncoeffq, mmsize ; advance the (negative) index by mmsize coefficients
jl .ac_only_loop ; index still negative: more coefficients remain
%endif
mov r2, qcoeffmp ; r2 is named qcoeff by the DEFINE_ARGS below
mov r3, eobmp ; r3 is named eob by the DEFINE_ARGS below
DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob
+%if CONFIG_VP9_HIGHBITDEPTH
+ lea dqcoeffq, [dqcoeffq+ncoeffq*4] ; 32bit outputs: scale by 4 to reach the end of the array
+ lea qcoeffq, [ qcoeffq+ncoeffq*4]
+%else
lea dqcoeffq, [dqcoeffq+ncoeffq*2] ; 16bit outputs: scale by 2 to reach the end of the array
lea qcoeffq, [ qcoeffq+ncoeffq*2]
+%endif
neg ncoeffq ; negative index: arrays are addressed as end + ncoeff*size
pxor m7, m7 ; m7 = zero, the value written by the loop below
.blank_loop:
+%if CONFIG_VP9_HIGHBITDEPTH
+ mova [dqcoeffq+ncoeffq*4+ 0], m7 ; zero this group of 32bit dqcoeff outputs
+ mova [dqcoeffq+ncoeffq*4+16], m7
+ mova [dqcoeffq+ncoeffq*4+32], m7
+ mova [dqcoeffq+ncoeffq*4+48], m7
+ mova [qcoeffq+ncoeffq*4+ 0], m7 ; zero this group of 32bit qcoeff outputs
+ mova [qcoeffq+ncoeffq*4+16], m7
+ mova [qcoeffq+ncoeffq*4+32], m7
+ mova [qcoeffq+ncoeffq*4+48], m7
+%else
mova [dqcoeffq+ncoeffq*2+ 0], m7 ; zero this group of 16bit dqcoeff outputs
mova [dqcoeffq+ncoeffq*2+16], m7
mova [qcoeffq+ncoeffq*2+ 0], m7 ; zero this group of 16bit qcoeff outputs
mova [qcoeffq+ncoeffq*2+16], m7
+%endif
add ncoeffq, mmsize ; advance the (negative) index by mmsize coefficients
jl .blank_loop ; index still negative: more outputs to clear
mov word [eobq], 0 ; *eob = 0