High bit depth SSE2/SSE4 assembly for dequant_4x4, dequant_8x8 and dequant_4x4dc. Patch from Google Code-In.
pf->quant_8x8 = x264_quant_8x8_sse2;
pf->quant_2x2_dc = x264_quant_2x2_dc_sse2;
pf->quant_4x4_dc = x264_quant_4x4_dc_sse2;
+ pf->dequant_4x4 = x264_dequant_4x4_sse2;
+ pf->dequant_8x8 = x264_dequant_8x8_sse2;
+ pf->dequant_4x4_dc = x264_dequant_4x4dc_sse2;
pf->denoise_dct = x264_denoise_dct_sse2;
pf->decimate_score15 = x264_decimate_score15_sse2;
pf->decimate_score16 = x264_decimate_score16_sse2;
;;; %1 dct[y][x]
;;; %2,%3 dequant_mf[i_mf][y][x]
;;; m2 i_qbits
-
mova m0, %2
+%ifdef HIGH_BIT_DEPTH
+ pmaddwd m0, %1
+ pslld m0, m2
+%else
packssdw m0, %3
pmullw m0, %1
psllw m0, m2
+%endif
mova %1, m0
%endmacro
;;; m2 -i_qbits
;;; m3 f
;;; m4 0
-
mova m0, %1
+%ifdef HIGH_BIT_DEPTH
+ pmaddwd m0, %2
+ paddd m0, m3
+ psrad m0, m2
+%else
mova m1, m0
punpcklwd m0, m4
punpckhwd m1, m4
psrad m0, m2
psrad m1, m2
packssdw m0, m1
+%endif
mova %1, m0
%endmacro
%if 8*(%2-2*%3)
mov t0d, 8*(%2-2*%3)
%%loop:
- %1 [r0+t0+8*%3], [r1+t0*2+16*%3], [r1+t0*2+24*%3]
- %1 [r0+t0 ], [r1+t0*2 ], [r1+t0*2+ 8*%3]
+ %1 [r0+(t0 )*SIZEOF_PIXEL], [r1+t0*2 ], [r1+t0*2+ 8*%3]
+ %1 [r0+(t0+8*%3)*SIZEOF_PIXEL], [r1+t0*2+16*%3], [r1+t0*2+24*%3]
sub t0d, 16*%3
jge %%loop
REP_RET
%else
- %1 [r0+8*%3], [r1+16*%3], [r1+24*%3]
- %1 [r0 ], [r1 ], [r1+ 8*%3]
+ %1 [r0+(8*%3)*SIZEOF_PIXEL], [r1+16*%3], [r1+24*%3]
+ %1 [r0+(0 )*SIZEOF_PIXEL], [r1+0 ], [r1+ 8*%3]
RET
%endif
%endmacro
%endmacro
;-----------------------------------------------------------------------------
-; void dequant_4x4( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp )
+; void dequant_4x4( dctcoef dct[4][4], int dequant_mf[6][4][4], int i_qp )
;-----------------------------------------------------------------------------
; DEQUANT: emits the function dequant_<%2>x<%2>_<%1>.
;   %1  suffix appended to the symbol name (mmx / sse2 / sse4 at the
;       instantiations that follow the macro)
;   %2  block dimension used twice in the symbol name (4 or 8 at the
;       instantiations)
;   %3  forwarded to DEQUANT_START as (%3+2, %3)
;   %4  not referenced in the lines shown here
; NOTE(review): this text is a patch excerpt. Lines starting with - / + are
; removed / added lines; the macro body between DEQUANT_START and RET appears
; to be omitted between hunks -- confirm against the full file.
%macro DEQUANT 4
-cglobal dequant_%2x%2_%1, 0,3
; The added form passes a third count to cglobal: 6*(mmsize/16), i.e. 6 when
; mmsize is 16 and 0 when mmsize is 8 (integer division). Presumably this is
; the vector-register count expected by cglobal -- TODO confirm in x86inc.
+cglobal dequant_%2x%2_%1, 0,3,6*(mmsize/16)
.skip_prologue:
DEQUANT_START %3+2, %3
RET
%endmacro ; DEQUANT
+%ifdef HIGH_BIT_DEPTH
+INIT_XMM
+DEQUANT sse2, 4, 4, 1
+DEQUANT sse4, 4, 4, 1
+DEQUANT sse2, 8, 6, 1
+DEQUANT sse4, 8, 6, 1
+%else
%ifndef ARCH_X86_64
INIT_MMX
DEQUANT mmx, 4, 4, 1
INIT_XMM
DEQUANT sse2, 4, 4, 2
DEQUANT sse2, 8, 6, 2
+%endif
; DEQUANT_DC: emits the function dequant_4x4dc_<%1>.
;   %1  suffix appended to the symbol name (mmxext / sse2 / sse4 at the
;       instantiations that follow the macro)
;   %2  (added by this patch) letter spliced into the constant name p%2_1
;       that .rshift32 loads; the instantiations pass w or d
; NOTE(review): this text is a patch excerpt. Lines starting with - / + are
; removed / added lines; unmarked lines are context, and unchanged lines
; between hunks appear to be omitted -- read the non-HIGH_BIT_DEPTH paths
; against the full file before relying on them.
-%macro DEQUANT_DC 1
-cglobal dequant_4x4dc_%1, 0,3
+%macro DEQUANT_DC 2
+cglobal dequant_4x4dc_%1, 0,3,6*(mmsize/16)
; DEQUANT_START is defined outside this excerpt; presumably it prepares
; r0, r1 and t0d and selects between .lshift and .rshift32 -- TODO confirm.
DEQUANT_START 6, 6
.lshift:
; m3 = dword at [r1], shifted left by the count held in t0d (moved into m2).
- movd m3, [r1]
- movd m2, t0d
- pslld m3, m2
+ movd m3, [r1]
+ movd m2, t0d
+ pslld m3, m2
+%ifdef HIGH_BIT_DEPTH
; HIGH_BIT_DEPTH path: copy dword 0 of m3 into every dword lane, then walk
; the buffer at r0 in place. Each %rep iteration loads two mmsize-byte
; vectors, applies pmaddwd by m3, and stores them back; x advances by
; 2*mmsize bytes per iteration.
+ pshufd m3, m3, 0
+%assign x 0
+%rep SIZEOF_PIXEL*16/mmsize
+ mova m0, [r0+mmsize*0+x]
+ mova m1, [r0+mmsize*1+x]
+ pmaddwd m0, m3
+ pmaddwd m1, m3
+ mova [r0+mmsize*0+x], m0
+ mova [r0+mmsize*1+x], m1
+%assign x x+mmsize*2
+%endrep
+
+%else ; !HIGH_BIT_DEPTH
; Non-HIGH_BIT_DEPTH path (context lines only; not every original line is
; visible in this excerpt): shuffles of m3 followed by a pmullw-based loop.
%if mmsize==16
pshuflw m3, m3, 0
punpcklqdq m3, m3
pshufw m3, m3, 0
%endif
%assign x 0
; The patch scales the repeat count by SIZEOF_PIXEL.
-%rep 16/mmsize
+%rep SIZEOF_PIXEL*16/mmsize
mova m0, [r0+mmsize*0+x]
mova m1, [r0+mmsize*1+x]
pmullw m0, m3
mova [r0+mmsize*1+x], m1
%assign x x+mmsize*2
%endrep
+%endif ; HIGH_BIT_DEPTH
RET
.rshift32:
; t0d is negated and moved into m3, which is used as the shift count below.
neg t0d
movd m3, t0d
; The patch replaces the fixed constant pw_1 with p%2_1, so the constant is
; chosen by the macro's second argument. Both constants are defined outside
; this excerpt; presumably they are vectors of ones -- TODO confirm.
- mova m4, [pw_1]
+ mova m4, [p%2_1]
; m5 keeps the unshifted constant; m4 becomes (constant << m3) >> 1 per dword
; lane and is the addend applied before the right shift in the loop below.
mova m5, m4
pslld m4, m3
psrld m4, 1
movd m2, [r1]
+%assign x 0
+%ifdef HIGH_BIT_DEPTH
; HIGH_BIT_DEPTH path: copy dword 0 of m2 into every dword lane. For each
; mmsize-byte vector at r0: pmaddwd by m2, add m4, arithmetic right shift by
; m3, store back in place.
+ pshufd m2, m2, 0
+%rep SIZEOF_PIXEL*32/mmsize
+ mova m0, [r0+x]
+ pmaddwd m0, m2
+ paddd m0, m4
+ psrad m0, m3
+ mova [r0+x], m0
+%assign x x+mmsize
+%endrep
+
+%else
; Non-HIGH_BIT_DEPTH path (context lines only; not every original line is
; visible in this excerpt): m2 is shuffled and then interleaved word-wise
; with m4, and the loaded data is interleaved word-wise with m5.
%if mmsize==8
punpcklwd m2, m2
%else
pshuflw m2, m2, 0
%endif
punpcklwd m2, m4
; The patch moves "%assign x 0" above the %ifdef and scales the repeat count
; by SIZEOF_PIXEL.
-%assign x 0
-%rep 32/mmsize
+%rep SIZEOF_PIXEL*32/mmsize
mova m0, [r0+x]
mova m1, m0
punpcklwd m0, m5
mova [r0+x], m0
%assign x x+mmsize
%endrep
+%endif
RET
%endmacro
+%ifdef HIGH_BIT_DEPTH
+INIT_XMM
+DEQUANT_DC sse2 , d
+DEQUANT_DC sse4 , d
+%else
INIT_MMX
-DEQUANT_DC mmxext
+DEQUANT_DC mmxext, w
INIT_XMM
-DEQUANT_DC sse2
+DEQUANT_DC sse2 , w
+%endif
%ifdef HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
void x264_dequant_4x4_mmx( int16_t dct[16], int dequant_mf[6][16], int i_qp );
void x264_dequant_4x4dc_mmxext( int16_t dct[16], int dequant_mf[6][16], int i_qp );
void x264_dequant_8x8_mmx( int16_t dct[64], int dequant_mf[6][64], int i_qp );
-void x264_dequant_4x4_sse2( int16_t dct[16], int dequant_mf[6][16], int i_qp );
-void x264_dequant_4x4dc_sse2( int16_t dct[16], int dequant_mf[6][16], int i_qp );
-void x264_dequant_8x8_sse2( int16_t dct[64], int dequant_mf[6][64], int i_qp );
+void x264_dequant_4x4_sse2( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
+void x264_dequant_4x4dc_sse2( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
+void x264_dequant_8x8_sse2( dctcoef dct[64], int dequant_mf[6][64], int i_qp );
void x264_dequant_4x4_flat16_mmx( int16_t dct[16], int dequant_mf[6][16], int i_qp );
void x264_dequant_8x8_flat16_mmx( int16_t dct[64], int dequant_mf[6][64], int i_qp );
void x264_dequant_4x4_flat16_sse2( int16_t dct[16], int dequant_mf[6][16], int i_qp );
for( int i = 0; i < 16 && ok; i++ )\
{\
for( int j = 0; j < 16; j++ )\
- dct1[0][j] = !i ? (j^j>>1^j>>2^j>>3)&1 ? 4080 : -4080 /* max dc */\
- : i<8 ? (*p++)&1 ? 4080 : -4080 /* max elements */\
+ dct1[0][j] = !i ? (j^j>>1^j>>2^j>>3)&1 ? PIXEL_MAX*16 : -PIXEL_MAX*16 /* max dc */\
+ : i<8 ? (*p++)&1 ? PIXEL_MAX*16 : -PIXEL_MAX*16 /* max elements */\
: ((*p++)&0x1fff)-0x1000; /* general case */\
memcpy( dct2, dct1, 16 * sizeof(dctcoef) );\
call_c1( dct_c.name, dct1[0] );\
for( int qp = QP_MAX; qp > 0; qp-- ) \
{ \
for( int i = 0; i < 16; i++ ) \
- dct1[i] = rand(); \
+ dct1[i] = rand()%(PIXEL_MAX*16*2+1) - PIXEL_MAX*16; \
call_c1( qf_c.qname, dct1, h->quant##w##_mf[block][qp][0]>>1, h->quant##w##_bias[block][qp][0]>>1 ); \
memcpy( dct2, dct1, w*w*sizeof(dctcoef) ); \
call_c1( qf_c.dqname, dct1, h->dequant##w##_mf[block], qp ); \