From: Daniel Kang Date: Wed, 8 Dec 2010 22:56:22 +0000 (-0500) Subject: SSE2 high bit depth dequant functions X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=fd8cfd445016db99a99b7a4d3769e52599aeda0e;p=libx264 SSE2 high bit depth dequant functions Patch from Google Code-In. --- diff --git a/common/quant.c b/common/quant.c index baf7a80c..416b88fc 100644 --- a/common/quant.c +++ b/common/quant.c @@ -322,6 +322,9 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf ) pf->quant_8x8 = x264_quant_8x8_sse2; pf->quant_2x2_dc = x264_quant_2x2_dc_sse2; pf->quant_4x4_dc = x264_quant_4x4_dc_sse2; + pf->dequant_4x4 = x264_dequant_4x4_sse2; + pf->dequant_8x8 = x264_dequant_8x8_sse2; + pf->dequant_4x4_dc = x264_dequant_4x4dc_sse2; pf->denoise_dct = x264_denoise_dct_sse2; pf->decimate_score15 = x264_decimate_score15_sse2; pf->decimate_score16 = x264_decimate_score16_sse2; diff --git a/common/x86/quant-a.asm b/common/x86/quant-a.asm index 8244e4ed..1a5cc92b 100644 --- a/common/x86/quant-a.asm +++ b/common/x86/quant-a.asm @@ -473,11 +473,15 @@ QUANT_AC quant_8x8_sse4, 8 ;;; %1 dct[y][x] ;;; %2,%3 dequant_mf[i_mf][y][x] ;;; m2 i_qbits - mova m0, %2 +%ifdef HIGH_BIT_DEPTH + pmaddwd m0, %1 + pslld m0, m2 +%else packssdw m0, %3 pmullw m0, %1 psllw m0, m2 +%endif mova %1, m0 %endmacro @@ -487,8 +491,12 @@ QUANT_AC quant_8x8_sse4, 8 ;;; m2 -i_qbits ;;; m3 f ;;; m4 0 - mova m0, %1 +%ifdef HIGH_BIT_DEPTH + pmaddwd m0, %2 + paddd m0, m3 + psrad m0, m2 +%else mova m1, m0 punpcklwd m0, m4 punpckhwd m1, m4 @@ -499,6 +507,7 @@ QUANT_AC quant_8x8_sse4, 8 psrad m0, m2 psrad m1, m2 packssdw m0, m1 +%endif mova %1, m0 %endmacro @@ -506,14 +515,14 @@ QUANT_AC quant_8x8_sse4, 8 %if 8*(%2-2*%3) mov t0d, 8*(%2-2*%3) %%loop: - %1 [r0+t0+8*%3], [r1+t0*2+16*%3], [r1+t0*2+24*%3] - %1 [r0+t0 ], [r1+t0*2 ], [r1+t0*2+ 8*%3] + %1 [r0+(t0 )*SIZEOF_PIXEL], [r1+t0*2 ], [r1+t0*2+ 8*%3] + %1 [r0+(t0+8*%3)*SIZEOF_PIXEL], [r1+t0*2+16*%3], [r1+t0*2+24*%3] sub t0d, 16*%3 jge %%loop REP_RET %else - %1 [r0+8*%3], [r1+16*%3], [r1+24*%3] - %1 [r0 ], [r1 ], [r1+ 8*%3] + %1 [r0+(8*%3)*SIZEOF_PIXEL], [r1+16*%3], [r1+24*%3] + %1 [r0+(0 )*SIZEOF_PIXEL], [r1+0 ], [r1+ 8*%3] RET %endif %endmacro @@ -562,10 +571,10 @@ QUANT_AC quant_8x8_sse4, 8 %endmacro ;----------------------------------------------------------------------------- -; void dequant_4x4( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp ) +; void dequant_4x4( dctcoef dct[4][4], int dequant_mf[6][4][4], int i_qp ) ;----------------------------------------------------------------------------- %macro DEQUANT 4 -cglobal dequant_%2x%2_%1, 0,3 +cglobal dequant_%2x%2_%1, 0,3,6*(mmsize/16) .skip_prologue: DEQUANT_START %3+2, %3 @@ -623,6 +632,13 @@ cglobal dequant_%2x%2_flat16_%1, 0,3 RET %endmacro ; DEQUANT +%ifdef HIGH_BIT_DEPTH +INIT_XMM +DEQUANT sse2, 4, 4, 1 +DEQUANT sse4, 4, 4, 1 +DEQUANT sse2, 8, 6, 1 +DEQUANT sse4, 8, 6, 1 +%else %ifndef ARCH_X86_64 INIT_MMX DEQUANT mmx, 4, 4, 1 @@ -631,15 +647,30 @@ DEQUANT mmx, 8, 6, 1 INIT_XMM DEQUANT sse2, 4, 4, 2 DEQUANT sse2, 8, 6, 2 +%endif -%macro DEQUANT_DC 1 -cglobal dequant_4x4dc_%1, 0,3 +%macro DEQUANT_DC 2 +cglobal dequant_4x4dc_%1, 0,3,6*(mmsize/16) DEQUANT_START 6, 6 .lshift: - movd m3, [r1] - movd m2, t0d - pslld m3, m2 + movd m3, [r1] + movd m2, t0d + pslld m3, m2 +%ifdef HIGH_BIT_DEPTH + pshufd m3, m3, 0 +%assign x 0 +%rep SIZEOF_PIXEL*16/mmsize + mova m0, [r0+mmsize*0+x] + mova m1, [r0+mmsize*1+x] + pmaddwd m0, m3 + pmaddwd m1, m3 + mova [r0+mmsize*0+x], m0 + mova [r0+mmsize*1+x], m1 +%assign x x+mmsize*2 +%endrep + +%else ; !HIGH_BIT_DEPTH %if mmsize==16 pshuflw m3, m3, 0 punpcklqdq m3, m3 @@ -647,7 +678,7 @@ cglobal dequant_4x4dc_%1, 0,3 pshufw m3, m3, 0 %endif %assign x 0 -%rep 16/mmsize +%rep SIZEOF_PIXEL*16/mmsize mova m0, [r0+mmsize*0+x] mova m1, [r0+mmsize*1+x] pmullw m0, m3 @@ -656,24 +687,37 @@ cglobal dequant_4x4dc_%1, 0,3 mova [r0+mmsize*1+x], m1 %assign x x+mmsize*2 %endrep +%endif ; HIGH_BIT_DEPTH RET .rshift32: neg t0d movd m3, t0d - mova m4, [pw_1] + mova m4, [p%2_1] mova m5, m4 pslld m4, m3 psrld m4, 1 movd m2, [r1] +%assign x 0 +%ifdef HIGH_BIT_DEPTH + pshufd m2, m2, 0 +%rep SIZEOF_PIXEL*32/mmsize + mova m0, [r0+x] + pmaddwd m0, m2 + paddd m0, m4 + psrad m0, m3 + mova [r0+x], m0 +%assign x x+mmsize +%endrep + +%else %if mmsize==8 punpcklwd m2, m2 %else pshuflw m2, m2, 0 %endif punpcklwd m2, m4 -%assign x 0 -%rep 32/mmsize +%rep SIZEOF_PIXEL*32/mmsize mova m0, [r0+x] mova m1, m0 punpcklwd m0, m5 @@ -686,13 +730,20 @@ cglobal dequant_4x4dc_%1, 0,3 mova [r0+x], m0 %assign x x+mmsize %endrep +%endif RET %endmacro +%ifdef HIGH_BIT_DEPTH +INIT_XMM +DEQUANT_DC sse2 , d +DEQUANT_DC sse4 , d +%else INIT_MMX -DEQUANT_DC mmxext +DEQUANT_DC mmxext, w INIT_XMM -DEQUANT_DC sse2 +DEQUANT_DC sse2 , w +%endif %ifdef HIGH_BIT_DEPTH ;----------------------------------------------------------------------------- diff --git a/common/x86/quant.h b/common/x86/quant.h index 3a7e59b9..1a7e8343 100644 --- a/common/x86/quant.h +++ b/common/x86/quant.h @@ -47,9 +47,9 @@ int x264_quant_8x8_sse4( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] ); void x264_dequant_4x4_mmx( int16_t dct[16], int dequant_mf[6][16], int i_qp ); void x264_dequant_4x4dc_mmxext( int16_t dct[16], int dequant_mf[6][16], int i_qp ); void x264_dequant_8x8_mmx( int16_t dct[64], int dequant_mf[6][64], int i_qp ); -void x264_dequant_4x4_sse2( int16_t dct[16], int dequant_mf[6][16], int i_qp ); -void x264_dequant_4x4dc_sse2( int16_t dct[16], int dequant_mf[6][16], int i_qp ); -void x264_dequant_8x8_sse2( int16_t dct[64], int dequant_mf[6][64], int i_qp ); +void x264_dequant_4x4_sse2( dctcoef dct[16], int dequant_mf[6][16], int i_qp ); +void x264_dequant_4x4dc_sse2( dctcoef dct[16], int dequant_mf[6][16], int i_qp ); +void x264_dequant_8x8_sse2( dctcoef dct[64], int dequant_mf[6][64], int i_qp ); void x264_dequant_4x4_flat16_mmx( int16_t dct[16], int dequant_mf[6][16], int i_qp ); void x264_dequant_8x8_flat16_mmx( int16_t dct[64], int dequant_mf[6][64], int i_qp ); void x264_dequant_4x4_flat16_sse2( int16_t dct[16], int dequant_mf[6][16], int i_qp ); diff --git a/tools/checkasm.c b/tools/checkasm.c index 1b45efc5..aa99a285 100644 --- a/tools/checkasm.c +++ b/tools/checkasm.c @@ -677,8 +677,8 @@ static int check_dct( int cpu_ref, int cpu_new ) for( int i = 0; i < 16 && ok; i++ )\ {\ for( int j = 0; j < 16; j++ )\ - dct1[0][j] = !i ? (j^j>>1^j>>2^j>>3)&1 ? 4080 : -4080 /* max dc */\ - : i<8 ? (*p++)&1 ? 4080 : -4080 /* max elements */\ + dct1[0][j] = !i ? (j^j>>1^j>>2^j>>3)&1 ? PIXEL_MAX*16 : -PIXEL_MAX*16 /* max dc */\ + : i<8 ? (*p++)&1 ? PIXEL_MAX*16 : -PIXEL_MAX*16 /* max elements */\ : ((*p++)&0x1fff)-0x1000; /* general case */\ memcpy( dct2, dct1, 16 * sizeof(dctcoef) );\ call_c1( dct_c.name, dct1[0] );\ @@ -1533,7 +1533,7 @@ static int check_quant( int cpu_ref, int cpu_new ) for( int qp = QP_MAX; qp > 0; qp-- ) \ { \ for( int i = 0; i < 16; i++ ) \ - dct1[i] = rand(); \ + dct1[i] = rand()%(PIXEL_MAX*16*2+1) - PIXEL_MAX*16; \ call_c1( qf_c.qname, dct1, h->quant##w##_mf[block][qp][0]>>1, h->quant##w##_bias[block][qp][0]>>1 ); \ memcpy( dct2, dct1, w*w*sizeof(dctcoef) ); \ call_c1( qf_c.dqname, dct1, h->dequant##w##_mf[block], qp ); \