From: Fiona Glaser Date: Fri, 21 Nov 2008 04:11:14 +0000 (-0800) Subject: A few tweaks to decimate asm X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=7df060bedbc72232fdf48869cea47bcd480e8eda;p=libx264 A few tweaks to decimate asm A little bit faster on both 32-bit and 64-bit --- diff --git a/common/x86/quant-a.asm b/common/x86/quant-a.asm index f89eaf69..80cf5b57 100644 --- a/common/x86/quant-a.asm +++ b/common/x86/quant-a.asm @@ -521,16 +521,14 @@ cglobal x264_decimate_score64_%1, 1,4 DECIMATE_MASK r2d, r3d, r0+96, m7, %1, null shl r2, 48 or r1, r2 - not r1 - test r1, r1 + xor r1, -1 je .ret or eax, r3d jne .ret9 .loop: bsf rcx, r1 shr r1, cl - movzx ecx, byte [table + rcx] - add eax, ecx + add al, byte [table + rcx] shr r1, 1 jne .loop .ret: @@ -557,28 +555,33 @@ cglobal x264_decimate_score64_%1, 1,5 DECIMATE_MASK r1, r0, r0+96, m7, %1, r5 shl r1, 16 or r4, r1 - not r3 - not r4 - mov r1, r3 - or r1, r4 - je .ret + xor r3, -1 + je .tryret + xor r4, -1 +.cont or r0, r2 - jne .ret9 ;r2 is zero at this point, so we don't need to zero it + jne .ret9 ;r0 is zero at this point, so we don't need to zero it .loop: bsf ecx, r3 test r3, r3 je .largerun shrd r3, r4, cl shr r4, cl - movzx ecx, byte [x264_decimate_table8 + ecx] - add r0, ecx + add r0b, byte [x264_decimate_table8 + ecx] shrd r3, r4, 1 shr r4, 1 - mov r2, r3 - or r2, r4 + cmp r0, 6 ;score64's threshold is never higher than 6 + jge .ret9 ;this early termination is only useful on 32-bit because it can be done in the latency after shrd + test r3, r3 + jne .loop + test r4, r4 jne .loop .ret: REP_RET +.tryret: + xor r4, -1 + jne .cont + REP_RET .ret9: mov eax, 9 RET diff --git a/tools/checkasm.c b/tools/checkasm.c index f7366b99..fb33a604 100644 --- a/tools/checkasm.c +++ b/tools/checkasm.c @@ -1078,7 +1078,7 @@ static int check_quant( int cpu_ref, int cpu_new ) } report( "denoise dct :" ); -#define TEST_DECIMATE( qname, decname, block, w, ac ) \ +#define TEST_DECIMATE( qname, decname, block, w, ac, thresh ) \ if( qf_a.decname != qf_ref.decname ) \ { \ set_func_name( #decname ); \ @@ -1093,7 +1093,7 @@ static int check_quant( int cpu_ref, int cpu_new ) memcpy( dct2, dct1, w*w*2 ); \ result_c = call_c1( qf_c.decname, (void*)dct2 ); \ result_a = call_a1( qf_a.decname, (void*)dct2 ); \ - if( result_c != result_a ) \ + if( X264_MIN(result_c,thresh) != X264_MIN(result_a,thresh) ) \ { \ ok = 0; \ fprintf( stderr, #decname ": [FAILED]\n" ); \ @@ -1104,9 +1104,9 @@ static int check_quant( int cpu_ref, int cpu_new ) } \ } - TEST_DECIMATE( quant_8x8, decimate_score64, CQM_8IY, 8, 0 ); - TEST_DECIMATE( quant_4x4, decimate_score16, CQM_4IY, 4, 0 ); - TEST_DECIMATE( quant_4x4, decimate_score15, CQM_4IY, 4, 1 ); + TEST_DECIMATE( quant_8x8, decimate_score64, CQM_8IY, 8, 0, 6 ); + TEST_DECIMATE( quant_4x4, decimate_score16, CQM_4IY, 4, 0, 6 ); + TEST_DECIMATE( quant_4x4, decimate_score15, CQM_4IY, 4, 1, 7 ); report( "decimate_score :" ); return ret;