From: Loren Merritt Date: Mon, 17 Mar 2008 04:40:43 +0000 (-0600) Subject: checkasm check whether callee-saved regs are correctly saved X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=9d0c0a90254e39adb581158d37a6946064fce4e2;p=libx264 checkasm check whether callee-saved regs are correctly saved x86_32 only for now since x86_64 varargs are annoying --- diff --git a/Makefile b/Makefile index 3f3a16ba..0e7d712b 100644 --- a/Makefile +++ b/Makefile @@ -32,6 +32,7 @@ ASMSRC = $(X86SRC) common/x86/pixel-32.asm OBJASM = $(ASMSRC:%.asm=%.o) ASFLAGS += -Icommon/x86/ $(OBJASM): common/x86/x86inc.asm common/x86/x86inc-32.asm +checkasm: tools/checkasm-32.o endif ifeq ($(ARCH),X86_64) diff --git a/common/x86/mc-a.asm b/common/x86/mc-a.asm index 02959222..e1d074a5 100644 --- a/common/x86/mc-a.asm +++ b/common/x86/mc-a.asm @@ -373,7 +373,7 @@ cglobal x264_pixel_avg_weight_w8_mmxext, 4,5 ;----------------------------------------------------------------------------- ; int x264_pixel_avg_weight_4x4_mmxext( uint8_t *, int, uint8_t *, int, int ) ;----------------------------------------------------------------------------- -cglobal x264_pixel_avg_weight_4x4_mmxext, 4,4 +cglobal x264_pixel_avg_weight_4x4_mmxext, 4,4,1 BIWEIGHT_START_MMX 0 BIWEIGHT_4P_MMX [r0 ], [r2 ] BIWEIGHT_4P_MMX [r0+r1 ], [r2+r3 ] diff --git a/tools/checkasm-32.asm b/tools/checkasm-32.asm new file mode 100644 index 00000000..15fbf246 --- /dev/null +++ b/tools/checkasm-32.asm @@ -0,0 +1,72 @@ +;***************************************************************************** +;* checkasm-32.asm +;***************************************************************************** +;* Copyright (C) 2008 Loren Merritt +;* +;* This program is free software; you can redistribute it and/or modify +;* it under the terms of the GNU General Public License as published by +;* the Free Software Foundation; either version 2 of the License, or +;* (at your option) any later version. +;* +;* This program is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;* GNU General Public License for more details. +;* +;* You should have received a copy of the GNU General Public License +;* along with this program; if not, write to the Free Software +;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA. +;***************************************************************************** + +%include "x86inc.asm" + +SECTION_RODATA + +error_message: db "failed to preserve register", 10, 0 + +SECTION .text + +cextern printf + +; max number of args used by any x264 asm function. +%define max_args 8 + +; just random numbers to reduce the chance of incidental match +%define n3 dword 0x6549315c +%define n4 dword 0xe02f3e23 +%define n5 dword 0xb78d0d1d +%define n6 dword 0x33627ba7 + +;----------------------------------------------------------------------------- +; long x264_checkasm_call( long (*func)(), int *ok, ... ) +;----------------------------------------------------------------------------- +cglobal x264_checkasm_call, 1,7 + mov r3, n3 + mov r4, n4 + mov r5, n5 + mov r6, n6 +%rep max_args + push dword [esp+24+max_args*4] +%endrep + call r0 + add esp, max_args*4 + xor r3, n3 + xor r4, n4 + xor r5, n5 + xor r6, n6 + or r3, r4 + or r5, r6 + or r3, r5 + jz .ok + mov r3, eax + picgetgot r1 + lea r1, [error_message GLOBAL] + push r1 + xor eax, eax + call printf + add esp, 4 + mov r1, r1m + mov dword [r1], 0 + mov eax, r3 +.ok: + RET diff --git a/tools/checkasm.c b/tools/checkasm.c index dc500b40..91f80de0 100644 --- a/tools/checkasm.c +++ b/tools/checkasm.c @@ -15,6 +15,16 @@ uint8_t * buf3, * buf4; if( !ok ) ret = -1; \ } +/* detect when callee-saved regs aren't saved. + * needs an explicit asm check because it only sometimes crashes in normal use. */ +#define call_c(func,...) func(__VA_ARGS__) +#ifdef ARCH_X86 +long x264_checkasm_call( long (*func)(), int *ok, ... ); +#define call_a(func,...) x264_checkasm_call((long(*)())func, &ok, __VA_ARGS__) +#else +#define call_a call_c +#endif + static int check_pixel( int cpu_ref, int cpu_new ) { x264_pixel_function_t pixel_c; @@ -47,8 +57,8 @@ static int check_pixel( int cpu_ref, int cpu_new ) for( j=0; j<64; j++ ) \ { \ used_asm = 1; \ - res_c = pixel_c.name[i]( buf1, 32, buf2+j, 16 ); \ - res_asm = pixel_asm.name[i]( buf1, 32, buf2+j, 16 ); \ + res_c = call_c( pixel_c.name[i], buf1, 32, buf2+j, 16 ); \ + res_asm = call_a( pixel_asm.name[i], buf1, 32, buf2+j, 16 ); \ if( res_c != res_asm ) \ { \ ok = 0; \ @@ -81,10 +91,10 @@ static int check_pixel( int cpu_ref, int cpu_new ) if(N==4) \ { \ res_c[3] = pixel_c.sad[i]( buf1, 16, pix2+99, 32 ); \ - pixel_asm.sad_x4[i]( buf1, pix2, pix2+30, pix2+1, pix2+99, 32, res_asm ); \ + call_a( pixel_asm.sad_x4[i], buf1, pix2, pix2+30, pix2+1, pix2+99, 32, res_asm ); \ } \ else \ - pixel_asm.sad_x3[i]( buf1, pix2, pix2+30, pix2+1, 32, res_asm ); \ + call_a( pixel_asm.sad_x3[i], buf1, pix2, pix2+30, pix2+1, 32, res_asm ); \ if( memcmp(res_c, res_asm, sizeof(res_c)) ) \ { \ ok = 0; \ @@ -111,7 +121,7 @@ static int check_pixel( int cpu_ref, int cpu_new ) pred[i]( buf3+40, ##__VA_ARGS__ ); \ res_c[i] = pixel_c.satd( buf1+40, 16, buf3+40, 32 ); \ } \ - pixel_asm.name( buf1+40, i8x8 ? edge : buf3+40, res_asm ); \ + call_a( pixel_asm.name, buf1+40, i8x8 ? edge : buf3+40, res_asm ); \ if( memcmp(res_c, res_asm, sizeof(res_c)) ) \ { \ ok = 0; \ @@ -147,7 +157,7 @@ static int check_pixel( int cpu_ref, int cpu_new ) ok = 1; used_asm = 0; for( i=0; i<32; i++ ) cost_mv[i] = i*10; - for( i=0; i<100; i++ ) + for( i=0; i<100 && ok; i++ ) if( pixel_asm.ads[i&3] != pixel_ref.ads[i&3] ) { DECLARE_ALIGNED( uint16_t, sums[72], 16 ); @@ -160,8 +170,8 @@ static int check_pixel( int cpu_ref, int cpu_new ) for( j=0; j<4; j++ ) dc[j] = rand() & 0x3fff; used_asm = 1; - mvn_c = pixel_c.ads[i&3]( dc, sums, 32, cost_mv, mvs_c, 28, thresh ); - mvn_a = pixel_asm.ads[i&3]( dc, sums, 32, cost_mv, mvs_a, 28, thresh ); + mvn_c = call_c( pixel_c.ads[i&3], dc, sums, 32, cost_mv, mvs_c, 28, thresh ); + mvn_a = call_a( pixel_asm.ads[i&3], dc, sums, 32, cost_mv, mvs_a, 28, thresh ); if( mvn_c != mvn_a || memcmp( mvs_c, mvs_a, mvn_c*sizeof(*mvs_c) ) ) { ok = 0; @@ -212,8 +222,8 @@ static int check_dct( int cpu_ref, int cpu_new ) if( dct_asm.name != dct_ref.name ) \ { \ used_asm = 1; \ - dct_c.name( t1, buf1, buf2 ); \ - dct_asm.name( t2, buf1, buf2 ); \ + call_c( dct_c.name, t1, buf1, buf2 ); \ + call_a( dct_asm.name, t2, buf1, buf2 ); \ if( memcmp( t1, t2, size ) ) \ { \ ok = 0; \ @@ -255,8 +265,8 @@ static int check_dct( int cpu_ref, int cpu_new ) memcpy( buf4, buf1, 32*32 ); \ memcpy( dct1, src, 512 ); \ memcpy( dct2, src, 512 ); \ - dct_c.name( buf3, (void*)dct1 ); \ - dct_asm.name( buf4, (void*)dct2 ); \ + call_c( dct_c.name, buf3, (void*)dct1 ); \ + call_a( dct_asm.name, buf4, (void*)dct2 ); \ if( memcmp( buf3, buf4, 32*32 ) ) \ { \ ok = 0; \ @@ -281,8 +291,8 @@ static int check_dct( int cpu_ref, int cpu_new ) int16_t dct1[4][4] __attribute((aligned(16))) = { {-12, 42, 23, 67},{2, 90, 89,56}, {67,43,-76,91},{56,-78,-54,1}}; int16_t dct2[4][4] __attribute((aligned(16))) = { {-12, 42, 23, 67},{2, 90, 89,56}, {67,43,-76,91},{56,-78,-54,1}}; used_asm = 1; - dct_c.dct4x4dc( dct1 ); - dct_asm.dct4x4dc( dct2 ); + call_c( dct_c.dct4x4dc, dct1 ); + call_a( dct_asm.dct4x4dc, dct2 ); if( memcmp( dct1, dct2, 32 ) ) { ok = 0; @@ -294,8 +304,8 @@ static int check_dct( int cpu_ref, int cpu_new ) int16_t dct1[4][4] __attribute((aligned(16))) = { {-12, 42, 23, 67},{2, 90, 89,56}, {67,43,-76,91},{56,-78,-54,1}}; int16_t dct2[4][4] __attribute((aligned(16))) = { {-12, 42, 23, 67},{2, 90, 89,56}, {67,43,-76,91},{56,-78,-54,1}}; used_asm = 1; - dct_c.idct4x4dc( dct1 ); - dct_asm.idct4x4dc( dct2 ); + call_c( dct_c.idct4x4dc, dct1 ); + call_a( dct_asm.idct4x4dc, dct2 ); if( memcmp( dct1, dct2, 32 ) ) { ok = 0; @@ -310,8 +320,8 @@ static int check_dct( int cpu_ref, int cpu_new ) int16_t dct1[2][2] __attribute((aligned(16))) = { {-12, 42},{2, 90}}; int16_t dct2[2][2] __attribute((aligned(16))) = { {-12, 42},{2, 90}}; used_asm = 1; - dct_c.dct2x2dc( dct1 ); - dct_asm.dct2x2dc( dct2 ); + call_c( dct_c.dct2x2dc, dct1 ); + call_a( dct_asm.dct2x2dc, dct2 ); if( memcmp( dct1, dct2, 4*2 ) ) { ok = 0; @@ -323,8 +333,8 @@ static int check_dct( int cpu_ref, int cpu_new ) int16_t dct1[2][2] __attribute((aligned(16))) = { {-12, 42},{2, 90}}; int16_t dct2[2][2] __attribute((aligned(16))) = { {-12, 42},{2, 90}}; used_asm = 1; - dct_c.idct2x2dc( dct1 ); - dct_asm.idct2x2dc( dct2 ); + call_c( dct_c.idct2x2dc, dct1 ); + call_a( dct_asm.idct2x2dc, dct2 ); if( memcmp( dct1, dct2, 4*2 ) ) { ok = 0; @@ -344,8 +354,8 @@ static int check_dct( int cpu_ref, int cpu_new ) if( zigzag_asm.name != zigzag_ref.name ) \ { \ used_asm = 1; \ - zigzag_c.name( t1, dct ); \ - zigzag_asm.name( t2, dct ); \ + call_c( zigzag_c.name, t1, dct ); \ + call_a( zigzag_asm.name, t2, dct ); \ if( memcmp( t1, t2, size ) ) \ { \ ok = 0; \ @@ -359,8 +369,8 @@ static int check_dct( int cpu_ref, int cpu_new ) used_asm = 1; \ memcpy( buf3, buf1, 16*FDEC_STRIDE ); \ memcpy( buf4, buf1, 16*FDEC_STRIDE ); \ - zigzag_c.name( t1, buf2, buf3 ); \ - zigzag_asm.name( t2, buf2, buf4 ); \ + call_c( zigzag_c.name, t1, buf2, buf3 ); \ + call_a( zigzag_asm.name, t2, buf2, buf4 ); \ if( memcmp( t1, t2, size )|| memcmp( buf3, buf4, 16*FDEC_STRIDE ) ) \ { \ ok = 0; \ @@ -424,8 +434,8 @@ static int check_mc( int cpu_ref, int cpu_new ) used_asm = 1; \ memset(buf3, 0xCD, 1024); \ memset(buf4, 0xCD, 1024); \ - mc_c.mc_luma( dst1, 32, src2, 16, dx, dy, w, h ); \ - mc_a.mc_luma( dst2, 32, src2, 16, dx, dy, w, h ); \ + call_c( mc_c.mc_luma, dst1, 32, src2, 16, dx, dy, w, h ); \ + call_a( mc_a.mc_luma, dst2, 32, src2, 16, dx, dy, w, h ); \ if( memcmp( buf3, buf4, 1024 ) ) \ { \ fprintf( stderr, "mc_luma[mv(%d,%d) %2dx%-2d] [FAILED]\n", dx, dy, w, h ); \ @@ -439,8 +449,8 @@ static int check_mc( int cpu_ref, int cpu_new ) used_asm = 1; \ memset(buf3, 0xCD, 1024); \ memset(buf4, 0xCD, 1024); \ - mc_c.mc_luma( dst1, 32, src2, 16, dx, dy, w, h ); \ - ref = mc_a.get_ref( ref, &ref_stride, src2, 16, dx, dy, w, h ); \ + call_c( mc_c.mc_luma, dst1, 32, src2, 16, dx, dy, w, h ); \ + ref = (uint8_t*) call_a( mc_a.get_ref, ref, &ref_stride, src2, 16, dx, dy, w, h ); \ for( i=0; iquant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \ - qf_a.name( (void*)dct2, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \ + call_c( qf_c.name, (void*)dct1, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \ + call_a( qf_a.name, (void*)dct2, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \ if( memcmp( dct1, dct2, 16*2 ) ) \ { \ oks[0] = 0; \ @@ -710,8 +720,8 @@ static int check_quant( int cpu_ref, int cpu_new ) for( qp = 51; qp > 0; qp-- ) \ { \ INIT_QUANT##w() \ - qf_c.qname( (void*)dct1, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \ - qf_a.qname( (void*)dct2, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \ + call_c( qf_c.qname, (void*)dct1, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \ + call_a( qf_a.qname, (void*)dct2, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \ if( memcmp( dct1, dct2, w*w*2 ) ) \ { \ oks[0] = 0; \ @@ -735,10 +745,10 @@ static int check_quant( int cpu_ref, int cpu_new ) for( qp = 51; qp > 0; qp-- ) \ { \ INIT_QUANT##w() \ - qf_c.qname( (void*)dct1, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \ + call_c( qf_c.qname, (void*)dct1, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \ memcpy( dct2, dct1, w*w*2 ); \ - qf_c.dqname( (void*)dct1, h->dequant##w##_mf[block], qp ); \ - qf_a.dqname( (void*)dct2, h->dequant##w##_mf[block], qp ); \ + call_c( qf_c.dqname, (void*)dct1, h->dequant##w##_mf[block], qp ); \ + call_a( qf_a.dqname, (void*)dct2, h->dequant##w##_mf[block], qp ); \ if( memcmp( dct1, dct2, w*w*2 ) ) \ { \ oks[1] = 0; \ @@ -801,8 +811,8 @@ static int check_intra( int cpu_ref, int cpu_new ) used_asm = 1; \ memcpy( buf3, buf1, 32*20 );\ memcpy( buf4, buf1, 32*20 );\ - ip_c.name[dir]( buf3+48, ##__VA_ARGS__ );\ - ip_a.name[dir]( buf4+48, ##__VA_ARGS__ );\ + call_c( ip_c.name[dir], buf3+48, ##__VA_ARGS__ );\ + call_a( ip_a.name[dir], buf4+48, ##__VA_ARGS__ );\ if( memcmp( buf3, buf4, 32*20 ) )\ {\ fprintf( stderr, #name "[%d] : [FAILED]\n", dir );\