From 2a7dd58c68fda378a5e8b68184ff56daee9f9019 Mon Sep 17 00:00:00 2001
From: Fiona Glaser
Date: Sun, 15 Jun 2008 11:59:25 -0600
Subject: [PATCH] Add more inline asm and a runtime check for MMXEXT support

x264 will now terminate gracefully rather than SIGILL when run on a machine
with no MMXEXT support.
A configure option (--disable-asm) is now available to build x264 without
assembly, so that it can still run on such old CPUs as the Pentium 2, K6, etc.
---
Review notes (text in this area is not part of the commit message):

* common/x86/util.h: array_non_zero_count_mmx() and array_non_zero_int_mmx()
  load from memory through a pointer that was only passed as an "r" operand.
  The compiler cannot see such reads and may reorder the caller's stores
  around the asm (for example the copy loop that directly precedes the call
  in x264_partition_i8x8_size_cavlc).  Both asm statements now also list the
  array as a dummy "m" input; operand numbers %0-%2 are unchanged.
* NOTE(review): x264_predictor_difference_mmxext presumably needs the same
  treatment for mvc; its loop body is outside the hunk, so it is untouched.
* NOTE(review): four cavlc.c call sites used to count 15 coefficients from
  luma4x4[i]+1 and now count all 16 from luma4x4[i].  The result is the same
  only if element 0 is zero at those points - confirm against the encoder.
* Comments were added to the new helpers, so the hunk sizes and the diffstat
  of common/macroblock.h and common/x86/util.h differ from the original
  commit; the "index" hashes below are still those of the original commit.
* The line structure of this patch was reconstructed.  Indentation and
  blank context lines are a best guess and column alignment inside lines
  (echo strings, asm strings) could not be recovered, so the context may
  not match the tree byte for byte.

 common/common.h      |  8 +++---
 common/macroblock.h  | 36 ++++++++++++++++++++++++
 common/x86/util.h    | 66 +++++++++++++++++++++++++++++++++++++++++++-
 configure            | 25 +++++++++++++------
 encoder/cavlc.c      | 14 +++++------
 encoder/encoder.c    |  8 ++++++
 encoder/macroblock.h | 24 ------------------
 7 files changed, 138 insertions(+), 43 deletions(-)

diff --git a/common/common.h b/common/common.h
index aaf584a2..04f5243e 100644
--- a/common/common.h
+++ b/common/common.h
@@ -141,10 +141,6 @@ static inline int x264_predictor_difference( int16_t (*mvc)[2], intptr_t i_mvc )
     return sum;
 }
 
-#ifdef HAVE_MMX
-#include "x86/util.h"
-#endif
-
 /****************************************************************************
  *
  ****************************************************************************/
@@ -595,5 +591,9 @@ struct x264_t
 // included at the end because it needs x264_t
 #include "macroblock.h"
 
+#ifdef HAVE_MMX
+#include "x86/util.h"
+#endif
+
 #endif
 
diff --git a/common/macroblock.h b/common/macroblock.h
index 2766ddd0..660978a9 100644
--- a/common/macroblock.h
+++ b/common/macroblock.h
@@ -356,6 +356,42 @@ static ALWAYS_INLINE void x264_macroblock_cache_intra8x8_pred( x264_t *h, int x,
     int8_t *cache = &h->mb.cache.intra4x4_pred_mode[X264_SCAN8_0+x+8*y];
     cache[0] = cache[1] = cache[8] = cache[9] = i_mode;
 }
+#define array_non_zero(a) array_non_zero_int(a, sizeof(a))
+#define array_non_zero_int array_non_zero_int_c
+/* Return 1 if any of the i_count bytes at v is non-zero, 0 otherwise.
+ * v is read as uint64_t words: sizes of 8, 16 and 32 bytes are tested
+ * directly, any other size is scanned in whole 8-byte words. */
+static ALWAYS_INLINE int array_non_zero_int_c( void *v, int i_count )
+{
+    uint64_t *x = v;
+    if(i_count == 8)
+        return !!x[0];
+    else if(i_count == 16)
+        return !!(x[0]|x[1]);
+    else if(i_count == 32)
+        return !!(x[0]|x[1]|x[2]|x[3]);
+    else
+    {
+        int i;
+        i_count /= sizeof(uint64_t);
+        for( i = 0; i < i_count; i++ )
+            if( x[i] ) return 1;
+        return 0;
+    }
+}
+/* Count the non-zero coefficients in v.
+ * This function and its MMX version only work on arrays of size 16 */
+static ALWAYS_INLINE int array_non_zero_count( int16_t *v )
+{
+    int i;
+    int i_nz;
+
+    for( i = 0, i_nz = 0; i < 16; i++ )
+        if( v[i] )
+            i_nz++;
+
+    return i_nz;
+}
 
 #endif
 
diff --git a/common/x86/util.h b/common/x86/util.h
index 59d17495..e100a4e4 100644
--- a/common/x86/util.h
+++ b/common/x86/util.h
@@ -65,11 +65,75 @@ static inline int x264_predictor_difference_mmxext( int16_t (*mvc)[2], intptr_t
         "paddusw %%mm0, %%mm4 \n"
         "jg 1b \n"
         "movq %%mm4, %0 \n"
-        :"=m"(output), "+r"(i_mvc), "+r"(mvc)
+        :"=m"(output), "+r"(i_mvc)
+        :"r"(mvc)
     );
     sum += output[0] + output[1] + output[2] + output[3];
     return sum;
 }
+#define array_non_zero_count array_non_zero_count_mmx
+/* MMX version of array_non_zero_count: v must hold 16 coefficients.
+ * The words are packed to bytes, compared against zero (0xff per zero
+ * coefficient) and added to 2 per byte lane; psadbw then sums the lanes,
+ * giving 16 minus the number of zero coefficients. */
+static inline int array_non_zero_count_mmx( int16_t *v )
+{
+    static const uint64_t pw_2 = 0x0202020202020202ULL;
+    int count;
+    asm(
+        "pxor %%mm7, %%mm7 \n"
+        "movq (%1), %%mm0 \n"
+        "movq 16(%1), %%mm1 \n"
+        "packsswb 8(%1), %%mm0 \n"
+        "packsswb 24(%1), %%mm1 \n"
+        "pcmpeqb %%mm7, %%mm0 \n"
+        "pcmpeqb %%mm7, %%mm1 \n"
+        "paddb %%mm0, %%mm1 \n"
+        "paddb %2, %%mm1 \n"
+        "psadbw %%mm7, %%mm1 \n"
+        "movd %%mm1, %0 \n"
+        :"=r"(count)
+        /* the last input only tells the compiler that v[0..15] is read */
+        :"r"(v), "m"(pw_2), "m"(*(const int16_t (*)[16])v)
+    );
+    return count;
+}
+#undef array_non_zero_int
+#define array_non_zero_int array_non_zero_int_mmx
+/* MMX version of array_non_zero_int: ORs sixteen 8-byte words together when
+ * i_count is 128 bytes; every other size is handled by the C version. */
+static ALWAYS_INLINE int array_non_zero_int_mmx( void *v, int i_count )
+{
+    if(i_count == 128)
+    {
+        int nonzero;
+        asm(
+            "movq (%1), %%mm0 \n"
+            "por 8(%1), %%mm0 \n"
+            "por 16(%1), %%mm0 \n"
+            "por 24(%1), %%mm0 \n"
+            "por 32(%1), %%mm0 \n"
+            "por 40(%1), %%mm0 \n"
+            "por 48(%1), %%mm0 \n"
+            "por 56(%1), %%mm0 \n"
+            "por 64(%1), %%mm0 \n"
+            "por 72(%1), %%mm0 \n"
+            "por 80(%1), %%mm0 \n"
+            "por 88(%1), %%mm0 \n"
+            "por 96(%1), %%mm0 \n"
+            "por 104(%1), %%mm0 \n"
+            "por 112(%1), %%mm0 \n"
+            "por 120(%1), %%mm0 \n"
+            "packsswb %%mm0, %%mm0 \n"
+            "movd %%mm0, %0 \n"
+            :"=r"(nonzero)
+            /* the last input only tells the compiler that 128 bytes at v are read */
+            :"r"(v), "m"(*(const char (*)[128])v)
+        );
+        return !!nonzero;
+    }
+    else return array_non_zero_int_c( v, i_count );
+}
 #endif
 
 #endif
diff --git a/configure b/configure
index 2cb9bad1..29e1b25b 100755
--- a/configure
+++ b/configure
@@ -7,10 +7,11 @@ echo ""
 echo "available options:"
 echo ""
 echo " --help print this message"
-echo " --enable-avis-input enables avisynth input (win32 only)"
-echo " --enable-mp4-output enables mp4 output (using gpac)"
+echo " --disable-avis-input disables avisynth input (win32 only)"
+echo " --disable-mp4-output disables mp4 output (using gpac)"
+echo " --disable-pthread disables multithreaded encoding"
+echo " --disable-asm disables assembly optimizations on x86"
 echo " --enable-gtk build GTK+ interface"
-echo " --enable-pthread enables multithreaded encoding"
 echo " --enable-debug adds -g, doesn't strip"
 echo " --enable-gprof adds -pg, doesn't strip"
 echo " --enable-visualize enables visualization (X11 only)"
@@ -53,6 +54,7 @@ DEVNULL='/dev/null'
 avis_input="auto"
 mp4_output="auto"
 pthread="auto"
+asm="yes"
 debug="no"
 gprof="no"
 pic="no"
@@ -102,6 +104,12 @@ for opt do
         --includedir=*)
             includedir="$optarg"
             ;;
+        --enable-asm)
+            asm="yes"
+            ;;
+        --disable-asm)
+            asm="no"
+            ;;
         --enable-avis-input)
             avis_input="yes"
             ;;
@@ -300,7 +308,7 @@ if [ $shared = yes -a \( $ARCH = "X86_64" -o $ARCH = "PPC" -o $ARCH = "ALPHA" \)
     pic="yes"
 fi
 
-if [ $ARCH = X86 -o $ARCH = X86_64 ] ; then
+if [ $asm = yes -a \( $ARCH = X86 -o $ARCH = X86_64 \) ] ; then
     if [ $ARCH = X86 -a $pic = yes -a x$AS = xyasm -a\
          "`yasm --version 2>$DEVNULL | head -n 1`" "<" "yasm 0.6.2" ] ; then
         echo "yasm prior to 0.6.2 miscompiles PIC. trying nasm instead..."
@@ -309,10 +317,12 @@ if [ $ARCH = X86 -o $ARCH = X86_64 ] ; then
     if as_check "pabsw xmm0, xmm0" ; then
         CFLAGS="$CFLAGS -DHAVE_MMX"
     else
-        echo "No suitable assembler found. x264 will be several times slower."
-        echo "Please install 'yasm' to get MMX/SSE optimized code."
-        AS=""
+        echo "No suitable assembler found. Install 'yasm' to get MMX/SSE optimized code."
+        echo "If you really want to compile without asm, configure with --disable-asm."
+        exit 1
     fi
+else
+    AS=""
 fi
 
 CFLAGS="$CFLAGS -DARCH_$ARCH -DSYS_$SYS"
@@ -482,6 +492,7 @@ EOF
 
 echo "Platform: $ARCH"
 echo "System: $SYS"
+echo "asm: $asm"
 echo "avis input: $avis_input"
 echo "mp4 output: $mp4_output"
 echo "pthread: $pthread"
diff --git a/encoder/cavlc.c b/encoder/cavlc.c
index 62136547..7d5bc84c 100644
--- a/encoder/cavlc.c
+++ b/encoder/cavlc.c
@@ -301,7 +301,7 @@ static inline void x264_macroblock_luma_write_cavlc( x264_t *h, bs_t *s, int i8s
             if( h->mb.i_cbp_luma & (1 << i8) )
                 for( i4 = 0; i4 < 4; i4++ )
                 {
-                    h->mb.cache.non_zero_count[x264_scan8[i4+i8*4]] = array_non_zero_count( h->dct.luma4x4[i4+i8*4], 16 );
+                    h->mb.cache.non_zero_count[x264_scan8[i4+i8*4]] = array_non_zero_count( h->dct.luma4x4[i4+i8*4] );
                     block_residual_write_cavlc( h, s, i4+i8*4, h->dct.luma4x4[i4+i8*4], 16 );
                 }
     }
@@ -657,7 +657,7 @@ void x264_macroblock_write_cavlc( x264_t *h, bs_t *s )
         if( h->mb.i_cbp_luma != 0 )
             for( i = 0; i < 16; i++ )
             {
-                h->mb.cache.non_zero_count[x264_scan8[i]] = array_non_zero_count( h->dct.luma4x4[i]+1, 15 );
+                h->mb.cache.non_zero_count[x264_scan8[i]] = array_non_zero_count( h->dct.luma4x4[i] );
                 block_residual_write_cavlc( h, s, i, h->dct.luma4x4[i]+1, 15 );
             }
     }
@@ -674,7 +674,7 @@ void x264_macroblock_write_cavlc( x264_t *h, bs_t *s )
         if( h->mb.i_cbp_chroma&0x02 ) /* Chroma AC residual present */
             for( i = 16; i < 24; i++ )
             {
-                h->mb.cache.non_zero_count[x264_scan8[i]] = array_non_zero_count( h->dct.luma4x4[i]+1, 15 );
+                h->mb.cache.non_zero_count[x264_scan8[i]] = array_non_zero_count( h->dct.luma4x4[i] );
                 block_residual_write_cavlc( h, s, i, h->dct.luma4x4[i]+1, 15 );
             }
     }
@@ -741,9 +741,9 @@ int x264_partition_size_cavlc( x264_t *h, int i8, int i_pixel )
     for( j = (i_pixel < PIXEL_8x8); j >= 0; j-- )
     {
         x264_macroblock_luma_write_cavlc( h, &s, i8, i8 );
-        h->mb.cache.non_zero_count[x264_scan8[16+i8]] = array_non_zero_count( h->dct.luma4x4[16+i8]+1, 15 );
+        h->mb.cache.non_zero_count[x264_scan8[16+i8]] = array_non_zero_count( h->dct.luma4x4[16+i8] );
         block_residual_write_cavlc( h, &s, 16+i8, h->dct.luma4x4[16+i8]+1, 15 );
-        h->mb.cache.non_zero_count[x264_scan8[20+i8]] = array_non_zero_count( h->dct.luma4x4[20+i8]+1, 15 );
+        h->mb.cache.non_zero_count[x264_scan8[20+i8]] = array_non_zero_count( h->dct.luma4x4[20+i8] );
         block_residual_write_cavlc( h, &s, 20+i8, h->dct.luma4x4[20+i8]+1, 15 );
         i8 += x264_pixel_size[i_pixel].h >> 3;
     }
@@ -768,7 +768,7 @@ static int x264_partition_i8x8_size_cavlc( x264_t *h, int i8, int i_mode )
         for( i = 0; i < 16; i++ )
             h->dct.luma4x4[i4+i8*4][i] = h->dct.luma8x8[i8][i4+i*4];
         h->mb.cache.non_zero_count[x264_scan8[i4+i8*4]] =
-            array_non_zero_count( h->dct.luma4x4[i4+i8*4], 16 );
+            array_non_zero_count( h->dct.luma4x4[i4+i8*4] );
         block_residual_write_cavlc( h, &h->out.bs, i4+i8*4, h->dct.luma4x4[i4+i8*4], 16 );
     }
     return h->out.bs.i_bits_encoded;
@@ -794,7 +794,7 @@ static int x264_i8x8_chroma_size_cavlc( x264_t *h )
         int i;
         for( i = 16; i < 24; i++ )
         {
-            h->mb.cache.non_zero_count[x264_scan8[i]] = array_non_zero_count( h->dct.luma4x4[i]+1, 15 );
+            h->mb.cache.non_zero_count[x264_scan8[i]] = array_non_zero_count( h->dct.luma4x4[i] );
             block_residual_write_cavlc( h, &h->out.bs, i, h->dct.luma4x4[i]+1, 15 );
         }
     }
diff --git a/encoder/encoder.c b/encoder/encoder.c
index cffaeeb6..533e8a83 100644
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -300,6 +300,14 @@ static void x264_slice_header_write( bs_t *s, x264_slice_header_t *sh, int i_nal
 
 static int x264_validate_parameters( x264_t *h )
 {
+#ifdef HAVE_MMX
+    if( !(x264_cpu_detect() & X264_CPU_MMXEXT) )
+    {
+        x264_log( h, X264_LOG_ERROR, "your cpu does not support MMXEXT, but x264 was compiled with asm support\n");
+        x264_log( h, X264_LOG_ERROR, "to run x264, recompile without asm support (configure --disable-asm)\n");
+        return -1;
+    }
+#endif
     if( h->param.i_width <= 0 || h->param.i_height <= 0 )
     {
         x264_log( h, X264_LOG_ERROR, "invalid width x height (%dx%d)\n",
diff --git a/encoder/macroblock.h b/encoder/macroblock.h
index e80995d7..5ac58349 100644
--- a/encoder/macroblock.h
+++ b/encoder/macroblock.h
@@ -54,29 +54,5 @@ void x264_quant_8x8_trellis( x264_t *h, int16_t dct[8][8], int i_quant_cat,
 void x264_noise_reduction_update( x264_t *h );
 void x264_denoise_dct( x264_t *h, int16_t *dct );
 
-#define array_non_zero(a) array_non_zero_int(a, sizeof(a))
-static inline int array_non_zero_int( void *v, int i_count )
-{
-    int i;
-    uint64_t *x = v;
-    i_count /= sizeof(uint64_t);
-    for( i = 0; i < i_count; i++ )
-        if( x[i] ) return 1;
-    return 0;
-}
-
-static inline int array_non_zero_count( int16_t *v, int i_count )
-{
-    int i;
-    int i_nz;
-
-    for( i = 0, i_nz = 0; i < i_count; i++ )
-        if( v[i] )
-            i_nz++;
-
-    return i_nz;
-}
-
-
 #endif
 
-- 
2.40.0