From 2a7dd58c68fda378a5e8b68184ff56daee9f9019 Mon Sep 17 00:00:00 2001
From: Fiona Glaser <fiona@x264.com>
Date: Sun, 15 Jun 2008 11:59:25 -0600
Subject: [PATCH] Add more inline asm and a runtime check for MMXEXT support
 x264 now exits gracefully instead of dying with SIGILL when run on a machine
 with no MMXEXT support. A new configure option (--disable-asm) builds x264
 without assembly so that it can still run on such old CPUs as the Pentium 2,
 K6, etc.

---
 common/common.h      |  8 +++---
 common/macroblock.h  | 32 ++++++++++++++++++++++++
 common/x86/util.h    | 58 +++++++++++++++++++++++++++++++++++++++++++-
 configure            | 25 +++++++++++++------
 encoder/cavlc.c      | 14 +++++------
 encoder/encoder.c    |  8 ++++++
 encoder/macroblock.h | 24 ------------------
 7 files changed, 126 insertions(+), 43 deletions(-)

diff --git a/common/common.h b/common/common.h
index aaf584a2..04f5243e 100644
--- a/common/common.h
+++ b/common/common.h
@@ -141,10 +141,6 @@ static inline int x264_predictor_difference( int16_t (*mvc)[2], intptr_t i_mvc )
     return sum;
 }
 
-#ifdef HAVE_MMX
-#include "x86/util.h"
-#endif
-
 /****************************************************************************
  *
  ****************************************************************************/
@@ -595,5 +591,9 @@ struct x264_t
 // included at the end because it needs x264_t
 #include "macroblock.h"
 
+#ifdef HAVE_MMX
+#include "x86/util.h"
+#endif
+
 #endif
 
diff --git a/common/macroblock.h b/common/macroblock.h
index 2766ddd0..660978a9 100644
--- a/common/macroblock.h
+++ b/common/macroblock.h
@@ -356,6 +356,38 @@ static ALWAYS_INLINE void x264_macroblock_cache_intra8x8_pred( x264_t *h, int x,
     int8_t *cache = &h->mb.cache.intra4x4_pred_mode[X264_SCAN8_0+x+8*y];
     cache[0] = cache[1] = cache[8] = cache[9] = i_mode;
 }
+#define array_non_zero(a) array_non_zero_int(a, sizeof(a)) /* a must be a real array so that sizeof(a) is its size in bytes */
+#define array_non_zero_int array_non_zero_int_c /* C fallback; x86/util.h redefines this to the MMX variant */
+static ALWAYS_INLINE int array_non_zero_int_c( void *v, int i_count ) /* 1 if any 64-bit word in the i_count bytes at v is non-zero, else 0 */
+{
+    uint64_t *x = v; /* the buffer is examined 8 bytes at a time; presumably v is suitably aligned -- confirm */
+    if(i_count == 8) /* sizes of 8, 16 and 32 bytes are tested without a loop */
+        return !!x[0];
+    else if(i_count == 16)
+        return !!(x[0]|x[1]);
+    else if(i_count == 32)
+        return !!(x[0]|x[1]|x[2]|x[3]);
+    else
+    {
+        int i;
+        i_count /= sizeof(uint64_t); /* bytes -> whole 64-bit words; a remainder of 1..7 bytes is not checked */
+        for( i = 0; i < i_count; i++ )
+            if( x[i] ) return 1;
+        return 0;
+    }
+}
+/* Count the non-zero entries of v[0..15]. This function and its MMX version only work on arrays of size 16. */
+static ALWAYS_INLINE int array_non_zero_count( int16_t *v )
+{
+    int i;
+    int i_nz; /* running count of non-zero entries */
+
+    for( i = 0, i_nz = 0; i < 16; i++ ) /* NOTE(review): v[0] is always included; callers that encode only v[1..15] presumably keep v[0] == 0 -- confirm */
+        if( v[i] )
+            i_nz++;
+
+    return i_nz;
+}
 
 #endif
 
diff --git a/common/x86/util.h b/common/x86/util.h
index 59d17495..e100a4e4 100644
--- a/common/x86/util.h
+++ b/common/x86/util.h
@@ -65,11 +65,67 @@ static inline int x264_predictor_difference_mmxext( int16_t (*mvc)[2], intptr_t
         "paddusw %%mm0, %%mm4 \n"
         "jg 1b                \n"
         "movq    %%mm4, %0    \n"
-        :"=m"(output), "+r"(i_mvc), "+r"(mvc)
+        :"=m"(output), "+r"(i_mvc)
+        :"r"(mvc)
     );
     sum += output[0] + output[1] + output[2] + output[3];
     return sum;
 }
+#define array_non_zero_count array_non_zero_count_mmx /* from here on, callers get the MMX version: counts non-zero entries of v[0..15] */
+static inline int array_non_zero_count_mmx( int16_t *v )
+{
+    static const uint64_t pw_2 = 0x0202020202020202ULL; /* eight bytes, each holding 2 */
+    int count;
+    asm(
+        "pxor     %%mm7,  %%mm7 \n" /* mm7 = 0 */
+        "movq     (%1),   %%mm0 \n" /* mm0 = v[0..3] */
+        "movq     16(%1), %%mm1 \n" /* mm1 = v[8..11] */
+        "packsswb 8(%1),  %%mm0 \n" /* mm0 = v[0..7] saturated to bytes; non-zero stays non-zero */
+        "packsswb 24(%1), %%mm1 \n" /* mm1 = v[8..15] saturated to bytes */
+        "pcmpeqb  %%mm7,  %%mm0 \n" /* byte = 0xFF (-1) where the entry is zero, else 0 */
+        "pcmpeqb  %%mm7,  %%mm1 \n"
+        "paddb    %%mm0,  %%mm1 \n" /* per byte: minus the number of zeros in the pair v[k], v[k+8] */
+        "paddb    %2,     %%mm1 \n" /* +2 => per byte: number of non-zeros in the pair (0..2) */
+        "psadbw   %%mm7,  %%mm1 \n" /* sum the eight bytes; psadbw needs MMXEXT */
+        "movd     %%mm1,  %0    \n"
+        :"=r"(count)
+        :"r"(v), "m"(pw_2), "m"(*(const int16_t (*)[16])v) /* last operand is unused in the template: it tells the compiler the asm reads v[0..15] */
+    );
+    return count;
+}
+#undef array_non_zero_int /* drop the C default set in common/macroblock.h */
+#define array_non_zero_int array_non_zero_int_mmx
+static ALWAYS_INLINE int array_non_zero_int_mmx( void *v, int i_count ) /* 1 if any of the i_count bytes at v is non-zero (whole 64-bit words only), else 0 */
+{
+    if(i_count == 128) /* only the 128-byte case has an MMX path */
+    {
+        int nonzero;
+        asm(
+            "movq     (%1),    %%mm0 \n" /* mm0 = first 8 bytes */
+            "por      8(%1),   %%mm0 \n" /* OR in the remaining fifteen 8-byte words */
+            "por      16(%1),  %%mm0 \n"
+            "por      24(%1),  %%mm0 \n"
+            "por      32(%1),  %%mm0 \n"
+            "por      40(%1),  %%mm0 \n"
+            "por      48(%1),  %%mm0 \n"
+            "por      56(%1),  %%mm0 \n"
+            "por      64(%1),  %%mm0 \n"
+            "por      72(%1),  %%mm0 \n"
+            "por      80(%1),  %%mm0 \n"
+            "por      88(%1),  %%mm0 \n"
+            "por      96(%1),  %%mm0 \n"
+            "por      104(%1), %%mm0 \n"
+            "por      112(%1), %%mm0 \n"
+            "por      120(%1), %%mm0 \n"
+            "packsswb %%mm0,   %%mm0 \n" /* squeeze the four 16-bit words into the low 32 bits; saturation keeps non-zero words non-zero */
+            "movd     %%mm0,   %0    \n"
+            :"=r"(nonzero)
+            :"r"(v), "m"(*(const char (*)[128])v) /* second operand is unused in the template: it tells the compiler the asm reads these 128 bytes */
+        );
+        return !!nonzero;
+    }
+    else return array_non_zero_int_c( v, i_count ); /* every other size uses the C version */
+}
 #endif
 
 #endif
diff --git a/configure b/configure
index 2cb9bad1..29e1b25b 100755
--- a/configure
+++ b/configure
@@ -7,10 +7,11 @@ echo ""
 echo "available options:"
 echo ""
 echo "  --help                   print this message"
-echo "  --enable-avis-input      enables avisynth input (win32 only)"
-echo "  --enable-mp4-output      enables mp4 output (using gpac)"
+echo "  --disable-avis-input     disables avisynth input (win32 only)"
+echo "  --disable-mp4-output     disables mp4 output (using gpac)"
+echo "  --disable-pthread        disables multithreaded encoding"
+echo "  --disable-asm            disables assembly optimizations on x86"
 echo "  --enable-gtk             build GTK+ interface"
-echo "  --enable-pthread         enables multithreaded encoding"
 echo "  --enable-debug           adds -g, doesn't strip"
 echo "  --enable-gprof           adds -pg, doesn't strip"
 echo "  --enable-visualize       enables visualization (X11 only)"
@@ -53,6 +54,7 @@ DEVNULL='/dev/null'
 avis_input="auto"
 mp4_output="auto"
 pthread="auto"
+asm="yes"
 debug="no"
 gprof="no"
 pic="no"
@@ -102,6 +104,12 @@ for opt do
         --includedir=*)
             includedir="$optarg"
             ;;
+        --enable-asm)
+            asm="yes"
+            ;;
+        --disable-asm)
+            asm="no"
+            ;;
         --enable-avis-input)
             avis_input="yes"
             ;;
@@ -300,7 +308,7 @@ if [ $shared = yes -a \( $ARCH = "X86_64" -o $ARCH = "PPC" -o $ARCH = "ALPHA" \)
     pic="yes"
 fi
 
-if [ $ARCH = X86 -o $ARCH = X86_64 ] ; then
+if [ $asm = yes -a \( $ARCH = X86 -o $ARCH = X86_64 \) ] ; then
     if [ $ARCH = X86 -a $pic = yes -a x$AS = xyasm -a\
          "`yasm --version 2>$DEVNULL | head -n 1`" "<" "yasm 0.6.2" ] ; then
          echo "yasm prior to 0.6.2 miscompiles PIC. trying nasm instead..."
@@ -309,10 +317,12 @@ if [ $ARCH = X86 -o $ARCH = X86_64 ] ; then
     if as_check "pabsw xmm0, xmm0" ; then
         CFLAGS="$CFLAGS -DHAVE_MMX"
     else
-        echo "No suitable assembler found.  x264 will be several times slower."
-        echo "Please install 'yasm' to get MMX/SSE optimized code."
-        AS=""
+        echo "No suitable assembler found.  Install 'yasm' to get MMX/SSE optimized code."
+        echo "If you really want to compile without asm, configure with --disable-asm."
+        exit 1
     fi
+else
+    AS=""
 fi
 
 CFLAGS="$CFLAGS -DARCH_$ARCH -DSYS_$SYS"
@@ -482,6 +492,7 @@ EOF
 
 echo "Platform:   $ARCH"
 echo "System:     $SYS"
+echo "asm:        $asm"
 echo "avis input: $avis_input"
 echo "mp4 output: $mp4_output"
 echo "pthread:    $pthread"
diff --git a/encoder/cavlc.c b/encoder/cavlc.c
index 62136547..7d5bc84c 100644
--- a/encoder/cavlc.c
+++ b/encoder/cavlc.c
@@ -301,7 +301,7 @@ static inline void x264_macroblock_luma_write_cavlc( x264_t *h, bs_t *s, int i8s
         if( h->mb.i_cbp_luma & (1 << i8) )
             for( i4 = 0; i4 < 4; i4++ )
             {
-                h->mb.cache.non_zero_count[x264_scan8[i4+i8*4]] = array_non_zero_count( h->dct.luma4x4[i4+i8*4], 16 );
+                h->mb.cache.non_zero_count[x264_scan8[i4+i8*4]] = array_non_zero_count( h->dct.luma4x4[i4+i8*4] );
                 block_residual_write_cavlc( h, s, i4+i8*4, h->dct.luma4x4[i4+i8*4], 16 );
             }
 }
@@ -657,7 +657,7 @@ void x264_macroblock_write_cavlc( x264_t *h, bs_t *s )
         if( h->mb.i_cbp_luma != 0 )
             for( i = 0; i < 16; i++ )
             {
-                h->mb.cache.non_zero_count[x264_scan8[i]] = array_non_zero_count( h->dct.luma4x4[i]+1, 15 );
+                h->mb.cache.non_zero_count[x264_scan8[i]] = array_non_zero_count( h->dct.luma4x4[i] );
                 block_residual_write_cavlc( h, s, i, h->dct.luma4x4[i]+1, 15 );
             }
     }
@@ -674,7 +674,7 @@ void x264_macroblock_write_cavlc( x264_t *h, bs_t *s )
         if( h->mb.i_cbp_chroma&0x02 ) /* Chroma AC residual present */
             for( i = 16; i < 24; i++ )
             {
-                h->mb.cache.non_zero_count[x264_scan8[i]] = array_non_zero_count( h->dct.luma4x4[i]+1, 15 );
+                h->mb.cache.non_zero_count[x264_scan8[i]] = array_non_zero_count( h->dct.luma4x4[i] );
                 block_residual_write_cavlc( h, s, i, h->dct.luma4x4[i]+1, 15 );
             }
     }
@@ -741,9 +741,9 @@ int x264_partition_size_cavlc( x264_t *h, int i8, int i_pixel )
     for( j = (i_pixel < PIXEL_8x8); j >= 0; j-- )
     {
         x264_macroblock_luma_write_cavlc( h, &s, i8, i8 );
-        h->mb.cache.non_zero_count[x264_scan8[16+i8]] = array_non_zero_count( h->dct.luma4x4[16+i8]+1, 15 );
+        h->mb.cache.non_zero_count[x264_scan8[16+i8]] = array_non_zero_count( h->dct.luma4x4[16+i8] );
         block_residual_write_cavlc( h, &s, 16+i8, h->dct.luma4x4[16+i8]+1, 15 );
-        h->mb.cache.non_zero_count[x264_scan8[20+i8]] = array_non_zero_count( h->dct.luma4x4[20+i8]+1, 15 );
+        h->mb.cache.non_zero_count[x264_scan8[20+i8]] = array_non_zero_count( h->dct.luma4x4[20+i8] );
         block_residual_write_cavlc( h, &s, 20+i8, h->dct.luma4x4[20+i8]+1, 15 );
         i8 += x264_pixel_size[i_pixel].h >> 3;
     }
@@ -768,7 +768,7 @@ static int x264_partition_i8x8_size_cavlc( x264_t *h, int i8, int i_mode )
         for( i = 0; i < 16; i++ )
             h->dct.luma4x4[i4+i8*4][i] = h->dct.luma8x8[i8][i4+i*4];
         h->mb.cache.non_zero_count[x264_scan8[i4+i8*4]] =
-            array_non_zero_count( h->dct.luma4x4[i4+i8*4], 16 );
+            array_non_zero_count( h->dct.luma4x4[i4+i8*4] );
         block_residual_write_cavlc( h, &h->out.bs, i4+i8*4, h->dct.luma4x4[i4+i8*4], 16 );
     }
     return h->out.bs.i_bits_encoded;
@@ -794,7 +794,7 @@ static int x264_i8x8_chroma_size_cavlc( x264_t *h )
             int i;
             for( i = 16; i < 24; i++ )
             {
-                h->mb.cache.non_zero_count[x264_scan8[i]] = array_non_zero_count( h->dct.luma4x4[i]+1, 15 );
+                h->mb.cache.non_zero_count[x264_scan8[i]] = array_non_zero_count( h->dct.luma4x4[i] );
                 block_residual_write_cavlc( h, &h->out.bs, i, h->dct.luma4x4[i]+1, 15 );
             }
         }
diff --git a/encoder/encoder.c b/encoder/encoder.c
index cffaeeb6..533e8a83 100644
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -300,6 +300,14 @@ static void x264_slice_header_write( bs_t *s, x264_slice_header_t *sh, int i_nal
 
 static int x264_validate_parameters( x264_t *h )
 {
+#ifdef HAVE_MMX
+    if( !(x264_cpu_detect() & X264_CPU_MMXEXT) )
+    {
+        x264_log( h, X264_LOG_ERROR, "your cpu does not support MMXEXT, but x264 was compiled with asm support\n");
+        x264_log( h, X264_LOG_ERROR, "to run x264, recompile without asm support (configure --disable-asm)\n");
+        return -1;
+    }
+#endif
     if( h->param.i_width <= 0 || h->param.i_height <= 0 )
     {
         x264_log( h, X264_LOG_ERROR, "invalid width x height (%dx%d)\n",
diff --git a/encoder/macroblock.h b/encoder/macroblock.h
index e80995d7..5ac58349 100644
--- a/encoder/macroblock.h
+++ b/encoder/macroblock.h
@@ -54,29 +54,5 @@ void x264_quant_8x8_trellis( x264_t *h, int16_t dct[8][8], int i_quant_cat,
 void x264_noise_reduction_update( x264_t *h );
 void x264_denoise_dct( x264_t *h, int16_t *dct );
 
-#define array_non_zero(a) array_non_zero_int(a, sizeof(a))
-static inline int array_non_zero_int( void *v, int i_count )
-{
-    int i;
-    uint64_t *x = v;
-    i_count /= sizeof(uint64_t);
-    for( i = 0; i < i_count; i++ )
-        if( x[i] ) return 1;
-    return 0;
-}
-
-static inline int array_non_zero_count( int16_t *v, int i_count )
-{
-    int i;
-    int i_nz;
-
-    for( i = 0, i_nz = 0; i < i_count; i++ )
-        if( v[i] )
-            i_nz++;
-
-    return i_nz;
-}
-
-
 #endif
 
-- 
2.50.1