From: Loren Merritt <pengvado@akuvian.org>
Date: Tue, 22 Apr 2008 23:16:25 +0000 (-0600)
Subject: drop support for pre-SSE3 assemblers
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=ad6c91f064e6e6ceab3b876713006e5e1fb3f574;p=libx264

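drop support for pre-SSE3 assemblers

configure now enables HAVE_MMX only if the assembler accepts an SSSE3
instruction (it probes with "pabsw xmm0, xmm0"); otherwise the asm is
disabled entirely. The separate HAVE_SSE3 define is therefore gone, and
the SSE3/SSSE3 code paths it guarded in the C and asm sources are now
always built whenever HAVE_MMX is set.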
---

diff --git a/common/cpu.c b/common/cpu.c
index f79f0031..47a72f76 100644
--- a/common/cpu.c
+++ b/common/cpu.c
@@ -84,12 +84,10 @@ uint32_t x264_cpu_detect( void )
         cpu |= X264_CPU_MMXEXT|X264_CPU_SSE;
     if( edx&0x04000000 )
         cpu |= X264_CPU_SSE2;
-#ifdef HAVE_SSE3
     if( ecx&0x00000001 )
         cpu |= X264_CPU_SSE3;
     if( ecx&0x00000200 )
         cpu |= X264_CPU_SSSE3;
-#endif
 
     x264_cpu_cpuid( 0x80000000, &eax, &ebx, &ecx, &edx );
     max_extended_cap = eax;
diff --git a/common/dct.c b/common/dct.c
index bdc92929..669e24f3 100644
--- a/common/dct.c
+++ b/common/dct.c
@@ -580,7 +580,7 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced )
         pf->scan_8x8   = zigzag_scan_8x8_frame;
         pf->scan_4x4   = zigzag_scan_4x4_frame;
         pf->sub_4x4    = zigzag_sub_4x4_frame;
-#ifdef HAVE_SSE3
+#ifdef HAVE_MMX
         if( cpu&X264_CPU_SSSE3 )
             pf->sub_4x4 = x264_zigzag_sub_4x4_frame_ssse3;
 #endif
diff --git a/common/pixel.c b/common/pixel.c
index 1d5567b6..133968cc 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -359,10 +359,8 @@ SATD_X_DECL7()
 #ifdef HAVE_MMX
 SATD_X_DECL7( _mmxext )
 SATD_X_DECL5( _sse2 )
-#ifdef HAVE_SSE3
 SATD_X_DECL7( _ssse3 )
 #endif
-#endif
 
 /****************************************************************************
  * structural similarity metric
@@ -623,7 +621,6 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
 #endif
     }
 
-#ifdef HAVE_SSE3
     if( (cpu&X264_CPU_SSE3) && (cpu&X264_CPU_CACHELINE_SPLIT) )
     {
         INIT2( sad, _sse3 );
@@ -652,7 +649,6 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
             INIT2( sad_x4, _cache64_ssse3 );
         }
     }
-#endif //HAVE_SSE3
 #endif //HAVE_MMX
 
 #ifdef ARCH_PPC
diff --git a/common/quant.c b/common/quant.c
index 270f9798..38581f45 100644
--- a/common/quant.c
+++ b/common/quant.c
@@ -240,16 +240,14 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
             pf->dequant_8x8 = x264_dequant_8x8_flat16_sse2;
         }
     }
-#endif
 
-#ifdef HAVE_SSE3
     if( cpu&X264_CPU_SSSE3 )
     {
         pf->quant_4x4_dc = x264_quant_4x4_dc_ssse3;
         pf->quant_4x4 = x264_quant_4x4_ssse3;
         pf->quant_8x8 = x264_quant_8x8_ssse3;
     }
-#endif
+#endif // HAVE_MMX
 
 #ifdef ARCH_PPC
     if( cpu&X264_CPU_ALTIVEC ) {
diff --git a/common/x86/dct-a.asm b/common/x86/dct-a.asm
index 5491b238..77baddaa 100644
--- a/common/x86/dct-a.asm
+++ b/common/x86/dct-a.asm
@@ -325,7 +325,6 @@ cglobal x264_zigzag_scan_4x4_field_mmxext, 2,3
     mov    [r0+12], r2d
     RET
 
-%ifdef HAVE_SSE3
 ;-----------------------------------------------------------------------------
 ; void x264_zigzag_sub_4x4_frame_ssse3( int16_t level[16], const uint8_t *src, uint8_t *dst )
 ;-----------------------------------------------------------------------------
@@ -364,4 +363,3 @@ cglobal x264_zigzag_sub_4x4_frame_ssse3, 3,3
     movdqa    [r0], xmm0
     movdqa [r0+16], xmm1
     RET
-%endif
diff --git a/common/x86/mc-a.asm b/common/x86/mc-a.asm
index ed1e3326..bbf85392 100644
--- a/common/x86/mc-a.asm
+++ b/common/x86/mc-a.asm
@@ -275,11 +275,9 @@ cglobal x264_pixel_avg2_w20_%1, 6,7
 %endmacro
 
 PIXEL_AVG_SSE sse2
-%ifdef HAVE_SSE3
 %define movdqu lddqu
 PIXEL_AVG_SSE sse3
 %undef movdqu
-%endif
 
 ; Cacheline split code for processors with high latencies for loads
 ; split over cache lines.  See sad-a.asm for a more detailed explanation.
@@ -481,9 +479,7 @@ cglobal %1, 5,7
 COPY_W16_SSE2 x264_mc_copy_w16_sse2, movdqu
 ; cacheline split with mmx has too much overhead; the speed benefit is near-zero.
 ; but with SSE3 the overhead is zero, so there's no reason not to include it.
-%ifdef HAVE_SSE3
 COPY_W16_SSE2 x264_mc_copy_w16_sse3, lddqu
-%endif
 COPY_W16_SSE2 x264_mc_copy_w16_aligned_sse2, movdqa
 
 
diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm
index 3b04b70c..b05d2944 100644
--- a/common/x86/mc-a2.asm
+++ b/common/x86/mc-a2.asm
@@ -309,10 +309,8 @@ cglobal x264_hpel_filter_h_sse2, 3,3,1
 %define PALIGNR PALIGNR_SSE2
 HPEL_V sse2
 HPEL_C sse2
-%ifdef HAVE_SSE3
 %define PALIGNR PALIGNR_SSSE3
 HPEL_C ssse3
-%endif
 
 cglobal x264_sfence
     sfence
diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c
index fd202da4..1144c36f 100644
--- a/common/x86/mc-c.c
+++ b/common/x86/mc-c.c
@@ -102,9 +102,7 @@ PIXEL_AVG_WTAB(cache32_mmxext, mmxext, cache32_mmxext, cache32_mmxext, cache32_m
 PIXEL_AVG_WTAB(cache64_mmxext, mmxext, cache64_mmxext, cache64_mmxext, cache64_mmxext, cache64_mmxext)
 PIXEL_AVG_WTAB(sse2, mmxext, mmxext, mmxext, sse2, sse2)
 PIXEL_AVG_WTAB(cache64_sse2, mmxext, cache64_mmxext, cache64_sse2, cache64_sse2, cache64_sse2)
-#ifdef HAVE_SSE3
 PIXEL_AVG_WTAB(cache64_sse3, mmxext, cache64_mmxext, sse3, sse3, sse3)
-#endif
 
 #define MC_COPY_WTAB(instr, name1, name2, name3)\
 static void (* const x264_mc_copy_wtab_##instr[5])( uint8_t *, int, uint8_t *, int, int ) =\
@@ -118,9 +116,7 @@ static void (* const x264_mc_copy_wtab_##instr[5])( uint8_t *, int, uint8_t *, i
 
 MC_COPY_WTAB(mmx,mmx,mmx,mmx)
 MC_COPY_WTAB(sse2,mmx,mmx,sse2)
-#ifdef HAVE_SSE3
 MC_COPY_WTAB(sse3,mmx,mmx,sse3)
-#endif
 
 static const int hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1};
 static const int hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2};
@@ -155,9 +151,7 @@ MC_LUMA(cache64_mmxext,cache64_mmxext,mmx)
 #endif
 MC_LUMA(sse2,sse2,sse2)
 MC_LUMA(cache64_sse2,cache64_sse2,sse2)
-#ifdef HAVE_SSE3
 MC_LUMA(cache64_sse3,cache64_sse3,sse3)
-#endif
 
 #define GET_REF(name)\
 uint8_t *get_ref_##name( uint8_t *dst,   int *i_dst_stride,\
@@ -190,9 +184,7 @@ GET_REF(cache64_mmxext)
 #endif
 GET_REF(sse2)
 GET_REF(cache64_sse2)
-#ifdef HAVE_SSE3
 GET_REF(cache64_sse3)
-#endif
 
 #define HPEL(align, cpu, cpuv, cpuc, cpuh)\
 void x264_hpel_filter_v_##cpuv( uint8_t *dst, uint8_t *src, int16_t *buf, int stride, int width);\
@@ -227,9 +219,7 @@ void x264_hpel_filter_##cpu( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_
 HPEL(8, mmxext, mmxext, mmxext, mmxext)
 HPEL(16, sse2_amd, mmxext, mmxext, sse2)
 HPEL(16, sse2, sse2, sse2, sse2)
-#ifdef HAVE_SSE3
 HPEL(16, ssse3, sse2, ssse3, sse2)
-#endif
 
 void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
 {
@@ -305,20 +295,16 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
     {
         pf->mc_luma = mc_luma_cache64_sse2;
         pf->get_ref = get_ref_cache64_sse2;
-#ifdef HAVE_SSE3
         /* lddqu doesn't work on Core2 */
         if( (cpu&X264_CPU_SSE3) && !(cpu&X264_CPU_SSSE3) )
         {
             pf->mc_luma = mc_luma_cache64_sse3;
             pf->get_ref = get_ref_cache64_sse3;
         }
-#endif
     }
 
     if( !(cpu&X264_CPU_SSSE3) )
         return;
 
-#ifdef HAVE_SSE3
     pf->hpel_filter = x264_hpel_filter_ssse3;
-#endif
 }
diff --git a/common/x86/pixel-a.asm b/common/x86/pixel-a.asm
index b4d06561..9eed1dbc 100644
--- a/common/x86/pixel-a.asm
+++ b/common/x86/pixel-a.asm
@@ -1272,7 +1272,6 @@ SATDS_SSE2 sse2
 SA8D_16x16_32 sse2
 INTRA_SA8D_SSE2 sse2
 INTRA_SATDS_MMX mmxext
-%ifdef HAVE_SSE3
 %define ABS1 ABS1_SSSE3
 %define ABS2 ABS2_SSSE3
 SATDS_SSE2 ssse3
@@ -1280,7 +1279,6 @@ SA8D_16x16_32 ssse3
 INTRA_SA8D_SSE2 ssse3
 INTRA_SATDS_MMX ssse3
 SATD_W4 ssse3 ; mmx, but uses pabsw from ssse3.
-%endif
 
 
 
@@ -1655,10 +1653,8 @@ cglobal x264_pixel_ads1_%1, 4,7
 %endmacro
 
 ADS_SSE2 sse2
-%ifdef HAVE_SSE3
 %define ABS1 ABS1_SSSE3
 ADS_SSE2 ssse3
-%endif
 
 ; int x264_pixel_ads_mvs( int16_t *mvs, uint8_t *masks, int width )
 ; {
diff --git a/common/x86/predict-c.c b/common/x86/predict-c.c
index 3982a0d9..18a115cb 100644
--- a/common/x86/predict-c.c
+++ b/common/x86/predict-c.c
@@ -483,9 +483,7 @@ void x264_intra_sa8d_x3_8x8_##cpu( uint8_t *fenc, uint8_t edge[33], int res[3] )
 
 #ifdef ARCH_X86_64
 INTRA_SA8D_X3(sse2)
-#ifdef HAVE_SSE3
 INTRA_SA8D_X3(ssse3)
-#endif
 #else
 INTRA_SA8D_X3(mmxext)
 #endif
diff --git a/common/x86/quant-a.asm b/common/x86/quant-a.asm
index 90aebf7f..693432dd 100644
--- a/common/x86/quant-a.asm
+++ b/common/x86/quant-a.asm
@@ -145,11 +145,9 @@ QUANT_DC x264_quant_4x4_dc_sse2, QUANT_MMX, 2, 16
 QUANT_AC x264_quant_4x4_sse2, QUANT_MMX, 2, 16
 QUANT_AC x264_quant_8x8_sse2, QUANT_MMX, 8, 16
 
-%ifdef HAVE_SSE3
 QUANT_DC x264_quant_4x4_dc_ssse3, QUANT_SSSE3, 2, 16
 QUANT_AC x264_quant_4x4_ssse3, QUANT_SSSE3, 2, 16
 QUANT_AC x264_quant_8x8_ssse3, QUANT_SSSE3, 8, 16
-%endif
 
 
 
diff --git a/common/x86/sad-a.asm b/common/x86/sad-a.asm
index 6e31921c..3709e28c 100644
--- a/common/x86/sad-a.asm
+++ b/common/x86/sad-a.asm
@@ -25,7 +25,7 @@
 %include "x86inc.asm"
 
 SECTION_RODATA
-sw_64: dq 64
+sw_64: dd 64
 
 SECTION .text
 
@@ -213,11 +213,9 @@ cglobal x264_pixel_sad_16x8_%1, 4,4
 %endmacro
 
 SAD_W16 sse2
-%ifdef HAVE_SSE3
 %define movdqu lddqu
 SAD_W16 sse3
 %undef movdqu
-%endif
 
 
 
@@ -613,14 +611,12 @@ SAD_X_SSE2 3, 16,  8, sse2
 SAD_X_SSE2 4, 16, 16, sse2
 SAD_X_SSE2 4, 16,  8, sse2
 
-%ifdef HAVE_SSE3
 %define movdqu lddqu
 SAD_X_SSE2 3, 16, 16, sse3
 SAD_X_SSE2 3, 16,  8, sse3
 SAD_X_SSE2 4, 16, 16, sse3
 SAD_X_SSE2 4, 16,  8, sse3
 %undef movdqu
-%endif
 
 
 
@@ -961,7 +957,6 @@ SADX34_CACHELINE_FUNC 16, 16, 64, sse2, sse2
 SADX34_CACHELINE_FUNC 16,  8, 64, sse2, sse2
 %endif ; !ARCH_X86_64
 
-%ifdef HAVE_SSE3
 SAD16_CACHELINE_FUNC ssse3, 8
 SAD16_CACHELINE_FUNC ssse3, 16
 %assign i 1
@@ -971,4 +966,3 @@ SAD16_CACHELINE_LOOP_SSSE3 i
 %endrep
 SADX34_CACHELINE_FUNC 16, 16, 64, sse2, ssse3
 SADX34_CACHELINE_FUNC 16,  8, 64, sse2, ssse3
-%endif ; HAVE_SSE3
diff --git a/configure b/configure
index 19d76d48..10e0aa5d 100755
--- a/configure
+++ b/configure
@@ -321,12 +321,8 @@ if [ $ARCH = X86 -o $ARCH = X86_64 ] ; then
          echo "yasm prior to 0.6.2 miscompiles PIC. trying nasm instead..."
          AS=nasm
     fi
-    if as_check ; then
+    if as_check "pabsw xmm0, xmm0" ; then
         CFLAGS="$CFLAGS -DHAVE_MMX"
-        if as_check "pabsw xmm0, xmm0" ; then
-            ASFLAGS="$ASFLAGS -DHAVE_SSE3"
-            CFLAGS="$CFLAGS -DHAVE_SSE3"
-        fi
     else
         echo "No suitable assembler found.  x264 will be several times slower."
         echo "Please install 'yasm' to get MMX/SSE optimized code."