From: Loren Merritt Date: Tue, 22 Apr 2008 23:16:25 +0000 (-0600) Subject: drop support for pre-SSE3 assemblers X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=ad6c91f064e6e6ceab3b876713006e5e1fb3f574;p=libx264 drop support for pre-SSE3 assemblers --- diff --git a/common/cpu.c b/common/cpu.c index f79f0031..47a72f76 100644 --- a/common/cpu.c +++ b/common/cpu.c @@ -84,12 +84,10 @@ uint32_t x264_cpu_detect( void ) cpu |= X264_CPU_MMXEXT|X264_CPU_SSE; if( edx&0x04000000 ) cpu |= X264_CPU_SSE2; -#ifdef HAVE_SSE3 if( ecx&0x00000001 ) cpu |= X264_CPU_SSE3; if( ecx&0x00000200 ) cpu |= X264_CPU_SSSE3; -#endif x264_cpu_cpuid( 0x80000000, &eax, &ebx, &ecx, &edx ); max_extended_cap = eax; diff --git a/common/dct.c b/common/dct.c index bdc92929..669e24f3 100644 --- a/common/dct.c +++ b/common/dct.c @@ -580,7 +580,7 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced ) pf->scan_8x8 = zigzag_scan_8x8_frame; pf->scan_4x4 = zigzag_scan_4x4_frame; pf->sub_4x4 = zigzag_sub_4x4_frame; -#ifdef HAVE_SSE3 +#ifdef HAVE_MMX if( cpu&X264_CPU_SSSE3 ) pf->sub_4x4 = x264_zigzag_sub_4x4_frame_ssse3; #endif diff --git a/common/pixel.c b/common/pixel.c index 1d5567b6..133968cc 100644 --- a/common/pixel.c +++ b/common/pixel.c @@ -359,10 +359,8 @@ SATD_X_DECL7() #ifdef HAVE_MMX SATD_X_DECL7( _mmxext ) SATD_X_DECL5( _sse2 ) -#ifdef HAVE_SSE3 SATD_X_DECL7( _ssse3 ) #endif -#endif /**************************************************************************** * structural similarity metric @@ -623,7 +621,6 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) #endif } -#ifdef HAVE_SSE3 if( (cpu&X264_CPU_SSE3) && (cpu&X264_CPU_CACHELINE_SPLIT) ) { INIT2( sad, _sse3 ); @@ -652,7 +649,6 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) INIT2( sad_x4, _cache64_ssse3 ); } } -#endif //HAVE_SSE3 #endif //HAVE_MMX #ifdef ARCH_PPC diff --git a/common/quant.c b/common/quant.c index 270f9798..38581f45 100644 --- a/common/quant.c +++ b/common/quant.c @@ -240,16 +240,14 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf ) pf->dequant_8x8 = x264_dequant_8x8_flat16_sse2; } } -#endif -#ifdef HAVE_SSE3 if( cpu&X264_CPU_SSSE3 ) { pf->quant_4x4_dc = x264_quant_4x4_dc_ssse3; pf->quant_4x4 = x264_quant_4x4_ssse3; pf->quant_8x8 = x264_quant_8x8_ssse3; } -#endif +#endif // HAVE_MMX #ifdef ARCH_PPC if( cpu&X264_CPU_ALTIVEC ) { diff --git a/common/x86/dct-a.asm b/common/x86/dct-a.asm index 5491b238..77baddaa 100644 --- a/common/x86/dct-a.asm +++ b/common/x86/dct-a.asm @@ -325,7 +325,6 @@ cglobal x264_zigzag_scan_4x4_field_mmxext, 2,3 mov [r0+12], r2d RET -%ifdef HAVE_SSE3 ;----------------------------------------------------------------------------- ; void x264_zigzag_sub_4x4_frame_ssse3( int16_t level[16], const uint8_t *src, uint8_t *dst ) ;----------------------------------------------------------------------------- @@ -364,4 +363,3 @@ cglobal x264_zigzag_sub_4x4_frame_ssse3, 3,3 movdqa [r0], xmm0 movdqa [r0+16], xmm1 RET -%endif diff --git a/common/x86/mc-a.asm b/common/x86/mc-a.asm index ed1e3326..bbf85392 100644 --- a/common/x86/mc-a.asm +++ b/common/x86/mc-a.asm @@ -275,11 +275,9 @@ cglobal x264_pixel_avg2_w20_%1, 6,7 %endmacro PIXEL_AVG_SSE sse2 -%ifdef HAVE_SSE3 %define movdqu lddqu PIXEL_AVG_SSE sse3 %undef movdqu -%endif ; Cacheline split code for processors with high latencies for loads ; split over cache lines. See sad-a.asm for a more detailed explanation. @@ -481,9 +479,7 @@ cglobal %1, 5,7 COPY_W16_SSE2 x264_mc_copy_w16_sse2, movdqu ; cacheline split with mmx has too much overhead; the speed benefit is near-zero. ; but with SSE3 the overhead is zero, so there's no reason not to include it. -%ifdef HAVE_SSE3 COPY_W16_SSE2 x264_mc_copy_w16_sse3, lddqu -%endif COPY_W16_SSE2 x264_mc_copy_w16_aligned_sse2, movdqa diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm index 3b04b70c..b05d2944 100644 --- a/common/x86/mc-a2.asm +++ b/common/x86/mc-a2.asm @@ -309,10 +309,8 @@ cglobal x264_hpel_filter_h_sse2, 3,3,1 %define PALIGNR PALIGNR_SSE2 HPEL_V sse2 HPEL_C sse2 -%ifdef HAVE_SSE3 %define PALIGNR PALIGNR_SSSE3 HPEL_C ssse3 -%endif cglobal x264_sfence sfence diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c index fd202da4..1144c36f 100644 --- a/common/x86/mc-c.c +++ b/common/x86/mc-c.c @@ -102,9 +102,7 @@ PIXEL_AVG_WTAB(cache32_mmxext, mmxext, cache32_mmxext, cache32_mmxext, cache32_m PIXEL_AVG_WTAB(cache64_mmxext, mmxext, cache64_mmxext, cache64_mmxext, cache64_mmxext, cache64_mmxext) PIXEL_AVG_WTAB(sse2, mmxext, mmxext, mmxext, sse2, sse2) PIXEL_AVG_WTAB(cache64_sse2, mmxext, cache64_mmxext, cache64_sse2, cache64_sse2, cache64_sse2) -#ifdef HAVE_SSE3 PIXEL_AVG_WTAB(cache64_sse3, mmxext, cache64_mmxext, sse3, sse3, sse3) -#endif #define MC_COPY_WTAB(instr, name1, name2, name3)\ static void (* const x264_mc_copy_wtab_##instr[5])( uint8_t *, int, uint8_t *, int, int ) =\ @@ -118,9 +116,7 @@ static void (* const x264_mc_copy_wtab_##instr[5])( uint8_t *, int, uint8_t *, i MC_COPY_WTAB(mmx,mmx,mmx,mmx) MC_COPY_WTAB(sse2,mmx,mmx,sse2) -#ifdef HAVE_SSE3 MC_COPY_WTAB(sse3,mmx,mmx,sse3) -#endif static const int hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1}; static const int hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2}; @@ -155,9 +151,7 @@ MC_LUMA(cache64_mmxext,cache64_mmxext,mmx) #endif MC_LUMA(sse2,sse2,sse2) MC_LUMA(cache64_sse2,cache64_sse2,sse2) -#ifdef HAVE_SSE3 MC_LUMA(cache64_sse3,cache64_sse3,sse3) -#endif #define GET_REF(name)\ uint8_t *get_ref_##name( uint8_t *dst, int *i_dst_stride,\ @@ -190,9 +184,7 @@ GET_REF(cache64_mmxext) #endif GET_REF(sse2) GET_REF(cache64_sse2) -#ifdef HAVE_SSE3 GET_REF(cache64_sse3) -#endif #define HPEL(align, cpu, cpuv, cpuc, cpuh)\ void x264_hpel_filter_v_##cpuv( uint8_t *dst, uint8_t *src, int16_t *buf, int stride, int width);\ @@ -227,9 +219,7 @@ void x264_hpel_filter_##cpu( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_ HPEL(8, mmxext, mmxext, mmxext, mmxext) HPEL(16, sse2_amd, mmxext, mmxext, sse2) HPEL(16, sse2, sse2, sse2, sse2) -#ifdef HAVE_SSE3 HPEL(16, ssse3, sse2, ssse3, sse2) -#endif void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf ) { @@ -305,20 +295,16 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf ) { pf->mc_luma = mc_luma_cache64_sse2; pf->get_ref = get_ref_cache64_sse2; -#ifdef HAVE_SSE3 /* lddqu doesn't work on Core2 */ if( (cpu&X264_CPU_SSE3) && !(cpu&X264_CPU_SSSE3) ) { pf->mc_luma = mc_luma_cache64_sse3; pf->get_ref = get_ref_cache64_sse3; } -#endif } if( !(cpu&X264_CPU_SSSE3) ) return; -#ifdef HAVE_SSE3 pf->hpel_filter = x264_hpel_filter_ssse3; -#endif } diff --git a/common/x86/pixel-a.asm b/common/x86/pixel-a.asm index b4d06561..9eed1dbc 100644 --- a/common/x86/pixel-a.asm +++ b/common/x86/pixel-a.asm @@ -1272,7 +1272,6 @@ SATDS_SSE2 sse2 SA8D_16x16_32 sse2 INTRA_SA8D_SSE2 sse2 INTRA_SATDS_MMX mmxext -%ifdef HAVE_SSE3 %define ABS1 ABS1_SSSE3 %define ABS2 ABS2_SSSE3 SATDS_SSE2 ssse3 @@ -1280,7 +1279,6 @@ SA8D_16x16_32 ssse3 INTRA_SA8D_SSE2 ssse3 INTRA_SATDS_MMX ssse3 SATD_W4 ssse3 ; mmx, but uses pabsw from ssse3. -%endif @@ -1655,10 +1653,8 @@ cglobal x264_pixel_ads1_%1, 4,7 %endmacro ADS_SSE2 sse2 -%ifdef HAVE_SSE3 %define ABS1 ABS1_SSSE3 ADS_SSE2 ssse3 -%endif ; int x264_pixel_ads_mvs( int16_t *mvs, uint8_t *masks, int width ) ; { diff --git a/common/x86/predict-c.c b/common/x86/predict-c.c index 3982a0d9..18a115cb 100644 --- a/common/x86/predict-c.c +++ b/common/x86/predict-c.c @@ -483,9 +483,7 @@ void x264_intra_sa8d_x3_8x8_##cpu( uint8_t *fenc, uint8_t edge[33], int res[3] ) #ifdef ARCH_X86_64 INTRA_SA8D_X3(sse2) -#ifdef HAVE_SSE3 INTRA_SA8D_X3(ssse3) -#endif #else INTRA_SA8D_X3(mmxext) #endif diff --git a/common/x86/quant-a.asm b/common/x86/quant-a.asm index 90aebf7f..693432dd 100644 --- a/common/x86/quant-a.asm +++ b/common/x86/quant-a.asm @@ -145,11 +145,9 @@ QUANT_DC x264_quant_4x4_dc_sse2, QUANT_MMX, 2, 16 QUANT_AC x264_quant_4x4_sse2, QUANT_MMX, 2, 16 QUANT_AC x264_quant_8x8_sse2, QUANT_MMX, 8, 16 -%ifdef HAVE_SSE3 QUANT_DC x264_quant_4x4_dc_ssse3, QUANT_SSSE3, 2, 16 QUANT_AC x264_quant_4x4_ssse3, QUANT_SSSE3, 2, 16 QUANT_AC x264_quant_8x8_ssse3, QUANT_SSSE3, 8, 16 -%endif diff --git a/common/x86/sad-a.asm b/common/x86/sad-a.asm index 6e31921c..3709e28c 100644 --- a/common/x86/sad-a.asm +++ b/common/x86/sad-a.asm @@ -25,7 +25,7 @@ %include "x86inc.asm" SECTION_RODATA -sw_64: dq 64 +sw_64: dd 64 SECTION .text @@ -213,11 +213,9 @@ cglobal x264_pixel_sad_16x8_%1, 4,4 %endmacro SAD_W16 sse2 -%ifdef HAVE_SSE3 %define movdqu lddqu SAD_W16 sse3 %undef movdqu -%endif @@ -613,14 +611,12 @@ SAD_X_SSE2 3, 16, 8, sse2 SAD_X_SSE2 4, 16, 16, sse2 SAD_X_SSE2 4, 16, 8, sse2 -%ifdef HAVE_SSE3 %define movdqu lddqu SAD_X_SSE2 3, 16, 16, sse3 SAD_X_SSE2 3, 16, 8, sse3 SAD_X_SSE2 4, 16, 16, sse3 SAD_X_SSE2 4, 16, 8, sse3 %undef movdqu -%endif @@ -961,7 +957,6 @@ SADX34_CACHELINE_FUNC 16, 16, 64, sse2, sse2 SADX34_CACHELINE_FUNC 16, 8, 64, sse2, sse2 %endif ; !ARCH_X86_64 -%ifdef HAVE_SSE3 SAD16_CACHELINE_FUNC ssse3, 8 SAD16_CACHELINE_FUNC ssse3, 16 %assign i 1 @@ -971,4 +966,3 @@ SAD16_CACHELINE_LOOP_SSSE3 i %endrep SADX34_CACHELINE_FUNC 16, 16, 64, sse2, ssse3 SADX34_CACHELINE_FUNC 16, 8, 64, sse2, ssse3 -%endif ; HAVE_SSE3 diff --git a/configure b/configure index 19d76d48..10e0aa5d 100755 --- a/configure +++ b/configure @@ -321,12 +321,8 @@ if [ $ARCH = X86 -o $ARCH = X86_64 ] ; then echo "yasm prior to 0.6.2 miscompiles PIC. trying nasm instead..." AS=nasm fi - if as_check ; then + if as_check "pabsw xmm0, xmm0" ; then CFLAGS="$CFLAGS -DHAVE_MMX" - if as_check "pabsw xmm0, xmm0" ; then - ASFLAGS="$ASFLAGS -DHAVE_SSE3" - CFLAGS="$CFLAGS -DHAVE_SSE3" - fi else echo "No suitable assembler found. x264 will be several times slower." echo "Please install 'yasm' to get MMX/SSE optimized code."