From 1bf7228f7e975e9220daae5a439797aaea2aa511 Mon Sep 17 00:00:00 2001 From: Fiona Glaser Date: Wed, 5 Nov 2008 03:11:45 -0800 Subject: [PATCH] Initial Nehalem CPU optimizations movaps/movups are no longer equivalent to their integer counterparts (movdqa/movdqu) on Nehalem, so that substitution is removed. Nehalem has a much lower cacheline split penalty than previous Intel CPUs, so cacheline workarounds are no longer necessary. Thanks to Intel for providing Avail Media with the pre-release Nehalem CPU needed to prepare these (and other not-yet-committed) optimizations. Overall speed improvement with Nehalem vs Penryn at the same clock speed is around 40%. --- common/cpu.c | 7 +++++-- common/x86/x86inc.asm | 4 ---- encoder/encoder.c | 3 +++ x264.h | 5 +++-- 4 files changed, 11 insertions(+), 8 deletions(-) diff --git a/common/cpu.c b/common/cpu.c index 307a0ee1..2d722c69 100644 --- a/common/cpu.c +++ b/common/cpu.c @@ -48,7 +48,8 @@ const x264_cpu_name_t x264_cpu_names[] = { {"SSE3", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3}, {"SSSE3", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3}, {"PHADD", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_PHADD_IS_FAST}, - {"SSE4", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4}, + {"SSE4.1", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4}, + {"SSE4.2", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4|X264_CPU_SSE42}, {"Cache32", X264_CPU_CACHELINE_32}, {"Cache64", X264_CPU_CACHELINE_64}, {"Slow_mod4_stack", X264_CPU_STACK_MOD4}, @@ -91,6 +92,8 @@ uint32_t x264_cpu_detect( void ) cpu |= X264_CPU_SSSE3; if( ecx&0x00080000 ) cpu |= X264_CPU_SSE4; + if( ecx&0x00100000 ) + cpu |= X264_CPU_SSE42; if( cpu & X264_CPU_SSSE3 ) cpu |= X264_CPU_SSE2_IS_FAST; @@ -131,7 +134,7 @@ uint32_t 
x264_cpu_detect( void ) } } - if( !strcmp((char*)vendor, "GenuineIntel") || !strcmp((char*)vendor, "CyrixInstead") ) + if( (!strcmp((char*)vendor, "GenuineIntel") || !strcmp((char*)vendor, "CyrixInstead")) && !(cpu&X264_CPU_SSE42)) { /* cacheline size is specified in 3 places, any of which may be missing */ x264_cpu_cpuid( 1, &eax, &ebx, &ecx, &edx ); diff --git a/common/x86/x86inc.asm b/common/x86/x86inc.asm index dc06e7c5..9a4a92b4 100644 --- a/common/x86/x86inc.asm +++ b/common/x86/x86inc.asm @@ -474,7 +474,3 @@ INIT_MMX %endif %endmacro -; substitutions which are functionally identical but reduce code size -%define movdqa movaps -%define movdqu movups - diff --git a/encoder/encoder.c b/encoder/encoder.c index 2e7ea80c..4a9860fa 100644 --- a/encoder/encoder.c +++ b/encoder/encoder.c @@ -744,6 +744,9 @@ x264_t *x264_encoder_open ( x264_param_t *param ) if( !strcmp(x264_cpu_names[i].name, "SSE3") && (param->cpu & X264_CPU_SSSE3 || !(param->cpu & X264_CPU_CACHELINE_64)) ) continue; + if( !strcmp(x264_cpu_names[i].name, "SSE4.1") + && (param->cpu & X264_CPU_SSE42) ) + continue; if( (param->cpu & x264_cpu_names[i].flags) == x264_cpu_names[i].flags && (!i || x264_cpu_names[i].flags != x264_cpu_names[i-1].flags) ) p += sprintf( p, " %s", x264_cpu_names[i].name ); diff --git a/x264.h b/x264.h index afb8a419..323f9bbc 100644 --- a/x264.h +++ b/x264.h @@ -58,8 +58,9 @@ typedef struct x264_t x264_t; #define X264_CPU_SSE3 0x000200 #define X264_CPU_SSSE3 0x000400 #define X264_CPU_PHADD_IS_FAST 0x000800 /* pre-Penryn Core2 have a uselessly slow PHADD instruction */ -#define X264_CPU_SSE4 0x001000 /* SSE4.1 */ -#define X264_CPU_STACK_MOD4 0x002000 /* if stack is only mod4 and not mod16 */ +#define X264_CPU_STACK_MOD4 0x001000 /* if stack is only mod4 and not mod16 */ +#define X264_CPU_SSE4 0x002000 /* SSE4.1 */ +#define X264_CPU_SSE42 0x004000 /* SSE4.2 */ /* Analyse flags */ -- 2.40.0