granicus.if.org Git - libx264/commitdiff
Initial Nehalem CPU optimizations
author Fiona Glaser <fiona@x264.com>
Wed, 5 Nov 2008 11:11:45 +0000 (03:11 -0800)
committer Fiona Glaser <fiona@x264.com>
Wed, 5 Nov 2008 11:37:45 +0000 (03:37 -0800)
movaps/movups are no longer equivalent to their integer counterparts (movdqa/movdqu) on Nehalem, so the substitution of movaps/movups for movdqa/movdqu is removed.
Nehalem has a much lower cacheline split penalty than previous Intel CPUs, so cacheline workarounds are no longer necessary.
Thanks to Intel for providing Avail Media with the pre-release Nehalem CPU needed to prepare these (and other not-yet-committed) optimizations.
Overall speed improvement with Nehalem vs Penryn at the same clock speed is around 40%.

common/cpu.c
common/x86/x86inc.asm
encoder/encoder.c
x264.h

index 307a0ee178b5740a11f20d3f02a98ee64a9265ef..2d722c690449e773ff06a61b607b642c481d92b2 100644 (file)
@@ -48,7 +48,8 @@ const x264_cpu_name_t x264_cpu_names[] = {
     {"SSE3",    X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3},
     {"SSSE3",   X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3},
     {"PHADD",   X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_PHADD_IS_FAST},
-    {"SSE4",    X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4},
+    {"SSE4.1",  X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4},
+    {"SSE4.2",  X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4|X264_CPU_SSE42},
     {"Cache32", X264_CPU_CACHELINE_32},
     {"Cache64", X264_CPU_CACHELINE_64},
     {"Slow_mod4_stack", X264_CPU_STACK_MOD4},
@@ -91,6 +92,8 @@ uint32_t x264_cpu_detect( void )
         cpu |= X264_CPU_SSSE3;
     if( ecx&0x00080000 )
         cpu |= X264_CPU_SSE4;
+    if( ecx&0x00100000 )
+        cpu |= X264_CPU_SSE42;
 
     if( cpu & X264_CPU_SSSE3 )
         cpu |= X264_CPU_SSE2_IS_FAST;
@@ -131,7 +134,7 @@ uint32_t x264_cpu_detect( void )
         }
     }
 
-    if( !strcmp((char*)vendor, "GenuineIntel") || !strcmp((char*)vendor, "CyrixInstead") )
+    if( (!strcmp((char*)vendor, "GenuineIntel") || !strcmp((char*)vendor, "CyrixInstead")) && !(cpu&X264_CPU_SSE42))
     {
         /* cacheline size is specified in 3 places, any of which may be missing */
         x264_cpu_cpuid( 1, &eax, &ebx, &ecx, &edx );
index dc06e7c56b91beb347d62fb6ccefd2f17836a2e5..9a4a92b44346c2737fb893962bcb3fa77cc1fa51 100644 (file)
@@ -474,7 +474,3 @@ INIT_MMX
     %endif
 %endmacro
 
-; substitutions which are functionally identical but reduce code size
-%define movdqa movaps
-%define movdqu movups
-
index 2e7ea80cb5eb75f775d20232b3b9bfb1ff24ab7e..4a9860fa0ee44b425c632a94371543aa24611fa7 100644 (file)
@@ -744,6 +744,9 @@ x264_t *x264_encoder_open   ( x264_param_t *param )
         if( !strcmp(x264_cpu_names[i].name, "SSE3")
             && (param->cpu & X264_CPU_SSSE3 || !(param->cpu & X264_CPU_CACHELINE_64)) )
             continue;
+        if( !strcmp(x264_cpu_names[i].name, "SSE4.1")
+            && (param->cpu & X264_CPU_SSE42) )
+            continue;
         if( (param->cpu & x264_cpu_names[i].flags) == x264_cpu_names[i].flags
             && (!i || x264_cpu_names[i].flags != x264_cpu_names[i-1].flags) )
             p += sprintf( p, " %s", x264_cpu_names[i].name );
diff --git a/x264.h b/x264.h
index afb8a4192886608ed62368bc3d3647049bfb1166..323f9bbc4376f9edaf622a3e74d532073d36aebf 100644 (file)
--- a/x264.h
+++ b/x264.h
@@ -58,8 +58,9 @@ typedef struct x264_t x264_t;
 #define X264_CPU_SSE3           0x000200
 #define X264_CPU_SSSE3          0x000400
 #define X264_CPU_PHADD_IS_FAST  0x000800  /* pre-Penryn Core2 have a uselessly slow PHADD instruction */
-#define X264_CPU_SSE4           0x001000  /* SSE4.1 */
-#define X264_CPU_STACK_MOD4     0x002000  /* if stack is only mod4 and not mod16 */
+#define X264_CPU_STACK_MOD4     0x001000  /* if stack is only mod4 and not mod16 */
+#define X264_CPU_SSE4           0x002000  /* SSE4.1 */
+#define X264_CPU_SSE42          0x004000  /* SSE4.2 */
 
 /* Analyse flags
  */