From 1bf7228f7e975e9220daae5a439797aaea2aa511 Mon Sep 17 00:00:00 2001
From: Fiona Glaser <fiona@x264.com>
Date: Wed, 5 Nov 2008 03:11:45 -0800
Subject: [PATCH] Initial Nehalem CPU optimizations. movaps/movups are no longer
 equivalent to their integer counterparts on the Nehalem, so that substitution
 is removed. Nehalem has a much lower cacheline split penalty than previous
 Intel CPUs, so cacheline workarounds are no longer necessary. Thanks to Intel
 for providing Avail Media with the pre-release Nehalem CPU needed to prepare
 these (and other not-yet-committed) optimizations. Overall speed improvement
 with Nehalem vs Penryn at the same clock speed is around 40%.

---
 common/cpu.c          | 7 +++++--
 common/x86/x86inc.asm | 4 ----
 encoder/encoder.c     | 3 +++
 x264.h                | 5 +++--
 4 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/common/cpu.c b/common/cpu.c
index 307a0ee1..2d722c69 100644
--- a/common/cpu.c
+++ b/common/cpu.c
@@ -48,7 +48,8 @@ const x264_cpu_name_t x264_cpu_names[] = {
     {"SSE3",    X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3},
     {"SSSE3",   X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3},
     {"PHADD",   X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_PHADD_IS_FAST},
-    {"SSE4",    X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4},
+    {"SSE4.1",  X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4},
+    {"SSE4.2",  X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4|X264_CPU_SSE42},
     {"Cache32", X264_CPU_CACHELINE_32},
     {"Cache64", X264_CPU_CACHELINE_64},
     {"Slow_mod4_stack", X264_CPU_STACK_MOD4},
@@ -91,6 +92,8 @@ uint32_t x264_cpu_detect( void )
         cpu |= X264_CPU_SSSE3;
     if( ecx&0x00080000 )
         cpu |= X264_CPU_SSE4;
+    if( ecx&0x00100000 )
+        cpu |= X264_CPU_SSE42;
 
     if( cpu & X264_CPU_SSSE3 )
         cpu |= X264_CPU_SSE2_IS_FAST;
@@ -131,7 +134,7 @@ uint32_t x264_cpu_detect( void )
         }
     }
 
-    if( !strcmp((char*)vendor, "GenuineIntel") || !strcmp((char*)vendor, "CyrixInstead") )
+    if( (!strcmp((char*)vendor, "GenuineIntel") || !strcmp((char*)vendor, "CyrixInstead")) && !(cpu&X264_CPU_SSE42))
     {
         /* cacheline size is specified in 3 places, any of which may be missing */
         x264_cpu_cpuid( 1, &eax, &ebx, &ecx, &edx );
diff --git a/common/x86/x86inc.asm b/common/x86/x86inc.asm
index dc06e7c5..9a4a92b4 100644
--- a/common/x86/x86inc.asm
+++ b/common/x86/x86inc.asm
@@ -474,7 +474,3 @@ INIT_MMX
     %endif
 %endmacro
 
-; substitutions which are functionally identical but reduce code size
-%define movdqa movaps
-%define movdqu movups
-
diff --git a/encoder/encoder.c b/encoder/encoder.c
index 2e7ea80c..4a9860fa 100644
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -744,6 +744,9 @@ x264_t *x264_encoder_open   ( x264_param_t *param )
         if( !strcmp(x264_cpu_names[i].name, "SSE3")
             && (param->cpu & X264_CPU_SSSE3 || !(param->cpu & X264_CPU_CACHELINE_64)) )
             continue;
+        if( !strcmp(x264_cpu_names[i].name, "SSE4.1")
+            && (param->cpu & X264_CPU_SSE42) )
+            continue;
         if( (param->cpu & x264_cpu_names[i].flags) == x264_cpu_names[i].flags
             && (!i || x264_cpu_names[i].flags != x264_cpu_names[i-1].flags) )
             p += sprintf( p, " %s", x264_cpu_names[i].name );
diff --git a/x264.h b/x264.h
index afb8a419..323f9bbc 100644
--- a/x264.h
+++ b/x264.h
@@ -58,8 +58,9 @@ typedef struct x264_t x264_t;
 #define X264_CPU_SSE3           0x000200
 #define X264_CPU_SSSE3          0x000400
 #define X264_CPU_PHADD_IS_FAST  0x000800  /* pre-Penryn Core2 have a uselessly slow PHADD instruction */
-#define X264_CPU_SSE4           0x001000  /* SSE4.1 */
-#define X264_CPU_STACK_MOD4     0x002000  /* if stack is only mod4 and not mod16 */
+#define X264_CPU_STACK_MOD4     0x001000  /* if stack is only mod4 and not mod16 */
+#define X264_CPU_SSE4           0x002000  /* SSE4.1 */
+#define X264_CPU_SSE42          0x004000  /* SSE4.2 */
 
 /* Analyse flags
  */
-- 
2.40.0