From: Loren Merritt <pengvado@akuvian.org>
Date: Sat, 7 Jun 2008 05:30:37 +0000 (-0600)
Subject: enable ssse3 phadd satd on Penryn.
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=f9ad5ee2564bb272635f0c69fefa28e0b1b47f37;p=libx264

enable ssse3 phadd satd on Penryn.
---

diff --git a/common/cpu.c b/common/cpu.c
index 47a72f76..3ebe970f 100644
--- a/common/cpu.c
+++ b/common/cpu.c
@@ -47,6 +47,7 @@ const struct {
     {"SSE2",    X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2},
     {"SSE3",    X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3},
     {"SSSE3",   X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3},
+    {"SSE4",    X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4},
     {"3DNow",   X264_CPU_3DNOW},
     {"Altivec", X264_CPU_ALTIVEC},
     {"Cache32", X264_CPU_CACHELINE_SPLIT|X264_CPU_CACHELINE_32},
@@ -88,6 +89,8 @@ uint32_t x264_cpu_detect( void )
         cpu |= X264_CPU_SSE3;
     if( ecx&0x00000200 )
         cpu |= X264_CPU_SSSE3;
+    if( ecx&0x00080000 ) // bit 19 of ecx; x264.h documents X264_CPU_SSE4 as "sse 4.1"
+        cpu |= X264_CPU_SSE4;
 
     x264_cpu_cpuid( 0x80000000, &eax, &ebx, &ecx, &edx );
     max_extended_cap = eax;
diff --git a/common/pixel.c b/common/pixel.c
index 133968cc..0d00b6e5 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -360,6 +360,7 @@ SATD_X_DECL7()
 SATD_X_DECL7( _mmxext )
 SATD_X_DECL5( _sse2 )
 SATD_X_DECL7( _ssse3 )
+SATD_X_DECL5( _ssse3_phadd )
 #endif
 
 /****************************************************************************
@@ -649,6 +650,14 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
             INIT2( sad_x4, _cache64_ssse3 );
         }
     }
+
+    if( cpu&X264_CPU_SSE4 )
+    {
+        // phaddw satd is SSSE3 code, but is gated on SSE4: enabled on Penryn, but slower on Conroe
+        INIT5( satd, _ssse3_phadd );
+        INIT5( satd_x3, _ssse3_phadd );
+        INIT5( satd_x4, _ssse3_phadd );
+    }
 #endif //HAVE_MMX
 
 #ifdef ARCH_PPC
diff --git a/common/x86/pixel-a.asm b/common/x86/pixel-a.asm
index 9eed1dbc..361e2a63 100644
--- a/common/x86/pixel-a.asm
+++ b/common/x86/pixel-a.asm
@@ -274,19 +274,23 @@ SSD_SSE2 8, 4
     LOAD_DIFF_8P %4, %6, [r0+r4],   [r2+r5]
 %endmacro
 
-;;; row transform not used, because phaddw is much slower than paddw on a Conroe
-;%macro PHSUMSUB 3
-;    movdqa  %3, %1
-;    phaddw  %1, %2
-;    phsubw  %3, %2
-;%endmacro
-
-;%macro HADAMARD4_ROW_SSSE3 5  ; abcd-t -> adtc
-;    PHSUMSUB    %1, %2, %5
-;    PHSUMSUB    %3, %4, %2
-;    PHSUMSUB    %1, %3, %4
-;    PHSUMSUB    %5, %2, %3
-;%endmacro
+; phaddw is used only in 4x4 hadamard, because in 8x8 it's slower:
+; even on Penryn, phaddw has latency 3 while paddw and punpck* have 1.
+; 4x4 is special in that 4x4 transpose in xmmregs takes extra munging,
+; whereas phaddw-based transform doesn't care what order the coefs end up in.
+
+%macro PHSUMSUB 3 ; %1 = in/out (sums), %2 = in (unchanged), %3 = out (differences)
+    movdqa  %3, %1 ; keep a copy of %1, because phaddw overwrites it
+    phaddw  %1, %2 ; %1 = sums of adjacent word pairs: from %1 in the low half, from %2 in the high half
+    phsubw  %3, %2 ; %3 = differences of the same adjacent word pairs
+%endmacro
+
+%macro HADAMARD4_ROW_PHADD 5  ; abcd-t -> adtc: inputs in %1-%4, %5 = temp; results end up in %1, %4, %5, %3 (%2 is clobbered)
+    PHSUMSUB  %1, %2, %5 ; stage 1: %1 = pair sums of a,b; %5 = pair differences of a,b
+    PHSUMSUB  %3, %4, %2 ; stage 1: %3 = pair sums of c,d; %2 = pair differences of c,d
+    PHSUMSUB  %1, %3, %4 ; stage 2 on the sums: %1 = sums, %4 = differences
+    PHSUMSUB  %5, %2, %3 ; stage 2 on the differences: %5 = sums, %3 = differences
+%endmacro
 
 %macro SUMSUB_BADC 4
     paddw  %1, %2
@@ -494,6 +498,21 @@ SSD_SSE2 8, 4
     paddusw  xmm6, xmm2
 %endmacro
 
+%macro SATD_8x4_PHADD 1 ; adds this block's absolute transformed differences to xmm6; %1 != 0 also advances r0/r2 by 4 rows
+    LOAD_DIFF_8x4P    xmm0, xmm1, xmm2, xmm3, xmm4, xmm5 ; presumably diffs go to xmm0-3 with xmm4/xmm5 as scratch -- confirm against LOAD_DIFF_8x4P
+%if %1
+    lea  r0, [r0+4*r1] ; r0 += 4 rows (r1 is named stride1 in SATD_START_MMX)
+    lea  r2, [r2+4*r3] ; r2 += 4 rows (r3 is named stride2 in SATD_START_MMX)
+%endif
+    HADAMARD4_1D    xmm0, xmm1, xmm2, xmm3
+    HADAMARD4_ROW_PHADD xmm0, xmm1, xmm2, xmm3, xmm4 ; results land in xmm0, xmm3, xmm4, xmm2 ("adtc" order)
+    ABS4            xmm0, xmm3, xmm4, xmm2, xmm1, xmm5 ; operates on the four result regs; xmm1/xmm5 presumably scratch -- confirm against ABS4
+    paddusw  xmm0, xmm3 ; saturating word adds: fold the four result registers into two...
+    paddusw  xmm2, xmm4
+    paddusw  xmm6, xmm0 ; ...then into the running total kept in xmm6
+    paddusw  xmm6, xmm2
+%endmacro
+
 %macro SATD_START_MMX 0
     lea  r4, [3*r1] ; 3*stride1
     lea  r5, [3*r3] ; 3*stride2
@@ -1279,6 +1298,8 @@ SA8D_16x16_32 ssse3
 INTRA_SA8D_SSE2 ssse3
 INTRA_SATDS_MMX ssse3
 SATD_W4 ssse3 ; mmx, but uses pabsw from ssse3.
+%define SATD_8x4_SSE2 SATD_8x4_PHADD
+SATDS_SSE2 ssse3_phadd ; presumably SATDS_SSE2 expands SATD_8x4_SSE2, so this emits the satd set built on the phadd kernel -- confirm
 
 
 
diff --git a/common/x86/pixel.h b/common/x86/pixel.h
index 92adfbfb..6aa556a8 100644
--- a/common/x86/pixel.h
+++ b/common/x86/pixel.h
@@ -51,6 +51,7 @@ DECL_X1( ssd, sse2 )
 DECL_X1( satd, mmxext )
 DECL_X1( satd, sse2 )
 DECL_X1( satd, ssse3 )
+DECL_X1( satd, ssse3_phadd )
 DECL_X1( sa8d, mmxext )
 DECL_X1( sa8d, sse2 )
 DECL_X1( sa8d, ssse3 )
diff --git a/tools/checkasm.c b/tools/checkasm.c
index ab3c4f95..115e2217 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -120,6 +120,7 @@ static void print_bench(void)
             for( k=0; k<j && benchs[i].vers[k].pointer != b->pointer; k++ );
             if( k<j ) continue;
             printf( "%s_%s%s: %"PRId64"\n", benchs[i].name,
+                    b->cpu&X264_CPU_SSE4 ? "sse4" :
                     b->cpu&X264_CPU_SSSE3 ? "ssse3" :
                     b->cpu&X264_CPU_SSE3 ? "sse3" :
                     b->cpu&X264_CPU_SSE2 ? "sse2" :
@@ -1142,6 +1143,11 @@ int check_all_flags( void )
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSSE3, "SSSE3" );
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_SPLIT|X264_CPU_CACHELINE_64, "SSSE3 Cache64" );
     }
+    if( x264_cpu_detect() & X264_CPU_SSSE3 ) // NOTE(review): gated on SSSE3, not SSE4 -- the SSE4 flag so far only selects *_ssse3_phadd satd; confirm intentional
+    {
+        cpu1 &= ~(X264_CPU_CACHELINE_SPLIT|X264_CPU_CACHELINE_64); // clear the cacheline flags passed to the "SSSE3 Cache64" add_flags call
+        ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE4, "SSE4" );
+    }
 #elif ARCH_PPC
     if( x264_cpu_detect() & X264_CPU_ALTIVEC )
     {
diff --git a/x264.h b/x264.h
index ff4cc24b..7b390495 100644
--- a/x264.h
+++ b/x264.h
@@ -58,6 +58,7 @@ typedef struct x264_t x264_t;
 #define X264_CPU_CACHELINE_SPLIT 0x200  /* avoid memory loads that span the border between two cachelines */
 #define X264_CPU_CACHELINE_32 0x0400    /* size of a cacheline in bytes */
 #define X264_CPU_CACHELINE_64 0x0800
+#define X264_CPU_SSE4       0x001000    /* sse 4.1 */
 
 /* Analyse flags
  */