{"SSE2", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2},
{"SSE3", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3},
{"SSSE3", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3},
+ {"SSE4", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4},
{"3DNow", X264_CPU_3DNOW},
{"Altivec", X264_CPU_ALTIVEC},
{"Cache32", X264_CPU_CACHELINE_SPLIT|X264_CPU_CACHELINE_32},
cpu |= X264_CPU_SSE3;
if( ecx&0x00000200 )
cpu |= X264_CPU_SSSE3;
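+ /* CPUID eax=1: ECX bit 19 indicates SSE4.1 */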
+ if( ecx&0x00080000 )
+ cpu |= X264_CPU_SSE4;
x264_cpu_cpuid( 0x80000000, &eax, &ebx, &ecx, &edx );
max_extended_cap = eax;
SATD_X_DECL7( _mmxext )
SATD_X_DECL5( _sse2 )
SATD_X_DECL7( _ssse3 )
+SATD_X_DECL5( _ssse3_phadd )
#endif
/****************************************************************************
INIT2( sad_x4, _cache64_ssse3 );
}
}
+
+ if( cpu&X264_CPU_SSE4 )
+ {
+ // the phadd satd functions use only SSSE3 instructions, but are gated on SSE4: fast on Penryn, slower on Conroe
+ INIT5( satd, _ssse3_phadd );
+ INIT5( satd_x3, _ssse3_phadd );
+ INIT5( satd_x4, _ssse3_phadd );
+ }
#endif //HAVE_MMX
#ifdef ARCH_PPC
LOAD_DIFF_8P %4, %6, [r0+r4], [r2+r5]
%endmacro
-;;; row transform not used, because phaddw is much slower than paddw on a Conroe
-;%macro PHSUMSUB 3
-; movdqa %3, %1
-; phaddw %1, %2
-; phsubw %3, %2
-;%endmacro
-
-;%macro HADAMARD4_ROW_SSSE3 5 ; abcd-t -> adtc
-; PHSUMSUB %1, %2, %5
-; PHSUMSUB %3, %4, %2
-; PHSUMSUB %1, %3, %4
-; PHSUMSUB %5, %2, %3
-;%endmacro
+; phaddw is used only in the 4x4 hadamard, because in the 8x8 it's slower:
+; even on Penryn, phaddw has latency 3 while paddw and punpck* have latency 1.
+; 4x4 is special in that a 4x4 transpose in xmm registers takes extra munging,
+; whereas the phaddw-based transform doesn't care what order the coefficients end up in.
+
+%macro PHSUMSUB 3
+ movdqa %3, %1
+ phaddw %1, %2
+ phsubw %3, %2
+%endmacro
+
+%macro HADAMARD4_ROW_PHADD 5 ; abcd-t -> adtc
+ PHSUMSUB %1, %2, %5
+ PHSUMSUB %3, %4, %2
+ PHSUMSUB %1, %3, %4
+ PHSUMSUB %5, %2, %3
+%endmacro
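+
+; illustrative note: for one group of 4 coefficients x0..x3, two passes of
+; pairwise (a+b, a-b) produce {x0+x1+x2+x3, x0+x1-x2-x3, x0-x1+x2-x3, x0-x1-x2+x3},
+; the same values the paddw/psubw butterfly would give, just spread across
+; registers in a different order; satd only sums |coef|, so that's harmless.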
%macro SUMSUB_BADC 4
paddw %1, %2
paddusw xmm6, xmm2
%endmacro
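+; 8x4 satd kernel using the phaddw-based row transform for the horizontal pass;
+; it is substituted for SATD_8x4_SSE2 (via %define) to build the ssse3_phadd functions.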
+%macro SATD_8x4_PHADD 1
+ LOAD_DIFF_8x4P xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+%if %1
+ lea r0, [r0+4*r1]
+ lea r2, [r2+4*r3]
+%endif
+ HADAMARD4_1D xmm0, xmm1, xmm2, xmm3
+ HADAMARD4_ROW_PHADD xmm0, xmm1, xmm2, xmm3, xmm4
+ ABS4 xmm0, xmm3, xmm4, xmm2, xmm1, xmm5
+ paddusw xmm0, xmm3
+ paddusw xmm2, xmm4
+ paddusw xmm6, xmm0
+ paddusw xmm6, xmm2
+%endmacro
+
%macro SATD_START_MMX 0
lea r4, [3*r1] ; 3*stride1
lea r5, [3*r3] ; 3*stride2
INTRA_SA8D_SSE2 ssse3
INTRA_SATDS_MMX ssse3
SATD_W4 ssse3 ; mmx, but uses pabsw from ssse3.
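+; instantiate the satd functions once more, substituting the phaddw kernel for SATD_8x4_SSE2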
+%define SATD_8x4_SSE2 SATD_8x4_PHADD
+SATDS_SSE2 ssse3_phadd
DECL_X1( satd, mmxext )
DECL_X1( satd, sse2 )
DECL_X1( satd, ssse3 )
+DECL_X1( satd, ssse3_phadd )
DECL_X1( sa8d, mmxext )
DECL_X1( sa8d, sse2 )
DECL_X1( sa8d, ssse3 )
for( k=0; k<j && benchs[i].vers[k].pointer != b->pointer; k++ );
if( k<j ) continue;
printf( "%s_%s%s: %"PRId64"\n", benchs[i].name,
+ b->cpu&X264_CPU_SSE4 ? "sse4" :
b->cpu&X264_CPU_SSSE3 ? "ssse3" :
b->cpu&X264_CPU_SSE3 ? "sse3" :
b->cpu&X264_CPU_SSE2 ? "sse2" :
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSSE3, "SSSE3" );
ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_SPLIT|X264_CPU_CACHELINE_64, "SSSE3 Cache64" );
}
+ if( x264_cpu_detect() & X264_CPU_SSSE3 )
+ {
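+ /* the functions flagged SSE4 so far (ssse3_phadd satd) use only SSSE3
+ instructions, so SSSE3 support is enough to exercise them */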
+ cpu1 &= ~(X264_CPU_CACHELINE_SPLIT|X264_CPU_CACHELINE_64);
+ ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE4, "SSE4" );
+ }
#elif ARCH_PPC
if( x264_cpu_detect() & X264_CPU_ALTIVEC )
{
#define X264_CPU_CACHELINE_SPLIT 0x200 /* avoid memory loads that span the border between two cachelines */
#define X264_CPU_CACHELINE_32 0x0400 /* size of a cacheline in bytes */
#define X264_CPU_CACHELINE_64 0x0800
+#define X264_CPU_SSE4 0x1000 /* SSE4.1 */
/* Analyse flags
*/