From: Fiona Glaser
Date: Tue, 17 Mar 2009 18:01:57 +0000 (-0700)
Subject: SSE2 zigzag_interleave
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=d25d50c9ffb02571c12e13c09356fa08fe97b0b4;p=libx264

SSE2 zigzag_interleave

Replace the PHADD flag with FastShuffle (more accurate naming).
This flag marks asm functions that rely on fast SSE2 shuffle units and are
thus only faster on Phenom, Nehalem, and Penryn CPUs.
---

diff --git a/common/cpu.c b/common/cpu.c
index bf376fc4..860cd957 100644
--- a/common/cpu.c
+++ b/common/cpu.c
@@ -53,7 +53,7 @@ const x264_cpu_name_t x264_cpu_names[] = {
     {"SSE2Fast",X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE2_IS_FAST},
     {"SSE3", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3},
     {"SSSE3", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3},
-    {"PHADD", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_PHADD_IS_FAST},
+    {"FastShuffle", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SHUFFLE_IS_FAST},
     {"SSE4.1", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4},
     {"SSE4.2", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4|X264_CPU_SSE42},
     {"Cache32", X264_CPU_CACHELINE_32},
@@ -107,7 +107,7 @@ uint32_t x264_cpu_detect( void )
     if( cpu & X264_CPU_SSSE3 )
         cpu |= X264_CPU_SSE2_IS_FAST;
     if( cpu & X264_CPU_SSE4 )
-        cpu |= X264_CPU_PHADD_IS_FAST;
+        cpu |= X264_CPU_SHUFFLE_IS_FAST;
 
     x264_cpu_cpuid( 0x80000000, &eax, &ebx, &ecx, &edx );
     max_extended_cap = eax;
@@ -124,6 +124,7 @@ uint32_t x264_cpu_detect( void )
             cpu |= X264_CPU_SSE2_IS_FAST;
             cpu |= X264_CPU_SSE_MISALIGN;
             cpu |= X264_CPU_LZCNT;
+            cpu |= X264_CPU_SHUFFLE_IS_FAST;
             x264_cpu_mask_misalign_sse();
         }
         else
diff --git a/common/dct.c b/common/dct.c
index 04301a92..1f8f4b39 100644
--- a/common/dct.c
+++ b/common/dct.c
@@ -663,9 +663,9 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced )
     {
         pf->sub_4x4  = x264_zigzag_sub_4x4_frame_ssse3;
         pf->scan_8x8 = x264_zigzag_scan_8x8_frame_ssse3;
+        if( cpu&X264_CPU_SHUFFLE_IS_FAST )
+            pf->scan_4x4 = x264_zigzag_scan_4x4_frame_ssse3;
     }
-    if( cpu&X264_CPU_PHADD_IS_FAST )
-        pf->scan_4x4 = x264_zigzag_scan_4x4_frame_ssse3;
 #endif
 
 #ifdef ARCH_PPC
@@ -678,5 +678,7 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced )
 #ifdef HAVE_MMX
     if( cpu&X264_CPU_MMX )
         pf->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_mmx;
+    if( cpu&X264_CPU_SHUFFLE_IS_FAST )
+        pf->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sse2;
 #endif
 }
diff --git a/common/pixel.c b/common/pixel.c
index 38c39260..76d04e0d 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -763,7 +763,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
             INIT2( sad_x3, _cache64_ssse3 );
             INIT2( sad_x4, _cache64_ssse3 );
         }
-        if( !(cpu&X264_CPU_PHADD_IS_FAST) )
+        if( !(cpu&X264_CPU_SHUFFLE_IS_FAST) )
         {
             INIT5( ssd, _sse2 ); /* on conroe, sse2 is faster for width8/16 */
         }
diff --git a/common/x86/dct-a.asm b/common/x86/dct-a.asm
index df51926c..6e92df6f 100644
--- a/common/x86/dct-a.asm
+++ b/common/x86/dct-a.asm
@@ -35,7 +35,7 @@ pb_scan4framea: db 12,13,6,7,14,15,0,1,8,9,2,3,4,5,10,11
 pb_scan4frameb: db 0,1,8,9,2,3,4,5,10,11,12,13,6,7,14,15
 pb_idctdc_unpack: db 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3
 pb_idctdc_unpack2: db 4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7
-pb_1: times 8 db 1
+pb_1: times 16 db 1
 
 SECTION .text
 
@@ -785,3 +785,50 @@ cglobal x264_zigzag_interleave_8x8_cavlc_mmx, 3,3
     shr r0d, 16
     mov [r2+8], r0w
     RET
+
+%macro INTERLEAVE_XMM 1
+    mova   m0, [r1+%1*4+ 0]
+    mova   m1, [r1+%1*4+16]
+    mova   m4, [r1+%1*4+32]
+    mova   m5, [r1+%1*4+48]
+    SBUTTERFLY wd, 0, 1, 6
+    SBUTTERFLY wd, 4, 5, 7
+    SBUTTERFLY wd, 0, 1, 6
+    SBUTTERFLY wd, 4, 5, 7
+    movq   [r0+%1+ 0], m0
+    movhps [r0+%1+ 32], m0
+    movq   [r0+%1+ 64], m1
+    movhps [r0+%1+ 96], m1
+    movq   [r0+%1+ 8], m4
+    movhps [r0+%1+ 40], m4
+    movq   [r0+%1+ 72], m5
+    movhps [r0+%1+104], m5
+%if %1
+    por    m2, m0
+    por    m3, m1
+    por    m2, m4
+    por    m3, m5
+%else
+    SWAP 0,2
+    SWAP 3,1
+    por    m2, m4
+    por    m3, m5
+%endif
+%endmacro
+
+INIT_XMM
+cglobal x264_zigzag_interleave_8x8_cavlc_sse2, 3,3,8
+    INTERLEAVE_XMM  0
+    INTERLEAVE_XMM 16
+    packsswb m2, m3
+    pxor     m5, m5
+    packsswb m2, m2
+    packsswb m2, m2
+    pcmpeqb  m5, m2
+    paddb    m5, [pb_1 GLOBAL]
+    movd    r0d, m5
+    mov [r2+0], r0w
+    shr     r0d, 16
+    mov [r2+8], r0w
+    RET
+
diff --git a/common/x86/dct.h b/common/x86/dct.h
index 5b83d342..44518212 100644
--- a/common/x86/dct.h
+++ b/common/x86/dct.h
@@ -69,5 +69,6 @@ void x264_zigzag_scan_4x4_frame_mmx ( int16_t level[16], int16_t dct[4][4] );
 void x264_zigzag_scan_4x4_field_mmxext( int16_t level[16], int16_t dct[4][4] );
 void x264_zigzag_sub_4x4_frame_ssse3 ( int16_t level[16], const uint8_t *src, uint8_t *dst );
 void x264_zigzag_interleave_8x8_cavlc_mmx( int16_t *dst, int16_t *src, uint8_t *nnz );
+void x264_zigzag_interleave_8x8_cavlc_sse2( int16_t *dst, int16_t *src, uint8_t *nnz );
 
 #endif
diff --git a/tools/checkasm.c b/tools/checkasm.c
index 1c1562a3..31fd9758 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -151,7 +151,7 @@ static void print_bench(void)
             if( k < j ) continue;
             printf( "%s_%s%s: %"PRId64"\n", benchs[i].name,
                     b->cpu&X264_CPU_SSE4 ? "sse4" :
-                    b->cpu&X264_CPU_PHADD_IS_FAST ? "phadd" :
+                    b->cpu&X264_CPU_SHUFFLE_IS_FAST ? "fastshuffle" :
                     b->cpu&X264_CPU_SSSE3 ? "ssse3" :
                     b->cpu&X264_CPU_SSE3 ? "sse3" :
                     /* print sse2slow only if there's also a sse2fast version of the same func */
@@ -1364,10 +1364,10 @@ static int check_intra( int cpu_ref, int cpu_new )
     for( i = 0; i < 12; i++ )
         INTRA_TEST( predict_8x8, i, 8, edge );
 
-    used_asm = 1;
     set_func_name("intra_predict_8x8_filter");
     if( ip_a.predict_8x8_filter != ip_ref.predict_8x8_filter )
     {
+        used_asm = 1;
         for( i = 0; i < 32; i++ )
         {
             memcpy( edge2, edge, 33 );
@@ -1463,6 +1463,8 @@ static int check_all_flags( void )
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE | X264_CPU_SSE2 | X264_CPU_SSE2_IS_SLOW, "SSE2Slow" );
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE2_IS_FAST, "SSE2Fast" );
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "SSE2Fast Cache64" );
+        ret |= add_flags( &cpu0, &cpu1, X264_CPU_SHUFFLE_IS_FAST, "SSE2 FastShuffle" );
+        cpu1 &= ~X264_CPU_SHUFFLE_IS_FAST;
     }
     if( x264_cpu_detect() & X264_CPU_SSE_MISALIGN )
     {
@@ -1483,7 +1485,8 @@ static int check_all_flags( void )
         cpu1 &= ~X264_CPU_CACHELINE_64;
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSSE3, "SSSE3" );
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "SSSE3 Cache64" );
-        ret |= add_flags( &cpu0, &cpu1, X264_CPU_PHADD_IS_FAST, "PHADD" );
+        ret |= add_flags( &cpu0, &cpu1, X264_CPU_SHUFFLE_IS_FAST, "SSSE3 FastShuffle" );
+        cpu1 &= ~X264_CPU_SHUFFLE_IS_FAST;
     }
     if( x264_cpu_detect() & X264_CPU_SSE4 )
     {
diff --git a/x264.h b/x264.h
index 3971992b..26ac421c 100644
--- a/x264.h
+++ b/x264.h
@@ -57,7 +57,7 @@ typedef struct x264_t x264_t;
 #define X264_CPU_SSE2_IS_FAST 0x000100 /* a few functions are only faster on Core2 and Phenom */
 #define X264_CPU_SSE3 0x000200
 #define X264_CPU_SSSE3 0x000400
-#define X264_CPU_PHADD_IS_FAST 0x000800 /* pre-Penryn Core2 have a uselessly slow PHADD instruction */
+#define X264_CPU_SHUFFLE_IS_FAST 0x000800 /* Penryn, Nehalem, and Phenom have fast shuffle units */
#define X264_CPU_STACK_MOD4 0x001000 /* if stack is only mod4 and not mod16 */
 #define X264_CPU_SSE4 0x002000 /* SSE4.1 */
 #define X264_CPU_SSE42 0x004000 /* SSE4.2 */