From cb688111fb28225a4d1fe2a45472ac0cd093a08f Mon Sep 17 00:00:00 2001
From: Fiona Glaser <fiona@x264.com>
Date: Tue, 30 Dec 2008 20:47:45 -0500
Subject: [PATCH] Add support for SSE4a (Phenom) LZCNT instruction
 Significantly speeds up coeff_last and coeff_level_run on Phenom CPUs for
 faster CAVLC and CABAC. Also a small tweak to coeff_level_run asm.

---
 common/cpu.c           |  2 ++
 common/quant.c         | 13 ++++++++
 common/x86/quant-a.asm | 74 ++++++++++++++++++++++++++++++++----------
 common/x86/quant.h     |  7 ++++
 tools/checkasm.c       | 14 +++++++-
 x264.h                 |  1 +
 6 files changed, 92 insertions(+), 19 deletions(-)

diff --git a/common/cpu.c b/common/cpu.c
index aff31eb8..c1850462 100644
--- a/common/cpu.c
+++ b/common/cpu.c
@@ -54,6 +54,7 @@ const x264_cpu_name_t x264_cpu_names[] = {
     {"Cache32", X264_CPU_CACHELINE_32},
     {"Cache64", X264_CPU_CACHELINE_64},
     {"SSEMisalign", X264_CPU_SSE_MISALIGN},
+    {"LZCNT", X264_CPU_LZCNT},
     {"Slow_mod4_stack", X264_CPU_STACK_MOD4},
     {"", 0},
 };
@@ -117,6 +118,7 @@ uint32_t x264_cpu_detect( void )
             {
                 cpu |= X264_CPU_SSE2_IS_FAST;
                 cpu |= X264_CPU_SSE_MISALIGN;
+                cpu |= X264_CPU_LZCNT;
                 x264_cpu_mask_misalign_sse();
             }
             else
diff --git a/common/quant.c b/common/quant.c
index fa38360c..ac798a25 100644
--- a/common/quant.c
+++ b/common/quant.c
@@ -352,6 +352,11 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
 #endif
         pf->coeff_last[DCT_CHROMA_DC] = x264_coeff_last4_mmxext;
         pf->coeff_level_run[DCT_CHROMA_DC] = x264_coeff_level_run4_mmxext;
+        if( cpu&X264_CPU_LZCNT )
+        {
+            pf->coeff_last[DCT_CHROMA_DC] = x264_coeff_last4_mmxext_lzcnt;
+            pf->coeff_level_run[DCT_CHROMA_DC] = x264_coeff_level_run4_mmxext_lzcnt;
+        }
     }
 
     if( cpu&X264_CPU_SSE2 )
@@ -376,6 +381,14 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
         pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2;
         pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_sse2;
         pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_sse2;
+        if( cpu&X264_CPU_LZCNT )
+        {
+            pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_sse2_lzcnt;
+            pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_sse2_lzcnt;
+            pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2_lzcnt;
+            pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_sse2_lzcnt;
+            pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_sse2_lzcnt;
+        }
     }
 
     if( cpu&X264_CPU_SSSE3 )
diff --git a/common/x86/quant-a.asm b/common/x86/quant-a.asm
index 5cbdf4a8..3b92379e 100644
--- a/common/x86/quant-a.asm
+++ b/common/x86/quant-a.asm
@@ -688,37 +688,53 @@ DECIMATE8x8 ssse3
     or        %1, %3
 %endmacro
 
+%macro LAST_X86 3 ; %1 = dst, %2 = src, %3 = unused here (keeps the same arity as LAST_SSE4A)
+    bsr %1, %2 ; %1 = bit index of the highest set bit in %2
+%endmacro
+
+%macro LAST_SSE4A 3 ; %1 = dst, %2 = src, %3 = operand width in bits minus 1 (callers pass 0x1f or 0x3f)
+    lzcnt %1, %2 ; %1 = number of leading zero bits in %2
+    xor %1, %3 ; (width-1) - lzcnt: index of the highest set bit, same result LAST_X86 gets from bsr for nonzero %2
+%endmacro
+
+%macro COEFF_LAST4 1
 %ifdef ARCH_X86_64
-cglobal x264_coeff_last4_mmxext, 1,1
-    bsr rax, [r0]
+cglobal x264_coeff_last4_%1, 1,1
+    LAST rax, [r0], 0x3f
     shr eax, 4
     RET
 %else
-cglobal x264_coeff_last4_mmxext, 0,3
+cglobal x264_coeff_last4_%1, 0,3
     mov   edx, r0m
     mov   eax, [edx+4]
     xor   ecx, ecx
     test  eax, eax
     cmovz eax, [edx]
     setnz cl
-    bsr   eax, eax
+    LAST  eax, eax, 0x1f
     shr   eax, 4
     lea   eax, [eax+ecx*2]
     RET
 %endif
+%endmacro
+
+%define LAST LAST_X86
+COEFF_LAST4 mmxext
+%define LAST LAST_SSE4A
+COEFF_LAST4 mmxext_lzcnt
 
 %macro COEFF_LAST 1
 cglobal x264_coeff_last15_%1, 1,3
     LAST_MASK r1d, r0-2, r2d
     xor r1d, 0xffff
-    bsr eax, r1d
+    LAST eax, r1d, 0x1f
     dec eax
     RET
 
 cglobal x264_coeff_last16_%1, 1,3
     LAST_MASK r1d, r0, r2d
     xor r1d, 0xffff
-    bsr eax, r1d
+    LAST eax, r1d, 0x1f
     RET
 
 %ifndef ARCH_X86_64
@@ -738,17 +754,18 @@ cglobal x264_coeff_last16_%1, 1,3
     not r1d
     xor r2d, -1
     jne .secondhalf
-    bsr eax, r1d
+    LAST eax, r1d, 0x1f
     RET
 .secondhalf:
-    bsr eax, r2d
+    LAST eax, r2d, 0x1f
     add eax, 32
     RET
 %endif
 %endmacro
 
 %ifdef ARCH_X86_64
-    cglobal x264_coeff_last64_sse2, 1,4
+%macro COEFF_LAST64 1
+    cglobal x264_coeff_last64_%1, 1,4
     LAST_MASK_SSE2 r1d, r0
     LAST_MASK_SSE2 r2d, r0+32
     LAST_MASK_SSE2 r3d, r0+64
@@ -760,16 +777,25 @@ cglobal x264_coeff_last16_%1, 1,3
     shl r3,  32
     or  r1,  r3
     not r1
-    bsr rax, r1
+    LAST rax, r1, 0x3f
     RET
+%endmacro
+
+%define LAST LAST_X86
+COEFF_LAST64 sse2
+%define LAST LAST_SSE4A
+COEFF_LAST64 sse2_lzcnt
 %endif
 
+%define LAST LAST_X86
 %ifndef ARCH_X86_64
 %define LAST_MASK LAST_MASK_MMX
 COEFF_LAST mmxext
 %endif
 %define LAST_MASK LAST_MASK_SSE2
 COEFF_LAST sse2
+%define LAST LAST_SSE4A
+COEFF_LAST sse2_lzcnt
 
 ;-----------------------------------------------------------------------------
 ; int x264_coeff_level_run( int16_t *dct, x264_run_level_t *runlevel )
@@ -783,6 +809,15 @@ COEFF_LAST sse2
     pmovmskb  %1, mm0
 %endmacro
 
+%macro LZCOUNT_X86 3 ; %1 = dst, %2 = src, %3 = operand width in bits minus 1 (COEFF_LEVELRUN passes 0x1f)
+    bsr %1, %2 ; %1 = bit index of the highest set bit in %2
+    xor %1, %3 ; (width-1) - index: leading-zero count, same result LZCOUNT_SSE4A gets from lzcnt for nonzero %2
+%endmacro
+
+%macro LZCOUNT_SSE4A 3 ; %1 = dst, %2 = src, %3 = unused here (keeps the same arity as LZCOUNT_X86)
+    lzcnt %1, %2 ; %1 = number of leading zero bits in %2
+%endmacro
+
 ; t6 = eax for return, t3 = ecx for shift, t[01] = r[01] for x86_64 args
 %ifdef ARCH_X86_64
     DECLARE_REG_TMP 0,1,2,3,4,5,6
@@ -794,21 +829,18 @@ COEFF_LAST sse2
 cglobal x264_coeff_level_run%2_%1,0,7
     movifnidn t0d, r0m
     movifnidn t1d, r1m
-    LAST_MASK t2d, t0-(%2&1)*2, t4d
-    not    t2d
-    shl    t2d, 32-((%2+1)&~1)
+    LAST_MASK t5d, t0-(%2&1)*2, t4d
+    not    t5d
+    shl    t5d, 32-((%2+1)&~1)
     mov    t4d, %2-1
-    mov    t5d, t2d
-    bsr    t3d, t2d
+    LZCOUNT t3d, t5d, 0x1f
     xor    t6d, t6d
     shl    t5d, 1
-    xor    t3d, 0x1f
     sub    t4d, t3d
     shl    t5d, t3b
     mov   [t1], t4d
 .loop:
-    bsr    t3d, t5d
-    xor    t3d, 0x1f
+    LZCOUNT t3d, t5d, 0x1f
     mov    t2w, [t0+t4*2]
     mov   [t1+t6  +36], t3b
     mov   [t1+t6*2+ 4], t2w
@@ -820,6 +852,7 @@ cglobal x264_coeff_level_run%2_%1,0,7
     RET
 %endmacro
 
+%define LZCOUNT LZCOUNT_X86
 %ifndef ARCH_X86_64
 %define LAST_MASK LAST_MASK_MMX
 COEFF_LEVELRUN mmxext, 15
@@ -830,3 +863,8 @@ COEFF_LEVELRUN mmxext, 4
 %define LAST_MASK LAST_MASK_SSE2
 COEFF_LEVELRUN sse2, 15
 COEFF_LEVELRUN sse2, 16
+%define LZCOUNT LZCOUNT_SSE4A
+COEFF_LEVELRUN sse2_lzcnt, 15
+COEFF_LEVELRUN sse2_lzcnt, 16
+%define LAST_MASK LAST_MASK4_MMX
+COEFF_LEVELRUN mmxext_lzcnt, 4
diff --git a/common/x86/quant.h b/common/x86/quant.h
index 46186ceb..878699f9 100644
--- a/common/x86/quant.h
+++ b/common/x86/quant.h
@@ -64,10 +64,17 @@ int x264_coeff_last64_mmxext( int16_t *dct );
 int x264_coeff_last15_sse2( int16_t *dct );
 int x264_coeff_last16_sse2( int16_t *dct );
 int x264_coeff_last64_sse2( int16_t *dct );
+int x264_coeff_last4_mmxext_lzcnt( int16_t *dct );
+int x264_coeff_last15_sse2_lzcnt( int16_t *dct );
+int x264_coeff_last16_sse2_lzcnt( int16_t *dct );
+int x264_coeff_last64_sse2_lzcnt( int16_t *dct );
 int x264_coeff_level_run16_mmxext( int16_t *dct, x264_run_level_t *runlevel );
 int x264_coeff_level_run16_sse2( int16_t *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run16_sse2_lzcnt( int16_t *dct, x264_run_level_t *runlevel );
 int x264_coeff_level_run15_mmxext( int16_t *dct, x264_run_level_t *runlevel );
 int x264_coeff_level_run15_sse2( int16_t *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run15_sse2_lzcnt( int16_t *dct, x264_run_level_t *runlevel );
 int x264_coeff_level_run4_mmxext( int16_t *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run4_mmxext_lzcnt( int16_t *dct, x264_run_level_t *runlevel );
 
 #endif
diff --git a/tools/checkasm.c b/tools/checkasm.c
index d154941e..203a5963 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -156,7 +156,8 @@ static void print_bench(void)
                     b->cpu&X264_CPU_MMX ? "mmx" : "c",
                     b->cpu&X264_CPU_CACHELINE_32 ? "_c32" :
                     b->cpu&X264_CPU_CACHELINE_64 ? "_c64" :
-                    b->cpu&X264_CPU_SSE_MISALIGN ? "_misalign" : "",
+                    b->cpu&X264_CPU_SSE_MISALIGN ? "_misalign" :
+                    b->cpu&X264_CPU_LZCNT ? "_lzcnt" : "",
                     ((int64_t)10*b->cycles/b->den - nop_time)/4 );
         }
 }
@@ -1392,6 +1393,11 @@ static int check_all_flags( void )
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_32, "MMX Cache32" );
         cpu1 &= ~X264_CPU_CACHELINE_32;
 #endif
+        if( x264_cpu_detect() & X264_CPU_LZCNT )
+        {
+            ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "MMX_LZCNT" );
+            cpu1 &= ~X264_CPU_LZCNT;
+        }
     }
     if( x264_cpu_detect() & X264_CPU_SSE2 )
     {
@@ -1405,6 +1411,12 @@ static int check_all_flags( void )
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE_MISALIGN, "SSE_Misalign" );
         cpu1 &= ~X264_CPU_SSE_MISALIGN;
     }
+    if( x264_cpu_detect() & X264_CPU_LZCNT )
+    {
+        cpu1 &= ~X264_CPU_CACHELINE_64;
+        ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "SSE_LZCNT" );
+        cpu1 &= ~X264_CPU_LZCNT;
+    }
     if( x264_cpu_detect() & X264_CPU_SSE3 )
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE3 | X264_CPU_CACHELINE_64, "SSE3" );
     if( x264_cpu_detect() & X264_CPU_SSSE3 )
diff --git a/x264.h b/x264.h
index 51be79ee..8c517b1e 100644
--- a/x264.h
+++ b/x264.h
@@ -62,6 +62,7 @@ typedef struct x264_t x264_t;
 #define X264_CPU_SSE4           0x002000  /* SSE4.1 */
 #define X264_CPU_SSE42          0x004000  /* SSE4.2 */
 #define X264_CPU_SSE_MISALIGN   0x008000  /* Phenom support for misaligned SSE instruction arguments */
+#define X264_CPU_LZCNT          0x010000  /* Phenom support for "leading zero count" instruction. */
 
 /* Analyse flags
  */
-- 
2.40.0