From ded3e28cf1f593cbd1ad7c5255ba4ec82635574c Mon Sep 17 00:00:00 2001
From: Fiona Glaser <fiona@x264.com>
Date: Sat, 31 Jan 2009 05:00:39 -0800
Subject: [PATCH] Faster 8x8dct+CAVLC interleave

Integrate array_non_zero with the CAVLC 8x8dct interleave function.
Roughly 1.5-2x faster than the original separate array_non_zero method.

---
 common/dct.c         |  9 ++++++-
 common/dct.h         |  2 +-
 common/x86/dct-a.asm | 59 +++++++++++++++++++++++++++++++++-----------
 common/x86/dct.h     |  2 +-
 encoder/cavlc.c      |  6 +----
 tools/checkasm.c     | 25 ++++++++++++++++++-
 6 files changed, 79 insertions(+), 24 deletions(-)

diff --git a/common/dct.c b/common/dct.c
index 5f9f0fb0..f6095409 100644
--- a/common/dct.c
+++ b/common/dct.c
@@ -608,12 +608,19 @@ static void zigzag_sub_8x8_field( int16_t level[64], const uint8_t *p_src, uint8
 #undef ZIG
 #undef COPY4x4
 
-static void zigzag_interleave_8x8_cavlc( int16_t *dst, int16_t *src )
+static void zigzag_interleave_8x8_cavlc( int16_t *dst, int16_t *src, uint8_t *nnz )
 {
     int i,j;
     for( i=0; i<4; i++ )
+    {
+        int nz = 0;
         for( j=0; j<16; j++ )
+        {
+            nz |= src[i+j*4];
             dst[i*16+j] = src[i+j*4];
+        }
+        nnz[(i&1) + (i>>1)*8] = !!nz;
+    }
 }
 
 void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced )
diff --git a/common/dct.h b/common/dct.h
index 71951f9b..3819ce11 100644
--- a/common/dct.h
+++ b/common/dct.h
@@ -119,7 +119,7 @@ typedef struct
     void (*scan_4x4)( int16_t level[16], int16_t dct[4][4] );
     void (*sub_8x8)( int16_t level[64], const uint8_t *p_src, uint8_t *p_dst );
     void (*sub_4x4)( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst );
-    void (*interleave_8x8_cavlc)( int16_t *dst, int16_t *src );
+    void (*interleave_8x8_cavlc)( int16_t *dst, int16_t *src, uint8_t *nnz );
 
 } x264_zigzag_function_t;
 
diff --git a/common/x86/dct-a.asm b/common/x86/dct-a.asm
index 156a7ae4..b6604974 100644
--- a/common/x86/dct-a.asm
+++ b/common/x86/dct-a.asm
@@ -34,6 +34,7 @@ pb_scan4framea: db 12,13,6,7,14,15,0,1,8,9,2,3,4,5,10,11
 pb_scan4frameb: db 0,1,8,9,2,3,4,5,10,11,12,13,6,7,14,15
 pb_idctdc_unpack: db 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3
 pb_idctdc_unpack2: db 4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7
+pb_1: times 8 db 1
 
 SECTION .text
 
@@ -737,19 +738,47 @@ cglobal x264_zigzag_sub_4x4_frame_ssse3, 3,3
     movdqa [r0+16], xmm1
     RET
 
-INIT_MMX
-cglobal x264_zigzag_interleave_8x8_cavlc_mmx, 2,3
-    mov    r2d, 24
-.loop:
-    movq   m0, [r1+r2*4+ 0]
-    movq   m1, [r1+r2*4+ 8]
-    movq   m2, [r1+r2*4+16]
-    movq   m3, [r1+r2*4+24]
+;-----------------------------------------------------------------------------
+; void x264_zigzag_interleave_8x8_cavlc_mmx( int16_t *dst, int16_t *src, uint8_t *nnz )
+;-----------------------------------------------------------------------------
+
+%macro INTERLEAVE 1
+    movq   m0, [r1+%1*4+ 0]
+    movq   m1, [r1+%1*4+ 8]
+    movq   m2, [r1+%1*4+16]
+    movq   m3, [r1+%1*4+24]
     TRANSPOSE4x4W 0,1,2,3,4
-    movq   [r0+r2+ 0], m0
-    movq   [r0+r2+32], m1
-    movq   [r0+r2+64], m2
-    movq   [r0+r2+96], m3
-    sub    r2d, 8
-    jge .loop
-    REP_RET
+    movq   [r0+%1+ 0], m0
+    movq   [r0+%1+32], m1
+    movq   [r0+%1+64], m2
+    movq   [r0+%1+96], m3
+%if %1
+    packsswb m0, m1
+    por    m6, m2
+    por    m7, m3
+    por    m5, m0
+%else
+    packsswb m0, m1
+    SWAP   m5, m0
+    SWAP   m6, m2
+    SWAP   m7, m3
+%endif
+%endmacro
+
+INIT_MMX
+cglobal x264_zigzag_interleave_8x8_cavlc_mmx, 3,3
+    INTERLEAVE  0
+    INTERLEAVE  8
+    INTERLEAVE 16
+    INTERLEAVE 24
+    packsswb m6, m7
+    packsswb m5, m6
+    packsswb m5, m5
+    pxor     m0, m0
+    pcmpeqb  m5, m0
+    paddb    m5, [pb_1 GLOBAL]
+    movd    r0d, m5
+    mov  [r2+0], r0w
+    shr     r0d, 16
+    mov  [r2+8], r0w
+    RET
diff --git a/common/x86/dct.h b/common/x86/dct.h
index 99392761..7617ea58 100644
--- a/common/x86/dct.h
+++ b/common/x86/dct.h
@@ -61,6 +61,6 @@ void x264_zigzag_scan_4x4_frame_ssse3 ( int16_t level[16], int16_t dct[4][4] );
 void x264_zigzag_scan_4x4_frame_mmx   ( int16_t level[16], int16_t dct[4][4] );
 void x264_zigzag_scan_4x4_field_mmxext( int16_t level[16], int16_t dct[4][4] );
 void x264_zigzag_sub_4x4_frame_ssse3  ( int16_t level[16], const uint8_t *src, uint8_t *dst );
-void x264_zigzag_interleave_8x8_cavlc_mmx( int16_t *dst, int16_t *src );
+void x264_zigzag_interleave_8x8_cavlc_mmx( int16_t *dst, int16_t *src, uint8_t *nnz );
 
 #endif
diff --git a/encoder/cavlc.c b/encoder/cavlc.c
index bfeecc2f..50eb5a1a 100644
--- a/encoder/cavlc.c
+++ b/encoder/cavlc.c
@@ -273,11 +273,7 @@ static inline void x264_macroblock_luma_write_cavlc( x264_t *h, bs_t *s, int i8s
         /* shuffle 8x8 dct coeffs into 4x4 lists */
         for( i8 = i8start; i8 <= i8end; i8++ )
             if( h->mb.i_cbp_luma & (1 << i8) )
-            {
-                h->zigzagf.interleave_8x8_cavlc( h->dct.luma4x4[i8*4], h->dct.luma8x8[i8] );
-                for( i4 = 0; i4 < 4; i4++ )
-                    h->mb.cache.non_zero_count[x264_scan8[i4+i8*4]] = array_non_zero( h->dct.luma4x4[i4+i8*4] );
-            }
+                h->zigzagf.interleave_8x8_cavlc( h->dct.luma4x4[i8*4], h->dct.luma8x8[i8], &h->mb.cache.non_zero_count[x264_scan8[i8*4]] );
     }
 
     for( i8 = i8start; i8 <= i8end; i8++ )
diff --git a/tools/checkasm.c b/tools/checkasm.c
index 3f89e681..29ddadd5 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -635,6 +635,26 @@ static int check_dct( int cpu_ref, int cpu_new )
         call_a2( zigzag_asm.name, t2, buf2, buf4 ); \
     }
 
+#define TEST_INTERLEAVE( name, t1, t2, dct, size )   \
+    if( zigzag_asm.name != zigzag_ref.name ) \
+    { \
+        for( j=0; j<100; j++ ) \
+        { \
+            set_func_name( "zigzag_"#name"_%s", interlace?"field":"frame" );\
+            used_asm = 1; \
+            memcpy(dct, buf1, size*sizeof(int16_t));\
+            for( i=0; i<size; i++ ) \
+                dct[i] = rand()&0x1F ? 0 : dct[i]; \
+            memcpy(buf3, buf4, 10*sizeof(uint8_t)); \
+            call_c( zigzag_c.name, t1, dct, buf3 ); \
+            call_a( zigzag_asm.name, t2, dct, buf4 ); \
+            if( memcmp( t1, t2, size*sizeof(int16_t) ) || memcmp( buf3, buf4, 10*sizeof(uint8_t) ) ) \
+            { \
+                ok = 0; \
+            } \
+        } \
+    }
+
     interlace = 0;
     x264_zigzag_init( 0, &zigzag_c, 0 );
     x264_zigzag_init( cpu_ref, &zigzag_ref, 0 );
@@ -643,7 +663,6 @@ static int check_dct( int cpu_ref, int cpu_new )
     ok = 1; used_asm = 0;
     TEST_ZIGZAG_SCAN( scan_8x8, level1, level2, (void*)dct1, 64 );
     TEST_ZIGZAG_SCAN( scan_4x4, level1, level2, dct1[0], 16  );
-    TEST_ZIGZAG_SCAN( interleave_8x8_cavlc, level1, level2, (void*)dct1, 64 );
     TEST_ZIGZAG_SUB( sub_4x4, level1, level2, 16 );
     report( "zigzag_frame :" );
 
@@ -657,6 +676,10 @@ static int check_dct( int cpu_ref, int cpu_new )
     TEST_ZIGZAG_SCAN( scan_4x4, level1, level2, dct1[0], 16  );
     TEST_ZIGZAG_SUB( sub_4x4, level1, level2, 16 );
     report( "zigzag_field :" );
+
+    ok = 1; used_asm = 0;
+    TEST_INTERLEAVE( interleave_8x8_cavlc, level1, level2, dct1[0][0], 64 );
+    report( "zigzag_interleave :" );
 #undef TEST_ZIGZAG_SCAN
 #undef TEST_ZIGZAG_SUB
 
-- 
2.40.0