From: Fiona Glaser <fiona@x264.com>
Date: Thu, 8 Dec 2011 21:45:41 +0000 (-0800)
Subject: Use a large LUT for CAVLC zero-run bit codes
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=9c0fa2d63f549a44f869562cffa9c041a32ae41d;p=libx264

Use a large LUT for CAVLC zero-run bit codes
Helps the most with trellis and RD, but also helps with bitstream writing.
Seems at worst neutral even in the extreme case of a CPU with small L2 cache (e.g. ARM Cortex A8).
---

diff --git a/common/bitstream.h b/common/bitstream.h
index f8838401..b836eec2 100644
--- a/common/bitstream.h
+++ b/common/bitstream.h
@@ -56,6 +56,7 @@ typedef struct bs_s
 typedef struct
 {
     int     last;
+    int     mask;   /* bit n is set for each nonzero coefficient n; used to index x264_run_before */
     dctcoef level[16];
     uint8_t run[16];
 } x264_run_level_t;
@@ -65,7 +66,6 @@ extern const vlc_t x264_coeff_token[6][16][4];
 extern const vlc_t x264_total_zeros[15][16];
 extern const vlc_t x264_total_zeros_2x2_dc[3][4];
 extern const vlc_t x264_total_zeros_2x4_dc[7][8];
-extern const vlc_t x264_run_before[7][16];
 
 typedef struct
 {
@@ -82,6 +82,11 @@ void x264_bitstream_init( int cpu, x264_bitstream_function_t *pf );
 #define LEVEL_TABLE_SIZE 128
 extern vlc_large_t x264_level_token[7][LEVEL_TABLE_SIZE];
 
+/* The longest possible set of zero run codes sums to 25 bits.  This leaves
+ * plenty of room for both the code (25 bits) and size (5 bits) in a uint32_t.
+ * Entries are packed as (code << 5) + size and indexed by x264_run_level_t.mask. */
+extern uint32_t x264_run_before[1<<16];
+
 static inline void bs_init( bs_t *s, void *p_data, int i_data )
 {
     int offset = ((intptr_t)p_data & 3);
diff --git a/common/common.h b/common/common.h
index 2704f291..b6cec651 100644
--- a/common/common.h
+++ b/common/common.h
@@ -236,7 +236,7 @@ void x264_log( x264_t *h, int i_level, const char *psz_fmt, ... );
 
 void x264_reduce_fraction( uint32_t *n, uint32_t *d );
 void x264_reduce_fraction64( uint64_t *n, uint64_t *d );
-void x264_cavlc_init( void );
+void x264_cavlc_init( x264_t *h );
 void x264_cabac_init( x264_t *h );
 
 static ALWAYS_INLINE pixel x264_clip_pixel( int x )
diff --git a/common/quant.c b/common/quant.c
index a6116b4c..a57ca2e2 100644
--- a/common/quant.c
+++ b/common/quant.c
@@ -373,14 +373,17 @@ static int x264_coeff_level_run##num( dctcoef *dct, x264_run_level_t *runlevel )
 {\
     int i_last = runlevel->last = x264_coeff_last##num(dct);\
     int i_total = 0;\
+    int mask = 0;\
     do\
     {\
         int r = 0;\
         runlevel->level[i_total] = dct[i_last];\
+        mask |= 1 << (i_last);\
         while( --i_last >= 0 && dct[i_last] == 0 )\
             r++;\
         runlevel->run[i_total++] = r;\
     } while( i_last >= 0 );\
+    runlevel->mask = mask;\
     return i_total;\
 }
 
diff --git a/common/vlc.c b/common/vlc.c
index bd2fc52c..e6dc77a1 100644
--- a/common/vlc.c
+++ b/common/vlc.c
@@ -738,7 +738,7 @@ const vlc_t x264_total_zeros_2x4_dc[7][8] =
 };
 
 /* [MIN( i_zero_left-1, 6 )][run_before] */
-const vlc_t x264_run_before[7][16] =
+static const vlc_t run_before[7][16] =
 {
     { /* i_zero_left 1 */
         { 0x1, 1 }, /* str=1 */
@@ -799,8 +799,9 @@ const vlc_t x264_run_before[7][16] =
 };
 
 vlc_large_t x264_level_token[7][LEVEL_TABLE_SIZE];
+uint32_t x264_run_before[1<<16];
 
-void x264_cavlc_init( void )
+void x264_cavlc_init( x264_t *h )
 {
     for( int i_suffix = 0; i_suffix < 7; i_suffix++ )
         for( int16_t level = -LEVEL_TABLE_SIZE/2; level < LEVEL_TABLE_SIZE/2; level++ )
@@ -840,4 +841,27 @@ void x264_cavlc_init( void )
                 i_next++;
             vlc->i_next = i_next;
         }
+
+    for( int i = 1; i < (1<<16); i++ ) /* i is a nonzero-coefficient bitmask; entry 0 is not written here */
+    {
+        x264_run_level_t runlevel;
+        ALIGNED_ARRAY_16( dctcoef, dct, [16] );
+        int size = 0; /* total length in bits of all run codes for this mask */
+        int bits = 0; /* concatenated run codes, first code in the most significant position */
+        for( int j = 0; j < 16; j++ )
+            dct[j] = i&(1<<j); /* synthetic block: coefficient j is nonzero where bit j of i is set */
+        int total = h->quantf.coeff_level_run[DCT_LUMA_4x4]( dct, &runlevel );
+        int zeros = runlevel.last + 1 - total; /* zero coefficients below the last nonzero one */
+        for( int j = 0; j < total-1 && zeros > 0; j++ )
+        {
+            int idx = X264_MIN(zeros, 7) - 1; /* run_before row: MIN( zeros left, 7 ) - 1 */
+            int run = runlevel.run[j];
+            int len = run_before[idx][run].i_size;
+            size += len;
+            bits <<= len; /* make room so this code lands below the ones already accumulated */
+            bits |= run_before[idx][run].i_bits;
+            zeros -= run; /* zeros still unaccounted for after this run */
+        }
+        x264_run_before[i] = (bits << 5) + size; /* low 5 bits hold the size, the rest the code */
+    }
 }
diff --git a/common/x86/quant-a.asm b/common/x86/quant-a.asm
index d3db5cb8..dd904ba8 100644
--- a/common/x86/quant-a.asm
+++ b/common/x86/quant-a.asm
@@ -1352,8 +1352,16 @@ cglobal coeff_level_run%1,0,7
     movifnidn t1, r1mp
     pxor    m2, m2
     LAST_MASK %1, t5d, t0-(%1&1)*SIZEOF_DCTCOEF, t4d
-    not    t5d
-    shl    t5d, 32-((%1+1)&~1)
+%if %1==15
+    shr   t5d, 1
+%elif %1==8
+    and   t5d, 0xff
+%elif %1==4
+    and   t5d, 0xf
+%endif
+    xor   t5d, (1<<%1)-1
+    mov   [t1+4], t5d
+    shl    t5d, 32-%1
     mov    t4d, %1-1
     LZCOUNT t3d, t5d, 0x1f
     xor    t6d, t6d
@@ -1365,12 +1373,12 @@ cglobal coeff_level_run%1,0,7
     LZCOUNT t3d, t5d, 0x1f
 %ifdef HIGH_BIT_DEPTH
     mov    t2d, [t0+t4*4]
-    mov   [t1+t6  +4+16*4], t3b
-    mov   [t1+t6*4+ 4], t2d
+    mov   [t1+t6+8+16*4], t3b
+    mov   [t1+t6*4+ 8], t2d
 %else
     mov    t2w, [t0+t4*2]
-    mov   [t1+t6  +4+16*2], t3b
-    mov   [t1+t6*2+ 4], t2w
+    mov   [t1+t6+8+16*2], t3b
+    mov   [t1+t6*2+ 8], t2w
 %endif
     inc    t3d
     shl    t5d, t3b
diff --git a/encoder/cavlc.c b/encoder/cavlc.c
index c2fbf067..d7489439 100644
--- a/encoder/cavlc.c
+++ b/encoder/cavlc.c
@@ -132,6 +132,7 @@ static int x264_cavlc_block_residual_internal( x264_t *h, int ctx_block_cat, dct
     runlevel.level[1] = 2;
     runlevel.level[2] = 2;
     i_total = h->quantf.coeff_level_run[ctx_block_cat]( l, &runlevel );
+    x264_prefetch( &x264_run_before[runlevel.mask] );
     i_total_zero = runlevel.last + 1 - i_total;
 
     i_trailing = ((((runlevel.level[0]+1) | (1-runlevel.level[0])) >> 31) & 1) // abs(runlevel.level[0])>1
@@ -188,12 +189,8 @@ static int x264_cavlc_block_residual_internal( x264_t *h, int ctx_block_cat, dct
     else if( (uint8_t)i_total < count_cat[ctx_block_cat] )
         bs_write_vlc( s, x264_total_zeros[i_total-1][i_total_zero] );
 
-    for( int i = 0; i < i_total-1 && i_total_zero > 0; i++ )
-    {
-        int i_zl = X264_MIN( i_total_zero, 7 );
-        bs_write_vlc( s, x264_run_before[i_zl-1][runlevel.run[i]] );
-        i_total_zero -= runlevel.run[i];
-    }
+    int zero_run_code = x264_run_before[runlevel.mask];
+    bs_write( s, zero_run_code&0x1f, zero_run_code>>5 );
 
     return i_total;
 }
diff --git a/encoder/encoder.c b/encoder/encoder.c
index 39fc4097..a9732ed5 100644
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -1173,10 +1173,6 @@ x264_t *x264_encoder_open( x264_param_t *param )
     x264_predict_8x16c_init( h->param.cpu, h->predict_8x16c );
     x264_predict_8x8_init( h->param.cpu, h->predict_8x8, &h->predict_8x8_filter );
     x264_predict_4x4_init( h->param.cpu, h->predict_4x4 );
-    if( h->param.b_cabac )
-        x264_cabac_init( h );
-    else
-        x264_cavlc_init();
     x264_pixel_init( h->param.cpu, &h->pixf );
     x264_dct_init( h->param.cpu, &h->dctf );
     x264_zigzag_init( h->param.cpu, &h->zigzagf_progressive, &h->zigzagf_interlaced );
@@ -1186,6 +1182,10 @@ x264_t *x264_encoder_open( x264_param_t *param )
     x264_deblock_init( h->param.cpu, &h->loopf, PARAM_INTERLACED );
     x264_bitstream_init( h->param.cpu, &h->bsf );
     x264_dct_init_weights();
+    if( h->param.b_cabac )
+        x264_cabac_init( h );
+    else
+        x264_cavlc_init( h );
 
     mbcmp_init( h );
     chroma_dsp_init( h );
diff --git a/tools/checkasm.c b/tools/checkasm.c
index 204e3b7b..75ce9688 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -2013,6 +2013,7 @@ static int check_quant( int cpu_ref, int cpu_new )
             int result_c = call_c( qf_c.lastname, dct1+ac, &runlevel_c ); \
             int result_a = call_a( qf_a.lastname, dct1+ac, &runlevel_a ); \
             if( result_c != result_a || runlevel_c.last != runlevel_a.last || \
+                runlevel_c.mask != runlevel_a.mask || \
                 memcmp(runlevel_c.level, runlevel_a.level, sizeof(dctcoef)*result_c) || \
                 memcmp(runlevel_c.run, runlevel_a.run, sizeof(uint8_t)*(result_c-1)) ) \
             { \