From: Fiona Glaser Date: Thu, 8 Dec 2011 21:45:41 +0000 (-0800) Subject: Use a large LUT for CAVLC zero-run bit codes X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=9c0fa2d63f549a44f869562cffa9c041a32ae41d;p=libx264 Use a large LUT for CAVLC zero-run bit codes Helps the most with trellis and RD, but also helps with bitstream writing. Seems at worst neutral even in the extreme case of a CPU with small L2 cache (e.g. ARM Cortex A8). --- diff --git a/common/bitstream.h b/common/bitstream.h index f8838401..b836eec2 100644 --- a/common/bitstream.h +++ b/common/bitstream.h @@ -56,6 +56,7 @@ typedef struct bs_s typedef struct { int last; + int mask; dctcoef level[16]; uint8_t run[16]; } x264_run_level_t; @@ -65,7 +66,6 @@ extern const vlc_t x264_coeff_token[6][16][4]; extern const vlc_t x264_total_zeros[15][16]; extern const vlc_t x264_total_zeros_2x2_dc[3][4]; extern const vlc_t x264_total_zeros_2x4_dc[7][8]; -extern const vlc_t x264_run_before[7][16]; typedef struct { @@ -82,6 +82,11 @@ void x264_bitstream_init( int cpu, x264_bitstream_function_t *pf ); #define LEVEL_TABLE_SIZE 128 extern vlc_large_t x264_level_token[7][LEVEL_TABLE_SIZE]; +/* The longest possible set of zero run codes sums to 25 bits. This leaves + * plenty of room for both the code (25 bits) and size (5 bits) in a uint32_t. */ + +extern uint32_t x264_run_before[1<<16]; + static inline void bs_init( bs_t *s, void *p_data, int i_data ) { int offset = ((intptr_t)p_data & 3); diff --git a/common/common.h b/common/common.h index 2704f291..b6cec651 100644 --- a/common/common.h +++ b/common/common.h @@ -236,7 +236,7 @@ void x264_log( x264_t *h, int i_level, const char *psz_fmt, ... 
); void x264_reduce_fraction( uint32_t *n, uint32_t *d ); void x264_reduce_fraction64( uint64_t *n, uint64_t *d ); -void x264_cavlc_init( void ); +void x264_cavlc_init( x264_t *h ); void x264_cabac_init( x264_t *h ); static ALWAYS_INLINE pixel x264_clip_pixel( int x ) diff --git a/common/quant.c b/common/quant.c index a6116b4c..a57ca2e2 100644 --- a/common/quant.c +++ b/common/quant.c @@ -373,14 +373,17 @@ static int x264_coeff_level_run##num( dctcoef *dct, x264_run_level_t *runlevel ) {\ int i_last = runlevel->last = x264_coeff_last##num(dct);\ int i_total = 0;\ + int mask = 0;\ do\ {\ int r = 0;\ runlevel->level[i_total] = dct[i_last];\ + mask |= 1 << (i_last);\ while( --i_last >= 0 && dct[i_last] == 0 )\ r++;\ runlevel->run[i_total++] = r;\ } while( i_last >= 0 );\ + runlevel->mask = mask;\ return i_total;\ } diff --git a/common/vlc.c b/common/vlc.c index bd2fc52c..e6dc77a1 100644 --- a/common/vlc.c +++ b/common/vlc.c @@ -738,7 +738,7 @@ const vlc_t x264_total_zeros_2x4_dc[7][8] = }; /* [MIN( i_zero_left-1, 6 )][run_before] */ -const vlc_t x264_run_before[7][16] = +static const vlc_t run_before[7][16] = { { /* i_zero_left 1 */ { 0x1, 1 }, /* str=1 */ @@ -799,8 +799,9 @@ const vlc_t x264_run_before[7][16] = }; vlc_large_t x264_level_token[7][LEVEL_TABLE_SIZE]; +uint32_t x264_run_before[1<<16]; -void x264_cavlc_init( void ) +void x264_cavlc_init( x264_t *h ) { for( int i_suffix = 0; i_suffix < 7; i_suffix++ ) for( int16_t level = -LEVEL_TABLE_SIZE/2; level < LEVEL_TABLE_SIZE/2; level++ ) @@ -840,4 +841,27 @@ void x264_cavlc_init( void ) i_next++; vlc->i_next = i_next; } + + for( int i = 1; i < (1<<16); i++ ) + { + x264_run_level_t runlevel; + ALIGNED_ARRAY_16( dctcoef, dct, [16] ); + int size = 0; + int bits = 0; + for( int j = 0; j < 16; j++ ) + dct[j] = i&(1<<j); + int total = h->quantf.coeff_level_run[DCT_LUMA_4x4]( dct, &runlevel ); + int zeros = runlevel.last + 1 - total; + for( int j = 0; j < total-1 && zeros > 0; j++ ) + { + int idx = X264_MIN(zeros, 7) - 1; + int run = 
runlevel.run[j]; + int len = run_before[idx][run].i_size; + size += len; + bits <<= len; + bits |= run_before[idx][run].i_bits; + zeros -= run; + } + x264_run_before[i] = (bits << 5) + size; + } } diff --git a/common/x86/quant-a.asm b/common/x86/quant-a.asm index d3db5cb8..dd904ba8 100644 --- a/common/x86/quant-a.asm +++ b/common/x86/quant-a.asm @@ -1352,8 +1352,16 @@ cglobal coeff_level_run%1,0,7 movifnidn t1, r1mp pxor m2, m2 LAST_MASK %1, t5d, t0-(%1&1)*SIZEOF_DCTCOEF, t4d - not t5d - shl t5d, 32-((%1+1)&~1) +%if %1==15 + shr t5d, 1 +%elif %1==8 + and t5d, 0xff +%elif %1==4 + and t5d, 0xf +%endif + xor t5d, (1<<%1)-1 + mov [t1+4], t5d + shl t5d, 32-%1 mov t4d, %1-1 LZCOUNT t3d, t5d, 0x1f xor t6d, t6d @@ -1365,12 +1373,12 @@ cglobal coeff_level_run%1,0,7 LZCOUNT t3d, t5d, 0x1f %ifdef HIGH_BIT_DEPTH mov t2d, [t0+t4*4] - mov [t1+t6 +4+16*4], t3b - mov [t1+t6*4+ 4], t2d + mov [t1+t6+8+16*4], t3b + mov [t1+t6*4+ 8], t2d %else mov t2w, [t0+t4*2] - mov [t1+t6 +4+16*2], t3b - mov [t1+t6*2+ 4], t2w + mov [t1+t6+8+16*2], t3b + mov [t1+t6*2+ 8], t2w %endif inc t3d shl t5d, t3b diff --git a/encoder/cavlc.c b/encoder/cavlc.c index c2fbf067..d7489439 100644 --- a/encoder/cavlc.c +++ b/encoder/cavlc.c @@ -132,6 +132,7 @@ static int x264_cavlc_block_residual_internal( x264_t *h, int ctx_block_cat, dct runlevel.level[1] = 2; runlevel.level[2] = 2; i_total = h->quantf.coeff_level_run[ctx_block_cat]( l, &runlevel ); + x264_prefetch( &x264_run_before[runlevel.mask] ); i_total_zero = runlevel.last + 1 - i_total; i_trailing = ((((runlevel.level[0]+1) | (1-runlevel.level[0])) >> 31) & 1) // abs(runlevel.level[0])>1 @@ -188,12 +189,8 @@ static int x264_cavlc_block_residual_internal( x264_t *h, int ctx_block_cat, dct else if( (uint8_t)i_total < count_cat[ctx_block_cat] ) bs_write_vlc( s, x264_total_zeros[i_total-1][i_total_zero] ); - for( int i = 0; i < i_total-1 && i_total_zero > 0; i++ ) - { - int i_zl = X264_MIN( i_total_zero, 7 ); - bs_write_vlc( s, 
x264_run_before[i_zl-1][runlevel.run[i]] ); - i_total_zero -= runlevel.run[i]; - } + int zero_run_code = x264_run_before[runlevel.mask]; + bs_write( s, zero_run_code&0x1f, zero_run_code>>5 ); return i_total; } diff --git a/encoder/encoder.c b/encoder/encoder.c index 39fc4097..a9732ed5 100644 --- a/encoder/encoder.c +++ b/encoder/encoder.c @@ -1173,10 +1173,6 @@ x264_t *x264_encoder_open( x264_param_t *param ) x264_predict_8x16c_init( h->param.cpu, h->predict_8x16c ); x264_predict_8x8_init( h->param.cpu, h->predict_8x8, &h->predict_8x8_filter ); x264_predict_4x4_init( h->param.cpu, h->predict_4x4 ); - if( h->param.b_cabac ) - x264_cabac_init( h ); - else - x264_cavlc_init(); x264_pixel_init( h->param.cpu, &h->pixf ); x264_dct_init( h->param.cpu, &h->dctf ); x264_zigzag_init( h->param.cpu, &h->zigzagf_progressive, &h->zigzagf_interlaced ); @@ -1186,6 +1182,10 @@ x264_t *x264_encoder_open( x264_param_t *param ) x264_deblock_init( h->param.cpu, &h->loopf, PARAM_INTERLACED ); x264_bitstream_init( h->param.cpu, &h->bsf ); x264_dct_init_weights(); + if( h->param.b_cabac ) + x264_cabac_init( h ); + else + x264_cavlc_init( h ); mbcmp_init( h ); chroma_dsp_init( h ); diff --git a/tools/checkasm.c b/tools/checkasm.c index 204e3b7b..75ce9688 100644 --- a/tools/checkasm.c +++ b/tools/checkasm.c @@ -2013,6 +2013,7 @@ static int check_quant( int cpu_ref, int cpu_new ) int result_c = call_c( qf_c.lastname, dct1+ac, &runlevel_c ); \ int result_a = call_a( qf_a.lastname, dct1+ac, &runlevel_a ); \ if( result_c != result_a || runlevel_c.last != runlevel_a.last || \ + runlevel_c.mask != runlevel_a.mask || \ memcmp(runlevel_c.level, runlevel_a.level, sizeof(dctcoef)*result_c) || \ memcmp(runlevel_c.run, runlevel_a.run, sizeof(uint8_t)*(result_c-1)) ) \ { \