Helps the most with trellis and RD, but also helps with bitstream writing.
Seems at worst neutral even in the extreme case of a CPU with a small L2 cache (e.g. ARM Cortex-A8).
typedef struct
{
int last;
+ int mask; /* bit i set for each nonzero coeff index i (filled by coeff_level_run); indexes x264_run_before[]. NOTE(review): asm appears to store it at byte offset 4 -- keep it right after `last`. */
dctcoef level[16];
uint8_t run[16];
} x264_run_level_t;
extern const vlc_t x264_total_zeros[15][16];
extern const vlc_t x264_total_zeros_2x2_dc[3][4];
extern const vlc_t x264_total_zeros_2x4_dc[7][8];
-extern const vlc_t x264_run_before[7][16];
typedef struct
{
#define LEVEL_TABLE_SIZE 128
extern vlc_large_t x264_level_token[7][LEVEL_TABLE_SIZE];
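+/* The longest possible set of zero run codes sums to 25 bits. This leaves
+ * plenty of room for both the code (25 bits) and size (5 bits) in a uint32_t.
+ * Each entry is packed as (code << 5) + size and is indexed by the 16-bit
+ * mask of nonzero coefficients (x264_run_level_t.mask). */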
+
+extern uint32_t x264_run_before[1<<16];
+
static inline void bs_init( bs_t *s, void *p_data, int i_data )
{
int offset = ((intptr_t)p_data & 3);
void x264_reduce_fraction( uint32_t *n, uint32_t *d );
void x264_reduce_fraction64( uint64_t *n, uint64_t *d );
-void x264_cavlc_init( void );
+void x264_cavlc_init( x264_t *h );
void x264_cabac_init( x264_t *h );
static ALWAYS_INLINE pixel x264_clip_pixel( int x )
{\
int i_last = runlevel->last = x264_coeff_last##num(dct);\
int i_total = 0;\
+ int mask = 0;\
do\
{\
int r = 0;\
runlevel->level[i_total] = dct[i_last];\
+ mask |= 1 << (i_last);\
while( --i_last >= 0 && dct[i_last] == 0 )\
r++;\
runlevel->run[i_total++] = r;\
} while( i_last >= 0 );\
+ runlevel->mask = mask;\
return i_total;\
}
};
/* [MIN( i_zero_left-1, 6 )][run_before] */
-const vlc_t x264_run_before[7][16] =
+static const vlc_t run_before[7][16] =
{
{ /* i_zero_left 1 */
{ 0x1, 1 }, /* str=1 */
};
vlc_large_t x264_level_token[7][LEVEL_TABLE_SIZE];
+uint32_t x264_run_before[1<<16];
-void x264_cavlc_init( void )
+void x264_cavlc_init( x264_t *h )
{
for( int i_suffix = 0; i_suffix < 7; i_suffix++ )
for( int16_t level = -LEVEL_TABLE_SIZE/2; level < LEVEL_TABLE_SIZE/2; level++ )
i_next++;
vlc->i_next = i_next;
}
+
+ for( int i = 1; i < (1<<16); i++ )
+ {
+ x264_run_level_t runlevel;
+ ALIGNED_ARRAY_16( dctcoef, dct, [16] );
+ int size = 0;
+ int bits = 0;
+ for( int j = 0; j < 16; j++ )
+ dct[j] = i&(1<<j);
+ int total = h->quantf.coeff_level_run[DCT_LUMA_4x4]( dct, &runlevel );
+ int zeros = runlevel.last + 1 - total;
+ for( int j = 0; j < total-1 && zeros > 0; j++ )
+ {
+ int idx = X264_MIN(zeros, 7) - 1;
+ int run = runlevel.run[j];
+ int len = run_before[idx][run].i_size;
+ size += len;
+ bits <<= len;
+ bits |= run_before[idx][run].i_bits;
+ zeros -= run;
+ }
+ x264_run_before[i] = (bits << 5) + size;
+ }
}
movifnidn t1, r1mp
pxor m2, m2
LAST_MASK %1, t5d, t0-(%1&1)*SIZEOF_DCTCOEF, t4d
- not t5d
- shl t5d, 32-((%1+1)&~1)
+%if %1==15
+ shr t5d, 1
+%elif %1==8
+ and t5d, 0xff
+%elif %1==4
+ and t5d, 0xf
+%endif
+ xor t5d, (1<<%1)-1
+ mov [t1+4], t5d
+ shl t5d, 32-%1
mov t4d, %1-1
LZCOUNT t3d, t5d, 0x1f
xor t6d, t6d
LZCOUNT t3d, t5d, 0x1f
%ifdef HIGH_BIT_DEPTH
mov t2d, [t0+t4*4]
- mov [t1+t6 +4+16*4], t3b
- mov [t1+t6*4+ 4], t2d
+ mov [t1+t6+8+16*4], t3b
+ mov [t1+t6*4+ 8], t2d
%else
mov t2w, [t0+t4*2]
- mov [t1+t6 +4+16*2], t3b
- mov [t1+t6*2+ 4], t2w
+ mov [t1+t6+8+16*2], t3b
+ mov [t1+t6*2+ 8], t2w
%endif
inc t3d
shl t5d, t3b
runlevel.level[1] = 2;
runlevel.level[2] = 2;
i_total = h->quantf.coeff_level_run[ctx_block_cat]( l, &runlevel );
+ x264_prefetch( &x264_run_before[runlevel.mask] );
i_total_zero = runlevel.last + 1 - i_total;
i_trailing = ((((runlevel.level[0]+1) | (1-runlevel.level[0])) >> 31) & 1) // abs(runlevel.level[0])>1
else if( (uint8_t)i_total < count_cat[ctx_block_cat] )
bs_write_vlc( s, x264_total_zeros[i_total-1][i_total_zero] );
- for( int i = 0; i < i_total-1 && i_total_zero > 0; i++ )
- {
- int i_zl = X264_MIN( i_total_zero, 7 );
- bs_write_vlc( s, x264_run_before[i_zl-1][runlevel.run[i]] );
- i_total_zero -= runlevel.run[i];
- }
+ int zero_run_code = x264_run_before[runlevel.mask];
+ bs_write( s, zero_run_code&0x1f, zero_run_code>>5 );
return i_total;
}
x264_predict_8x16c_init( h->param.cpu, h->predict_8x16c );
x264_predict_8x8_init( h->param.cpu, h->predict_8x8, &h->predict_8x8_filter );
x264_predict_4x4_init( h->param.cpu, h->predict_4x4 );
- if( h->param.b_cabac )
- x264_cabac_init( h );
- else
- x264_cavlc_init();
x264_pixel_init( h->param.cpu, &h->pixf );
x264_dct_init( h->param.cpu, &h->dctf );
x264_zigzag_init( h->param.cpu, &h->zigzagf_progressive, &h->zigzagf_interlaced );
x264_deblock_init( h->param.cpu, &h->loopf, PARAM_INTERLACED );
x264_bitstream_init( h->param.cpu, &h->bsf );
x264_dct_init_weights();
+ if( h->param.b_cabac )
+ x264_cabac_init( h );
+ else
+ x264_cavlc_init( h );
mbcmp_init( h );
chroma_dsp_init( h );
int result_c = call_c( qf_c.lastname, dct1+ac, &runlevel_c ); \
int result_a = call_a( qf_a.lastname, dct1+ac, &runlevel_a ); \
if( result_c != result_a || runlevel_c.last != runlevel_a.last || \
+ runlevel_c.mask != runlevel_a.mask || \
memcmp(runlevel_c.level, runlevel_a.level, sizeof(dctcoef)*result_c) || \
memcmp(runlevel_c.run, runlevel_a.run, sizeof(uint8_t)*(result_c-1)) ) \
{ \