Pure asm version of level/run coding. Over 2x faster than C.
Up to 40% faster CAVLC RDO. Overall benefit up to ~7.5% with RDO or ~5% with fast encoding settings.
int i_bits_encoded; /* RD only */
} bs_t;
+typedef struct /* result buffer filled by the coeff_level_run functions */
+{
+ int last; /* index of the last nonzero coefficient (the C version stores x264_coeff_last<num>(dct) here) */
+ int16_t level[16]; /* nonzero coefficient values, stored starting from the last one and moving down */
+ uint8_t run[16]; /* run[i]: number of zero coefficients found directly below level[i] */
+} x264_run_level_t;
+
extern const vlc_t x264_coeff0_token[5];
extern const vlc_t x264_coeff_token[5][16*4];
extern const vlc_t x264_total_zeros[15][16];
return x264_coeff_last_internal( l, 64 );
}
+#define level_run(num) /* defines the C version of x264_coeff_level_run<num> */\
+static int x264_coeff_level_run##num( int16_t *dct, x264_run_level_t *runlevel )\
+{\
+ int i_last = runlevel->last = x264_coeff_last##num(dct); /* start at the index given by x264_coeff_last<num> */\
+ int i_total = 0; /* number of level/run pairs stored so far */\
+ do\
+ {\
+ int r = 0; /* zeros counted below the current level */\
+ runlevel->level[i_total] = dct[i_last];\
+ while( --i_last >= 0 && dct[i_last] == 0 ) /* walk down until the next nonzero coefficient or the start of dct */\
+ r++;\
+ runlevel->run[i_total++] = r;\
+ } while( i_last >= 0 );\
+ return i_total; /* count of level/run pairs written */\
+}
+
+level_run(4) /* x264_coeff_level_run4 */
+level_run(15) /* x264_coeff_level_run15 */
+level_run(16) /* x264_coeff_level_run16 */
+
+
void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
{
pf->quant_8x8 = quant_8x8;
pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15;
pf->coeff_last[ DCT_LUMA_4x4] = x264_coeff_last16;
pf->coeff_last[ DCT_LUMA_8x8] = x264_coeff_last64;
+ pf->coeff_level_run[DCT_CHROMA_DC] = x264_coeff_level_run4;
+ pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15;
+ pf->coeff_level_run[ DCT_LUMA_4x4] = x264_coeff_level_run16;
#ifdef HAVE_MMX
if( cpu&X264_CPU_MMX )
pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_mmxext;
pf->coeff_last[ DCT_LUMA_4x4] = x264_coeff_last16_mmxext;
pf->coeff_last[ DCT_LUMA_8x8] = x264_coeff_last64_mmxext;
+ pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_mmxext;
+ pf->coeff_level_run[ DCT_LUMA_4x4] = x264_coeff_level_run16_mmxext;
#endif
pf->coeff_last[DCT_CHROMA_DC] = x264_coeff_last4_mmxext;
+ pf->coeff_level_run[DCT_CHROMA_DC] = x264_coeff_level_run4_mmxext;
}
if( cpu&X264_CPU_SSE2 )
pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_sse2;
pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_sse2;
pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2;
+ pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_sse2;
+ pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_sse2;
}
if( cpu&X264_CPU_SSSE3 )
#endif
pf->coeff_last[ DCT_LUMA_DC] = pf->coeff_last[DCT_LUMA_4x4];
pf->coeff_last[DCT_CHROMA_AC] = pf->coeff_last[ DCT_LUMA_AC];
+ pf->coeff_level_run[ DCT_LUMA_DC] = pf->coeff_level_run[DCT_LUMA_4x4];
+ pf->coeff_level_run[DCT_CHROMA_AC] = pf->coeff_level_run[ DCT_LUMA_AC];
}
int (*decimate_score16)( int16_t *dct );
int (*decimate_score64)( int16_t *dct );
int (*coeff_last[6])( int16_t *dct );
+ int (*coeff_level_run[5])( int16_t *dct, x264_run_level_t *runlevel );
} x264_quant_function_t;
void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf );
DECIMATE8x8 sse2
DECIMATE8x8 ssse3
+;-----------------------------------------------------------------------------
+; int x264_coeff_last( int16_t *dct )
+;-----------------------------------------------------------------------------
+
%macro LAST_MASK_SSE2 2-3
movdqa xmm0, [%2+ 0]
pxor xmm2, xmm2
%endif
%define LAST_MASK LAST_MASK_SSE2
COEFF_LAST sse2
+
+;-----------------------------------------------------------------------------
+; int x264_coeff_level_run( int16_t *dct, x264_run_level_t *runlevel )
+;-----------------------------------------------------------------------------
+
+%macro LAST_MASK4_MMX 2-3 ; %1 = destination mask register, %2 = address of four 16-bit coefficients; %3 is accepted but unused
+ movq mm0, [%2] ; load four words
+ pxor mm2, mm2 ; mm2 = 0 for the comparison below
+ packsswb mm0, mm0 ; signed-saturate words to bytes: zero stays zero, nonzero stays nonzero
+ pcmpeqb mm0, mm2 ; 0xff in every byte whose coefficient is zero
+ pmovmskb %1, mm0 ; %1 = one bit per byte, set where the coefficient is zero
+%endmacro
+
+; t6 = eax for return, t3 = ecx for shift, t[01] = r[01] for x86_64 args
+%ifdef ARCH_X86_64
+ DECLARE_REG_TMP 0,1,2,3,4,5,6
+%else
+ DECLARE_REG_TMP 6,3,2,1,4,5,0
+%endif
+
+%macro COEFF_LEVELRUN 2 ; %1 = cpu suffix, %2 = coefficient count; emits x264_coeff_level_run%2_%1
+cglobal x264_coeff_level_run%2_%1,0,7
+ movifnidn t0d, r0m ; t0 = first argument (dct)
+ movifnidn t1d, r1m ; t1 = second argument (runlevel)
+ LAST_MASK t2d, t0-(%2&1)*2, t4d ; mask from LAST_MASK (LAST_MASK4_MMX sets a bit per zero coefficient); odd counts read from one coefficient earlier
+ not t2d ; invert so set bits mark nonzero coefficients
+ shl t2d, 32-((%2+1)&~1) ; align the highest coefficient's bit with bit 31
+ mov t4d, %2-1 ; t4 = index of the highest coefficient
+ mov t5d, t2d
+ bsr t3d, t2d
+ xor t6d, t6d ; t6 = number of pairs written (return value, see register note above)
+ shl t5d, 1
+ xor t3d, 0x1f ; 31-bsr = number of leading zero bits
+ sub t4d, t3d ; t4 = index of the last nonzero coefficient
+ shl t5d, t3b ; together with the shl by 1: t5 = mask with the last nonzero bit shifted out
+ mov [t1], t4d ; runlevel->last
+.loop:
+ bsr t3d, t5d ; NOTE(review): bsr leaves its destination undefined when t5d is 0 - confirm this can only affect the final run entry
+ xor t3d, 0x1f ; t3 = zero bits before the next set bit = run length
+ mov t2w, [t0+t4*2] ; current nonzero coefficient
+ mov [t1+t6 +36], t3b ; runlevel->run[t6]; 36 = 4 (last) + 32 (level[16]), assuming 4-byte int and no padding
+ mov [t1+t6*2+ 4], t2w ; runlevel->level[t6]
+ inc t3d ; run length plus the nonzero coefficient itself
+ shl t5d, t3b ; drop the zero run and the next set bit from the mask
+ inc t6d
+ sub t4d, t3d ; step the index down to the next nonzero coefficient
+ jge .loop ; keep going while the index is still >= 0
+ RET
+%endmacro
+
+%ifndef ARCH_X86_64
+%define LAST_MASK LAST_MASK_MMX
+COEFF_LEVELRUN mmxext, 15
+COEFF_LEVELRUN mmxext, 16
+%endif
+%define LAST_MASK LAST_MASK4_MMX
+COEFF_LEVELRUN mmxext, 4
+%define LAST_MASK LAST_MASK_SSE2
+COEFF_LEVELRUN sse2, 15
+COEFF_LEVELRUN sse2, 16
int x264_coeff_last15_sse2( int16_t *dct );
int x264_coeff_last16_sse2( int16_t *dct );
int x264_coeff_last64_sse2( int16_t *dct );
+int x264_coeff_level_run16_mmxext( int16_t *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run16_sse2( int16_t *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run15_mmxext( int16_t *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run15_sse2( int16_t *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run4_mmxext( int16_t *dct, x264_run_level_t *runlevel );
#endif
/* Weight highly against overflows. */
s->i_bits_encoded += 1000000;
#else
- x264_log(h, X264_LOG_WARNING, "OVERFLOW levelcode=%d is only allowed in High Profile", i_level_code );
+ x264_log(h, X264_LOG_WARNING, "OVERFLOW levelcode=%d is only allowed in High Profile\n", i_level_code );
/* clip level, preserving sign */
i_level_code = (1<<12) - 2 + (i_level_code & 1);
#endif
{
static const uint8_t ct_index[17] = {0,0,1,1,2,2,2,2,3,3,3,3,3,3,3,3,3};
static const uint8_t ctz_index[8] = {3,0,1,0,2,0,1,0};
- int level[16], run[16];
- int i_trailing, i_total_zero, i_last, i_suffix_length, i;
+ x264_run_level_t runlevel;
+ int i_trailing, i_total_zero, i_suffix_length, i;
int i_total = 0;
unsigned int i_sign;
/* x264_mb_predict_non_zero_code return 0 <-> (16+16+1)>>1 = 16 */
return;
}
- i_last = h->quantf.coeff_last[i_ctxBlockCat](l);
- i_total_zero = i_last + 1;
-
/* level and run and total */
/* set these to 2 to allow branchless i_trailing calculation */
- level[1] = 2;
- level[2] = 2;
- do
- {
- int r = 0;
- level[i_total] = l[i_last];
- while( --i_last >= 0 && l[i_last] == 0 )
- r++;
- run[i_total++] = r;
- } while( i_last >= 0 );
+ runlevel.level[1] = 2;
+ runlevel.level[2] = 2;
+ i_total = h->quantf.coeff_level_run[i_ctxBlockCat]( l, &runlevel );
+ i_total_zero = runlevel.last + 1 - i_total;
h->mb.cache.non_zero_count[x264_scan8[i_idx]] = i_total;
- i_total_zero -= i_total;
- i_trailing = ((((level[0]+1) | (1-level[0])) >> 31) & 1) // abs(level[0])>1
- | ((((level[1]+1) | (1-level[1])) >> 31) & 2)
- | ((((level[2]+1) | (1-level[2])) >> 31) & 4);
+ i_trailing = ((((runlevel.level[0]+1) | (1-runlevel.level[0])) >> 31) & 1) // abs(runlevel.level[0])>1
+ | ((((runlevel.level[1]+1) | (1-runlevel.level[1])) >> 31) & 2)
+ | ((((runlevel.level[2]+1) | (1-runlevel.level[2])) >> 31) & 4);
i_trailing = ctz_index[i_trailing];
- i_sign = ((level[2] >> 31) & 1)
- | ((level[1] >> 31) & 2)
- | ((level[0] >> 31) & 4);
+ i_sign = ((runlevel.level[2] >> 31) & 1)
+ | ((runlevel.level[1] >> 31) & 2)
+ | ((runlevel.level[0] >> 31) & 4);
i_sign >>= 3-i_trailing;
/* total/trailing */
if( i_trailing < i_total )
{
- int16_t val = level[i_trailing];
- int16_t val_original = level[i_trailing]+LEVEL_TABLE_SIZE/2;
+ int16_t val = runlevel.level[i_trailing];
+ int16_t val_original = runlevel.level[i_trailing]+LEVEL_TABLE_SIZE/2;
if( i_trailing < 3 )
- val -= (val>>15)|1; /* as level[i] can't be 1 for the first one if i_trailing < 3 */
+ val -= (val>>15)|1; /* as runlevel.level[i] can't be 1 for the first one if i_trailing < 3 */
val += LEVEL_TABLE_SIZE/2;
if( (unsigned)val_original < LEVEL_TABLE_SIZE )
i_suffix_length = block_residual_write_cavlc_escape( h, s, i_suffix_length, val-LEVEL_TABLE_SIZE/2 );
for( i = i_trailing+1; i < i_total; i++ )
{
- val = level[i] + LEVEL_TABLE_SIZE/2;
+ val = runlevel.level[i] + LEVEL_TABLE_SIZE/2;
if( (unsigned)val < LEVEL_TABLE_SIZE )
{
bs_write_vlc( s, x264_level_token[i_suffix_length][val] );
for( i = 0; i < i_total-1 && i_total_zero > 0; i++ )
{
int i_zl = X264_MIN( i_total_zero - 1, 6 );
- bs_write_vlc( s, x264_run_before[i_zl][run[i]] );
- i_total_zero -= run[i];
+ bs_write_vlc( s, x264_run_before[i_zl][runlevel.run[i]] );
+ i_total_zero -= runlevel.run[i];
}
}
ok = oks[1]; used_asm = used_asms[1];
report( "dequant :" );
- ok = 1;
+ ok = 1; used_asm = 0;
if( qf_a.denoise_dct != qf_ref.denoise_dct )
{
int size;
dct1[idx] = !(rand()&3) + (!(rand()&15))*(rand()&3); \
if( ac ) \
dct1[0] = 0; \
- memcpy( dct2, dct1, w*w*2 ); \
- result_c = call_c1( qf_c.decname, (void*)dct2 ); \
- result_a = call_a1( qf_a.decname, (void*)dct2 ); \
+ result_c = call_c( qf_c.decname, (void*)dct1 ); \
+ result_a = call_a( qf_a.decname, (void*)dct1 ); \
if( X264_MIN(result_c,thresh) != X264_MIN(result_a,thresh) ) \
{ \
ok = 0; \
fprintf( stderr, #decname ": [FAILED]\n" ); \
break; \
} \
- call_c2( qf_c.decname, (void*)dct2 ); \
- call_a2( qf_a.decname, (void*)dct2 ); \
} \
}
- ok = 1;
+ ok = 1; used_asm = 0;
TEST_DECIMATE( decimate_score64, 8, 0, 6 );
TEST_DECIMATE( decimate_score16, 4, 0, 6 );
TEST_DECIMATE( decimate_score15, 4, 1, 7 );
nnz |= dct1[idx] = !(rand()&3) + (!(rand()&15))*rand(); \
if( !nnz ) \
dct1[ac] = 1; \
- memcpy( dct2, dct1, w*w*2 ); \
- result_c = call_c1( qf_c.last, (void*)(dct2+ac) ); \
- result_a = call_a1( qf_a.last, (void*)(dct2+ac) ); \
+ result_c = call_c( qf_c.last, (void*)(dct1+ac) ); \
+ result_a = call_a( qf_a.last, (void*)(dct1+ac) ); \
if( result_c != result_a ) \
{ \
ok = 0; \
fprintf( stderr, #lastname ": [FAILED]\n" ); \
break; \
} \
- call_c2( qf_c.last, (void*)(dct2+ac) ); \
- call_a2( qf_a.last, (void*)(dct2+ac) ); \
} \
}
- ok = 1;
+ ok = 1; used_asm = 0;
TEST_LAST( coeff_last[DCT_CHROMA_DC], coeff_last4, 2, 0 );
TEST_LAST( coeff_last[ DCT_LUMA_AC], coeff_last15, 4, 1 );
TEST_LAST( coeff_last[ DCT_LUMA_4x4], coeff_last16, 4, 0 );
TEST_LAST( coeff_last[ DCT_LUMA_8x8], coeff_last64, 8, 0 );
report( "coeff_last :" );
+#define TEST_LEVELRUN( lastname, name, w, ac ) /* checks qf_a.lastname against qf_c.lastname on 100 random blocks of up to w*w coefficients */ \
+ if( qf_a.lastname != qf_ref.lastname ) /* only run when this function pointer differs from the reference set */ \
+ { \
+ set_func_name( #name ); \
+ used_asm = 1; \
+ for( i = 0; i < 100; i++ ) \
+ { \
+ x264_run_level_t runlevel_c, runlevel_a; \
+ int result_c, result_a, idx, nnz=0; \
+ int max = rand() & (w*w-1); /* random upper bound for the filled coefficients */ \
+ memset( dct1, 0, w*w*2 ); \
+ memcpy( &runlevel_a, buf1+i, sizeof(x264_run_level_t) ); /* give both outputs the same starting bytes from buf1 */ \
+ memcpy( &runlevel_c, buf1+i, sizeof(x264_run_level_t) ); \
+ for( idx = ac; idx < max; idx++ ) /* fill dct1[ac..max) with mostly 0/1 values and an occasional larger random one */ \
+ nnz |= dct1[idx] = !(rand()&3) + (!(rand()&15))*rand(); \
+ if( !nnz ) /* make sure at least one coefficient is nonzero */ \
+ dct1[ac] = 1; \
+ result_c = call_c( qf_c.lastname, (void*)(dct1+ac), &runlevel_c ); \
+ result_a = call_a( qf_a.lastname, (void*)(dct1+ac), &runlevel_a ); \
+ if( result_c != result_a || runlevel_c.last != runlevel_a.last || \
+ memcmp(runlevel_c.level, runlevel_a.level, sizeof(int16_t)*result_c) || \
+ memcmp(runlevel_c.run, runlevel_a.run, sizeof(uint8_t)*(result_c-1)) ) /* the final run entry is deliberately left out of the comparison */ \
+ { \
+ ok = 0; \
+ fprintf( stderr, #name ": [FAILED]\n" ); \
+ break; \
+ } \
+ } \
+ }
+
+ ok = 1; used_asm = 0;
+ TEST_LEVELRUN( coeff_level_run[DCT_CHROMA_DC], coeff_level_run4, 2, 0 );
+ TEST_LEVELRUN( coeff_level_run[ DCT_LUMA_AC], coeff_level_run15, 4, 1 );
+ TEST_LEVELRUN( coeff_level_run[ DCT_LUMA_4x4], coeff_level_run16, 4, 0 );
+ report( "coeff_level_run :" );
+
return ret;
}