From 406a40dc41438edac3f60d231eb9196b3d33008f Mon Sep 17 00:00:00 2001
From: Fiona Glaser
Date: Sat, 27 Dec 2008 21:36:14 -0500
Subject: [PATCH] Much faster CABAC RDO

Since RDO doesn't care about what order bit costs are calculated, merge sigmap and level coding into the same loop in RDO.
This is bit-exact for 4x4dct but slightly incorrect for 8x8dct due to the sigmap containing duplicated contexts.
However, the PSNR penalty of this is extremely small (~0.001db).
Speed benefit is about 15% in 4x4dct and 30% in 8x8dct residual bit cost calculation at QP20.
Overall encoding speed benefit is up to 5%, depending on encoding settings.
Also remove an old unnecessary CABAC table that hasn't been used for years.
---
 common/cabac.c  |  35 ---------------
 encoder/cabac.c | 116 +++++++++++++++++++++++++++++++++++++++++-------
 2 files changed, 101 insertions(+), 50 deletions(-)

diff --git a/common/cabac.c b/common/cabac.c
index 722451bd..7a2e94dd 100644
--- a/common/cabac.c
+++ b/common/cabac.c
@@ -742,41 +742,6 @@ const uint8_t x264_cabac_renorm_shift[64]= {
      0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 };
 
-static const uint8_t x264_cabac_probability[128] =
-{
-    FIX8(0.9812), FIX8(0.9802), FIX8(0.9792), FIX8(0.9781),
-    FIX8(0.9769), FIX8(0.9757), FIX8(0.9744), FIX8(0.9730),
-    FIX8(0.9716), FIX8(0.9700), FIX8(0.9684), FIX8(0.9667),
-    FIX8(0.9650), FIX8(0.9631), FIX8(0.9611), FIX8(0.9590),
-    FIX8(0.9568), FIX8(0.9545), FIX8(0.9521), FIX8(0.9495),
-    FIX8(0.9468), FIX8(0.9440), FIX8(0.9410), FIX8(0.9378),
-    FIX8(0.9345), FIX8(0.9310), FIX8(0.9273), FIX8(0.9234),
-    FIX8(0.9193), FIX8(0.9150), FIX8(0.9105), FIX8(0.9057),
-    FIX8(0.9006), FIX8(0.8953), FIX8(0.8897), FIX8(0.8838),
-    FIX8(0.8776), FIX8(0.8710), FIX8(0.8641), FIX8(0.8569),
-    FIX8(0.8492), FIX8(0.8411), FIX8(0.8326), FIX8(0.8237),
-    FIX8(0.8143), FIX8(0.8043), FIX8(0.7938), FIX8(0.7828),
-    FIX8(0.7712), FIX8(0.7590), FIX8(0.7461), FIX8(0.7325),
-    FIX8(0.7182), FIX8(0.7031), FIX8(0.6872), FIX8(0.6705),
-    FIX8(0.6528), FIX8(0.6343), FIX8(0.6147), FIX8(0.5941),
-    FIX8(0.5724), FIX8(0.5495), FIX8(0.5254), FIX8(0.5000),
-    FIX8(0.5000), FIX8(0.4746), FIX8(0.4505), FIX8(0.4276),
-    FIX8(0.4059), FIX8(0.3853), FIX8(0.3657), FIX8(0.3472),
-    FIX8(0.3295), FIX8(0.3128), FIX8(0.2969), FIX8(0.2818),
-    FIX8(0.2675), FIX8(0.2539), FIX8(0.2410), FIX8(0.2288),
-    FIX8(0.2172), FIX8(0.2062), FIX8(0.1957), FIX8(0.1857),
-    FIX8(0.1763), FIX8(0.1674), FIX8(0.1589), FIX8(0.1508),
-    FIX8(0.1431), FIX8(0.1359), FIX8(0.1290), FIX8(0.1224),
-    FIX8(0.1162), FIX8(0.1103), FIX8(0.1047), FIX8(0.0994),
-    FIX8(0.0943), FIX8(0.0895), FIX8(0.0850), FIX8(0.0807),
-    FIX8(0.0766), FIX8(0.0727), FIX8(0.0690), FIX8(0.0655),
-    FIX8(0.0622), FIX8(0.0590), FIX8(0.0560), FIX8(0.0532),
-    FIX8(0.0505), FIX8(0.0479), FIX8(0.0455), FIX8(0.0432),
-    FIX8(0.0410), FIX8(0.0389), FIX8(0.0369), FIX8(0.0350),
-    FIX8(0.0333), FIX8(0.0316), FIX8(0.0300), FIX8(0.0284),
-    FIX8(0.0270), FIX8(0.0256), FIX8(0.0243), FIX8(0.0231),
-    FIX8(0.0219), FIX8(0.0208), FIX8(0.0198), FIX8(0.0187)
-};
 /* -ln2(probability) */
 #define F(a,b) {FIX8(a),FIX8(b)}
 const uint16_t x264_cabac_entropy[128][2] =
diff --git a/encoder/cabac.c b/encoder/cabac.c
index 93aa88ba..4fa74033 100644
--- a/encoder/cabac.c
+++ b/encoder/cabac.c
@@ -636,6 +636,7 @@ static const uint8_t coeff_abs_level_transition[2][8] = {
     { 4, 4, 4, 4, 5, 6, 7, 7 }
 };
 
+#if !RDO_SKIP_BS
 static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int i_ctxBlockCat, int i_idx, int16_t *l, int i_count )
 {
     const int i_ctx_sig = significant_coeff_flag_offset[h->mb.b_interlaced][i_ctxBlockCat];
@@ -692,9 +693,7 @@ static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int i_ctxBl
     if( i == i_last )
     {
         i_coeff_abs_m1[i_coeff] = abs(l[i]) - 1;
-#if !RDO_SKIP_BS
         i_coeff_sign[i_coeff] = l[i] < 0;
-#endif
         i_coeff++;
     }
 
@@ -711,15 +710,10 @@ static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int i_ctxBl
         {
             x264_cabac_encode_decision( cb, ctx, 1 );
             ctx = coeff_abs_levelgt1_ctx[node_ctx] + i_ctx_level;
-#if RDO_SKIP_BS
-            cb->f8_bits_encoded += cabac_size_unary[i_prefix][cb->state[ctx]];
-            cb->state[ctx] = cabac_transition_unary[i_prefix][cb->state[ctx]];
-#else
             for( i = 0; i < i_prefix - 1; i++ )
                 x264_cabac_encode_decision( cb, ctx, 1 );
             if( i_prefix < 14 )
                 x264_cabac_encode_decision( cb, ctx, 0 );
-#endif
             if( i_prefix >= 14 )
                 x264_cabac_encode_ue_bypass( cb, 0, i_coeff_abs_m1[i_coeff] - 14 );
 
@@ -729,18 +723,110 @@ static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int i_ctxBl
         {
             x264_cabac_encode_decision( cb, ctx, 0 );
             node_ctx = coeff_abs_level_transition[0][node_ctx];
-#if RDO_SKIP_BS
-            x264_cabac_encode_bypass( cb, 0 ); // sign
-#endif
         }
 
-#if !RDO_SKIP_BS
         x264_cabac_encode_bypass( cb, i_coeff_sign[i_coeff] );
-#endif
     } while( i_coeff > 0 );
 }
 
 
+#define block_residual_write_cabac_8x8( h, cb, idx, l ) block_residual_write_cabac( h, cb, DCT_LUMA_8x8, idx, l, 64 ) /* !RDO_SKIP_BS: the 8x8 entry point just forwards to the generic writer */
+
+#else
+
+/* Faster RDO by merging sigmap and level coding.  Note that for 8x8dct
+ * this is slightly incorrect because the sigmap is not reversible
+ * (contexts are repeated).  However, there is nearly no quality penalty
+ * for this (~0.001db) and the speed boost (~30%) is worth it.  This variant is the #else branch of "#if !RDO_SKIP_BS", i.e. it is built only when RDO_SKIP_BS is nonzero; it handles the coefficient at i_last first and then walks the rest of l[] from i_last-1 down to 0. */
+static void ALWAYS_INLINE block_residual_write_cabac_internal( x264_t *h, x264_cabac_t *cb, int i_ctxBlockCat, int i_idx, int16_t *l, int i_count, int b_8x8 )
+{ /* b_8x8 != 0: index the contexts through the *_8x8 offset tables and skip the coded block flag */
+    const int i_ctx_sig = significant_coeff_flag_offset[h->mb.b_interlaced][i_ctxBlockCat];
+    const int i_ctx_last = last_coeff_flag_offset[h->mb.b_interlaced][i_ctxBlockCat];
+    const int i_ctx_level = coeff_abs_level_m1_offset[i_ctxBlockCat];
+    const uint8_t *significant_coeff_flag_offset = significant_coeff_flag_offset_8x8[h->mb.b_interlaced]; /* local pointer shadows the outer array of the same name that i_ctx_sig read above; only dereferenced when b_8x8 */
+    int i_last, i_coeff_abs_m1, ctx, i_prefix, i, node_ctx;
+
+    if( !b_8x8 )
+    {
+        /* coded block flag */
+        ctx = 85 + x264_cabac_mb_cbf_ctxidxinc( h, i_ctxBlockCat, i_idx );
+        if( h->mb.cache.non_zero_count[x264_scan8[i_idx]] )
+            x264_cabac_encode_decision( cb, ctx, 1 );
+        else
+        {
+            x264_cabac_encode_decision( cb, ctx, 0 );
+            return; /* cached non-zero count is 0: nothing beyond the flag is coded */
+        }
+    }
+
+    i_last = h->quantf.coeff_last[i_ctxBlockCat](l); /* presumably the index of the last nonzero coefficient in l[] -- TODO confirm against quantf.coeff_last */
+    i_coeff_abs_m1 = abs(l[i_last]) - 1;
+    i_prefix = X264_MIN( i_coeff_abs_m1, 14 );
+    ctx = coeff_abs_level1_ctx[0] + i_ctx_level; /* first coefficient handled uses node 0; node_ctx is assigned from node 0 below */
+    if( i_last != i_count - 1 ) /* sig/last flags are skipped when the last coefficient sits at the final index */
+    {
+        x264_cabac_encode_decision( cb, i_ctx_sig + (b_8x8?significant_coeff_flag_offset[i_last]:i_last), 1 );
+        x264_cabac_encode_decision( cb, i_ctx_last + (b_8x8?last_coeff_flag_offset_8x8[i_last]:i_last), 1 );
+    }
+
+    if( i_prefix )
+    {
+        x264_cabac_encode_decision( cb, ctx, 1 );
+        ctx = coeff_abs_levelgt1_ctx[0] + i_ctx_level;
+        cb->f8_bits_encoded += cabac_size_unary[i_prefix][cb->state[ctx]]; /* table lookup replaces per-bin coding; no separate sign bypass on this path, so the table presumably accounts for it -- TODO confirm */
+        cb->state[ctx] = cabac_transition_unary[i_prefix][cb->state[ctx]]; /* context state after the whole prefix, also taken from a table */
+        if( i_prefix >= 14 )
+            x264_cabac_encode_ue_bypass( cb, 0, i_coeff_abs_m1 - 14 );
+        node_ctx = coeff_abs_level_transition[1][0];
+    }
+    else
+    {
+        x264_cabac_encode_decision( cb, ctx, 0 );
+        node_ctx = coeff_abs_level_transition[0][0];
+        x264_cabac_encode_bypass( cb, 0 ); // sign
+    }
+
+    for( i = i_last-1 ; i >= 0; i-- ) /* remaining coefficients, highest index first */
+    {
+        if( l[i] )
+        {
+            x264_cabac_encode_decision( cb, i_ctx_sig + (b_8x8?significant_coeff_flag_offset[i]:i), 1 );
+            x264_cabac_encode_decision( cb, i_ctx_last + (b_8x8?last_coeff_flag_offset_8x8[i]:i), 0 );
+            ctx = coeff_abs_level1_ctx[node_ctx] + i_ctx_level;
+
+            if( (unsigned)(l[i]+1) > 2 ) /* true when l[i] is outside [-1,1]; l[i] is nonzero here, so this means |l[i]| > 1 */
+            {
+                i_coeff_abs_m1 = abs(l[i]) - 1;
+                i_prefix = X264_MIN( i_coeff_abs_m1, 14 );
+                x264_cabac_encode_decision( cb, ctx, 1 );
+                ctx = coeff_abs_levelgt1_ctx[node_ctx] + i_ctx_level;
+                cb->f8_bits_encoded += cabac_size_unary[i_prefix][cb->state[ctx]]; /* same table-driven prefix handling as for the coefficient at i_last */
+                cb->state[ctx] = cabac_transition_unary[i_prefix][cb->state[ctx]];
+                if( i_prefix >= 14 )
+                    x264_cabac_encode_ue_bypass( cb, 0, i_coeff_abs_m1 - 14 );
+                node_ctx = coeff_abs_level_transition[1][node_ctx];
+            }
+            else
+            {
+                x264_cabac_encode_decision( cb, ctx, 0 );
+                node_ctx = coeff_abs_level_transition[0][node_ctx];
+                x264_cabac_encode_bypass( cb, 0 ); /* sign, as in the i_last case above */
+            }
+        }
+        else
+            x264_cabac_encode_decision( cb, i_ctx_sig + (b_8x8?significant_coeff_flag_offset[i]:i), 0 ); /* zero coefficient: only the significance flag */
+    }
+}
+
+static void block_residual_write_cabac_8x8( x264_t *h, x264_cabac_t *cb, int i_idx, int16_t *l )
+{
+    block_residual_write_cabac_internal( h, cb, DCT_LUMA_8x8, i_idx, l, 64, 1 ); /* fixed category DCT_LUMA_8x8, 64 coefficients, b_8x8 = 1 */
+}
+static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int i_ctxBlockCat, int i_idx, int16_t *l, int i_count )
+{
+    block_residual_write_cabac_internal( h, cb, i_ctxBlockCat, i_idx, l, i_count, 0 ); /* b_8x8 = 0: coded block flag is handled and contexts are indexed directly by position */
+}
+#endif
 
 void x264_macroblock_write_cabac( x264_t *h, x264_cabac_t *cb )
 {
@@ -959,7 +1045,7 @@ void x264_macroblock_write_cabac( x264_t *h, x264_cabac_t *cb )
         {
             for( i = 0; i < 4; i++ )
                 if( h->mb.i_cbp_luma & ( 1 << i ) )
-                    block_residual_write_cabac( h, cb, DCT_LUMA_8x8, i, h->dct.luma8x8[i], 64 );
+                    block_residual_write_cabac_8x8( h, cb, i, h->dct.luma8x8[i] );
         }
         else
         {
@@ -1024,7 +1110,7 @@ static void x264_partition_size_cabac( x264_t *h, x264_cabac_t *cb, int i8, int
     if( h->mb.i_cbp_luma & (1 << i8) )
     {
         if( h->mb.b_transform_8x8 )
-            block_residual_write_cabac( h, cb, DCT_LUMA_8x8, i8, h->dct.luma8x8[i8], 64 );
+            block_residual_write_cabac_8x8( h, cb, i8, h->dct.luma8x8[i8] );
         else
         {
             int i4;
@@ -1063,7 +1149,7 @@ static void x264_partition_i8x8_size_cabac( x264_t *h, x264_cabac_t *cb, int i8,
     {
         *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4]] = 0x0101;
         *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4+2]] = 0x0101;
-        block_residual_write_cabac( h, cb, DCT_LUMA_8x8, 4*i8, h->dct.luma8x8[i8], 64 );
+        block_residual_write_cabac_8x8( h, cb, 4*i8, h->dct.luma8x8[i8] );
     }
     else
     {
-- 
2.40.0