From c656d68ff441d2925afcb40ccfaf49279fd95656 Mon Sep 17 00:00:00 2001 From: Fiona Glaser Date: Sat, 7 Feb 2009 02:27:16 -0800 Subject: [PATCH] Much faster CABAC residual context selection Up to ~17% faster CABAC RDO, ~36% faster intra-only CABAC RDO. Up to 7% faster overall in extreme cases. --- encoder/cabac.c | 121 ++++++++++++++++-------------------------------- 1 file changed, 39 insertions(+), 82 deletions(-) diff --git a/encoder/cabac.c b/encoder/cabac.c index acdd4319..fb1d1227 100644 --- a/encoder/cabac.c +++ b/encoder/cabac.c @@ -505,92 +505,43 @@ static inline void x264_cabac_mb8x8_mvd( x264_t *h, x264_cabac_t *cb, int i_list } } -static int x264_cabac_mb_cbf_ctxidxinc( x264_t *h, int i_cat, int i_idx ) +/* i_ctxBlockCat: 0-> DC 16x16 i_idx = 0 + * 1-> AC 16x16 i_idx = luma4x4idx + * 2-> Luma4x4 i_idx = luma4x4idx + * 3-> DC Chroma i_idx = iCbCr + * 4-> AC Chroma i_idx = 4 * iCbCr + chroma4x4idx + * 5-> Luma8x8 i_idx = luma8x8idx + */ + +static int ALWAYS_INLINE x264_cabac_mb_cbf_ctxidxinc( x264_t *h, int i_cat, int i_idx ) { - /* i_ctxBlockCat: 0-> DC 16x16 i_idx = 0 - * 1-> AC 16x16 i_idx = luma4x4idx - * 2-> Luma4x4 i_idx = luma4x4idx - * 3-> DC Chroma i_idx = iCbCr - * 4-> AC Chroma i_idx = 4 * iCbCr + chroma4x4idx - * 5-> Luma8x8 i_idx = luma8x8idx - */ - - int i_mba_xy = -1; - int i_mbb_xy = -1; - int i_nza = 0; - int i_nzb = 0; + int i_nza; + int i_nzb; + int b_intra = IS_INTRA( h->mb.i_type ); switch( i_cat ) { - case DCT_LUMA_DC: - if( h->mb.i_neighbour & MB_LEFT ) - { - i_mba_xy = h->mb.i_mb_xy - 1; - i_nza = h->mb.cbp[i_mba_xy] & 0x100; - } - if( h->mb.i_neighbour & MB_TOP ) - { - i_mbb_xy = h->mb.i_mb_top_xy; - i_nzb = h->mb.cbp[i_mbb_xy] & 0x100; - } - break; case DCT_LUMA_AC: case DCT_LUMA_4x4: - if( i_idx & ~10 ) // block_idx_x > 0 - i_mba_xy = h->mb.i_mb_xy; - else if( h->mb.i_neighbour & MB_LEFT ) - i_mba_xy = h->mb.i_mb_xy - 1; - - if( i_idx & ~5 ) // block_idx_y > 0 - i_mbb_xy = h->mb.i_mb_xy; - else if( h->mb.i_neighbour & MB_TOP ) - i_mbb_xy = h->mb.i_mb_top_xy; - + case DCT_CHROMA_AC: /* no need to test for skip/pcm */ - if( i_mba_xy >= 0 ) - i_nza = h->mb.cache.non_zero_count[x264_scan8[i_idx] - 1]; - if( i_mbb_xy >= 0 ) - i_nzb = h->mb.cache.non_zero_count[x264_scan8[i_idx] - 8]; - break; + i_nza = h->mb.cache.non_zero_count[x264_scan8[i_idx] - 1]; + i_nzb = h->mb.cache.non_zero_count[x264_scan8[i_idx] - 8]; + i_nza &= 0x7f + (b_intra << 7); + i_nzb &= 0x7f + (b_intra << 7); + return 4*i_cat + 2*!!i_nzb + !!i_nza; + case DCT_LUMA_DC: + /* Note: this depends on the exact values of MB_LEFT and MB_TOP enums */ + i_nza = ((h->mb.cbp[h->mb.i_mb_xy - 1] >> 8) | ~h->mb.i_neighbour) & 1; + i_nzb = ((h->mb.cbp[h->mb.i_mb_top_xy] >> 7) | ~h->mb.i_neighbour) & 2; + return 4*i_cat + i_nzb + i_nza; case DCT_CHROMA_DC: /* no need to test skip/pcm */ i_idx -= 25; - if( h->mb.i_neighbour & MB_LEFT ) - { - i_mba_xy = h->mb.i_mb_xy - 1; - i_nza = h->mb.cbp[i_mba_xy] & (0x200 << i_idx); - } - if( h->mb.i_neighbour & MB_TOP ) - { - i_mbb_xy = h->mb.i_mb_top_xy; - i_nzb = h->mb.cbp[i_mbb_xy] & (0x200 << i_idx); - } - break; - case DCT_CHROMA_AC: - if( i_idx & 1 ) - i_mba_xy = h->mb.i_mb_xy; - else if( h->mb.i_neighbour & MB_LEFT ) - i_mba_xy = h->mb.i_mb_xy - 1; - - if( i_idx & 2 ) - i_mbb_xy = h->mb.i_mb_xy; - else if( h->mb.i_neighbour & MB_TOP ) - i_mbb_xy = h->mb.i_mb_top_xy; - - /* no need to test skip/pcm */ - if( i_mba_xy >= 0 ) - i_nza = h->mb.cache.non_zero_count[x264_scan8[i_idx] - 1]; - if( i_mbb_xy >= 0 ) - i_nzb = h->mb.cache.non_zero_count[x264_scan8[i_idx] - 8]; + i_nza = h->mb.i_neighbour & MB_LEFT ? (h->mb.cbp[h->mb.i_mb_xy - 1] >> (9 + i_idx)) & 1 : b_intra; + i_nzb = h->mb.i_neighbour & MB_TOP ? (h->mb.cbp[h->mb.i_mb_top_xy] >> (9 + i_idx)) & 1 : b_intra; + return 4*i_cat + 2*i_nzb + i_nza; } - - if( IS_INTRA( h->mb.i_type ) ) - { - i_nza |= i_mba_xy < 0; - i_nzb |= i_mbb_xy < 0; - } - - return 4*i_cat + 2*!!i_nzb + !!i_nza; } @@ -637,7 +588,7 @@ static const uint8_t coeff_abs_level_transition[2][8] = { }; #if !RDO_SKIP_BS -static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int i_ctxBlockCat, int i_idx, int16_t *l, int i_count ) +static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int i_ctxBlockCat, int i_idx, int16_t *l, int i_count, int ctxidxinc ) { const int i_ctx_sig = significant_coeff_flag_offset[h->mb.b_interlaced][i_ctxBlockCat]; const int i_ctx_last = last_coeff_flag_offset[h->mb.b_interlaced][i_ctxBlockCat]; @@ -655,7 +606,7 @@ static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int i_ctxBl if( i_count != 64 ) { /* coded block flag */ - int ctx = 85 + x264_cabac_mb_cbf_ctxidxinc( h, i_ctxBlockCat, i_idx ); + int ctx = 85 + ctxidxinc; if( h->mb.cache.non_zero_count[x264_scan8[i_idx]] ) x264_cabac_encode_decision( cb, ctx, 1 ); else @@ -736,7 +687,7 @@ static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int i_ctxBl * this is slightly incorrect because the sigmap is not reversible * (contexts are repeated). However, there is nearly no quality penalty * for this (~0.001db) and the speed boost (~30%) is worth it. */ -static void ALWAYS_INLINE block_residual_write_cabac_internal( x264_t *h, x264_cabac_t *cb, int i_ctxBlockCat, int i_idx, int16_t *l, int i_count, int b_8x8 ) +static void ALWAYS_INLINE block_residual_write_cabac_internal( x264_t *h, x264_cabac_t *cb, int i_ctxBlockCat, int i_idx, int16_t *l, int i_count, int ctxidxinc, int b_8x8 ) { const int i_ctx_sig = significant_coeff_flag_offset[h->mb.b_interlaced][i_ctxBlockCat]; const int i_ctx_last = last_coeff_flag_offset[h->mb.b_interlaced][i_ctxBlockCat]; @@ -747,7 +698,7 @@ static void ALWAYS_INLINE block_residual_write_cabac_internal( x264_t *h, x264_c if( !b_8x8 ) { /* coded block flag */ - ctx = 85 + x264_cabac_mb_cbf_ctxidxinc( h, i_ctxBlockCat, i_idx ); + ctx = 85 + ctxidxinc; if( h->mb.cache.non_zero_count[x264_scan8[i_idx]] ) x264_cabac_encode_decision( cb, ctx, 1 ); else @@ -820,14 +771,20 @@ static void ALWAYS_INLINE block_residual_write_cabac_internal( x264_t *h, x264_c static void block_residual_write_cabac_8x8( x264_t *h, x264_cabac_t *cb, int i_idx, int16_t *l ) { - block_residual_write_cabac_internal( h, cb, DCT_LUMA_8x8, i_idx, l, 64, 1 ); + block_residual_write_cabac_internal( h, cb, DCT_LUMA_8x8, i_idx, l, 64, 0, 1 ); } -static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int i_ctxBlockCat, int i_idx, int16_t *l, int i_count ) +static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int i_ctxBlockCat, int i_idx, int16_t *l, int i_count, int ctxidxinc ) { - block_residual_write_cabac_internal( h, cb, i_ctxBlockCat, i_idx, l, i_count, 0 ); + block_residual_write_cabac_internal( h, cb, i_ctxBlockCat, i_idx, l, i_count, ctxidxinc, 0 ); } #endif +#define block_residual_write_cabac( h, cb, i_ctxBlockCat, i_idx, l, i_count ) \ +{ \ + int ctxidxinc = x264_cabac_mb_cbf_ctxidxinc( h, i_ctxBlockCat, i_idx); \ + block_residual_write_cabac( h, cb, i_ctxBlockCat, i_idx, l, i_count, ctxidxinc ); \ +} + void x264_macroblock_write_cabac( x264_t *h, x264_cabac_t *cb ) { const int i_mb_type = h->mb.i_type; -- 2.40.0