From: Fiona Glaser Date: Thu, 9 Apr 2009 09:14:41 +0000 (-0700) Subject: Various CABAC optimizations X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=2bcc39fd4cb14bb5d8776d2dc560ebdce4eaf20a;p=libx264 Various CABAC optimizations Move calculation of b_intra out of the core residual loop and hardcode it where applicable. Inlining cabac_mb_mvd was unnecessary and wasted tremendous amounts of code size. Inlining only cache_mvd is faster and significantly smaller. --- diff --git a/encoder/cabac.c b/encoder/cabac.c index 14b237bb..188e0488 100644 --- a/encoder/cabac.c +++ b/encoder/cabac.c @@ -444,7 +444,7 @@ static inline void x264_cabac_mb_mvd_cpn( x264_t *h, x264_cabac_t *cb, int i_lis } } -static inline void x264_cabac_mb_mvd( x264_t *h, x264_cabac_t *cb, int i_list, int idx, int width, int height ) +static NOINLINE uint32_t x264_cabac_mb_mvd( x264_t *h, x264_cabac_t *cb, int i_list, int idx, int width, int height ) { DECLARE_ALIGNED_4( int16_t mvp[2] ); int mdx, mdy; @@ -458,8 +458,13 @@ static inline void x264_cabac_mb_mvd( x264_t *h, x264_cabac_t *cb, int i_list, i x264_cabac_mb_mvd_cpn( h, cb, i_list, idx, 0, mdx ); x264_cabac_mb_mvd_cpn( h, cb, i_list, idx, 1, mdy ); - /* save value */ - x264_macroblock_cache_mvd( h, block_idx_x[idx], block_idx_y[idx], width, height, i_list, pack16to32_mask(mdx,mdy) ); + return pack16to32_mask(mdx,mdy); +} + +#define x264_cabac_mb_mvd(h,cb,i_list,idx,width,height)\ +{\ + uint32_t mvd = x264_cabac_mb_mvd(h,cb,i_list,idx,width,height);\ + x264_macroblock_cache_mvd( h, block_idx_x[idx], block_idx_y[idx], width, height, i_list, mvd );\ } static inline void x264_cabac_mb8x8_mvd( x264_t *h, x264_cabac_t *cb, int i_list, int i ) @@ -505,11 +510,10 @@ static inline void x264_cabac_mb8x8_mvd( x264_t *h, x264_cabac_t *cb, int i_list * 5-> Luma8x8 i_idx = luma8x8idx */ -static int ALWAYS_INLINE x264_cabac_mb_cbf_ctxidxinc( x264_t *h, int i_cat, int i_idx ) +static int ALWAYS_INLINE x264_cabac_mb_cbf_ctxidxinc( x264_t *h, int i_cat, int i_idx, int b_intra ) { int i_nza; int i_nzb; - int b_intra = IS_INTRA( h->mb.i_type ); switch( i_cat ) { @@ -672,7 +676,7 @@ static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int i_ctxBl x264_cabac_encode_bypass( cb, i_coeff_sign[i_coeff] ); } while( i_coeff > 0 ); } -#define block_residual_write_cabac_8x8( h, cb, idx, l ) block_residual_write_cabac( h, cb, DCT_LUMA_8x8, idx, l, 64 ) +#define block_residual_write_cabac_8x8( h, cb, idx, l ) block_residual_write_cabac( h, cb, DCT_LUMA_8x8, idx, l, 64, 0 ) #else @@ -784,9 +788,9 @@ static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int i_ctxBl } #endif -#define block_residual_write_cabac( h, cb, i_ctxBlockCat, i_idx, l, i_count ) \ +#define block_residual_write_cabac( h, cb, i_ctxBlockCat, i_idx, l, i_count, b_intra ) \ { \ - int ctxidxinc = x264_cabac_mb_cbf_ctxidxinc( h, i_ctxBlockCat, i_idx); \ + int ctxidxinc = x264_cabac_mb_cbf_ctxidxinc( h, i_ctxBlockCat, i_idx, b_intra ); \ block_residual_write_cabac( h, cb, i_ctxBlockCat, i_idx, l, i_count, ctxidxinc ); \ } @@ -990,18 +994,19 @@ void x264_macroblock_write_cabac( x264_t *h, x264_cabac_t *cb ) if( h->mb.i_cbp_luma > 0 || h->mb.i_cbp_chroma > 0 || i_mb_type == I_16x16 ) { + const int b_intra = IS_INTRA( i_mb_type ); x264_cabac_mb_qp_delta( h, cb ); /* write residual */ if( i_mb_type == I_16x16 ) { /* DC Luma */ - block_residual_write_cabac( h, cb, DCT_LUMA_DC, 24, h->dct.luma16x16_dc, 16 ); + block_residual_write_cabac( h, cb, DCT_LUMA_DC, 24, h->dct.luma16x16_dc, 16, 1 ); /* AC Luma */ if( h->mb.i_cbp_luma != 0 ) for( i = 0; i < 16; i++ ) - block_residual_write_cabac( h, cb, DCT_LUMA_AC, i, h->dct.luma4x4[i]+1, 15 ); + block_residual_write_cabac( h, cb, DCT_LUMA_AC, i, h->dct.luma4x4[i]+1, 15, 1 ); } else if( h->mb.b_transform_8x8 ) { @@ -1013,18 +1018,18 @@ void x264_macroblock_write_cabac( x264_t *h, x264_cabac_t *cb ) { for( i = 0; i < 16; i++ ) if( h->mb.i_cbp_luma & ( 1 << ( i / 4 ) ) ) - block_residual_write_cabac( h, cb, DCT_LUMA_4x4, i, h->dct.luma4x4[i], 16 ); + block_residual_write_cabac( h, cb, DCT_LUMA_4x4, i, h->dct.luma4x4[i], 16, b_intra ); } if( h->mb.i_cbp_chroma &0x03 ) /* Chroma DC residual present */ { - block_residual_write_cabac( h, cb, DCT_CHROMA_DC, 25, h->dct.chroma_dc[0], 4 ); - block_residual_write_cabac( h, cb, DCT_CHROMA_DC, 26, h->dct.chroma_dc[1], 4 ); + block_residual_write_cabac( h, cb, DCT_CHROMA_DC, 25, h->dct.chroma_dc[0], 4, b_intra ); + block_residual_write_cabac( h, cb, DCT_CHROMA_DC, 26, h->dct.chroma_dc[1], 4, b_intra ); } if( h->mb.i_cbp_chroma&0x02 ) /* Chroma AC residual present */ { for( i = 16; i < 24; i++ ) - block_residual_write_cabac( h, cb, DCT_CHROMA_AC, i, h->dct.luma4x4[i]+1, 15 ); + block_residual_write_cabac( h, cb, DCT_CHROMA_AC, i, h->dct.luma4x4[i]+1, 15, b_intra ); } } @@ -1050,7 +1055,9 @@ static void x264_partition_size_cabac( x264_t *h, x264_cabac_t *cb, int i8, int if( i_mb_type == P_8x8 ) x264_cabac_mb8x8_mvd( h, cb, 0, i8 ); else if( i_mb_type == P_L0 ) + { x264_cabac_mb_mvd( h, cb, 0, 4*i8, 4>>b_8x16, 2< B_DIRECT && i_mb_type < B_8x8 ) { if( x264_mb_type_list_table[ i_mb_type ][0][!!i8] ) x264_cabac_mb_mvd( h, cb, 0, 4*i8, 4>>b_8x16, 2<dct.luma4x4[i4+i8*4], 16 ); + block_residual_write_cabac( h, cb, DCT_LUMA_4x4, i4+i8*4, h->dct.luma4x4[i4+i8*4], 16, 0 ); } } - block_residual_write_cabac( h, cb, DCT_CHROMA_AC, 16+i8, h->dct.luma4x4[16+i8]+1, 15 ); - block_residual_write_cabac( h, cb, DCT_CHROMA_AC, 20+i8, h->dct.luma4x4[20+i8]+1, 15 ); + block_residual_write_cabac( h, cb, DCT_CHROMA_AC, 16+i8, h->dct.luma4x4[16+i8]+1, 15, 0 ); + block_residual_write_cabac( h, cb, DCT_CHROMA_AC, 20+i8, h->dct.luma4x4[20+i8]+1, 15, 0 ); i8 += x264_pixel_size[i_pixel].h >> 3; } @@ -1091,13 +1098,15 @@ static void x264_partition_size_cabac( x264_t *h, x264_cabac_t *cb, int i8, int static void x264_subpartition_size_cabac( x264_t *h, x264_cabac_t *cb, int i4, int i_pixel ) { int b_8x4 = i_pixel == PIXEL_8x4; - block_residual_write_cabac( h, cb, DCT_LUMA_4x4, i4, h->dct.luma4x4[i4], 16 ); + block_residual_write_cabac( h, cb, DCT_LUMA_4x4, i4, h->dct.luma4x4[i4], 16, 0 ); if( i_pixel == PIXEL_4x4 ) + { x264_cabac_mb_mvd( h, cb, 0, i4, 1, 1 ); + } else { x264_cabac_mb_mvd( h, cb, 0, i4, 1+b_8x4, 2-b_8x4 ); - block_residual_write_cabac( h, cb, DCT_LUMA_4x4, i4+2-b_8x4, h->dct.luma4x4[i4+2-b_8x4], 16 ); + block_residual_write_cabac( h, cb, DCT_LUMA_4x4, i4+2-b_8x4, h->dct.luma4x4[i4+2-b_8x4], 16, 0 ); } } @@ -1116,7 +1125,7 @@ static void x264_partition_i4x4_size_cabac( x264_t *h, x264_cabac_t *cb, int i4, const int i_pred = x264_mb_predict_intra4x4_mode( h, i4 ); i_mode = x264_mb_pred_mode4x4_fix( i_mode ); x264_cabac_mb_intra4x4_pred_mode( cb, i_pred, i_mode ); - block_residual_write_cabac( h, cb, DCT_LUMA_4x4, i4, h->dct.luma4x4[i4], 16 ); + block_residual_write_cabac( h, cb, DCT_LUMA_4x4, i4, h->dct.luma4x4[i4], 16, 1 ); } static void x264_i8x8_chroma_size_cabac( x264_t *h, x264_cabac_t *cb ) @@ -1125,14 +1134,14 @@ static void x264_i8x8_chroma_size_cabac( x264_t *h, x264_cabac_t *cb ) x264_cabac_mb_cbp_chroma( h, cb ); if( h->mb.i_cbp_chroma > 0 ) { - block_residual_write_cabac( h, cb, DCT_CHROMA_DC, 25, h->dct.chroma_dc[0], 4 ); - block_residual_write_cabac( h, cb, DCT_CHROMA_DC, 26, h->dct.chroma_dc[1], 4 ); + block_residual_write_cabac( h, cb, DCT_CHROMA_DC, 25, h->dct.chroma_dc[0], 4, 1 ); + block_residual_write_cabac( h, cb, DCT_CHROMA_DC, 26, h->dct.chroma_dc[1], 4, 1 ); if( h->mb.i_cbp_chroma == 2 ) { int i; for( i = 16; i < 24; i++ ) - block_residual_write_cabac( h, cb, DCT_CHROMA_AC, i, h->dct.luma4x4[i]+1, 15 ); + block_residual_write_cabac( h, cb, DCT_CHROMA_AC, i, h->dct.luma4x4[i]+1, 15, 1 ); } } }