From: Fiona Glaser Date: Sat, 5 Jul 2008 00:32:32 +0000 (-0600) Subject: faster bs_write X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=ab90da748df305101b720f932736dd6d7f990214;p=libx264 faster bs_write --- diff --git a/common/bs.h b/common/bs.h index 88d41f03..823471a7 100644 --- a/common/bs.h +++ b/common/bs.h @@ -1,7 +1,11 @@ /***************************************************************************** * bs.h : ***************************************************************************** - * Copyright (C) 2003 Laurent Aimar + * Copyright (C) 2003-2008 x264 project + * + * Authors: Loren Merritt + * Fiona Glaser + * Laurent Aimar * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -27,103 +31,125 @@ typedef struct bs_s uint8_t *p; uint8_t *p_end; + intptr_t cur_bits; int i_left; /* i_count number of available bits */ int i_bits_encoded; /* RD only */ } bs_t; static inline void bs_init( bs_t *s, void *p_data, int i_data ) { - s->p_start = p_data; - s->p = p_data; - s->p_end = s->p + i_data; - s->i_left = 8; + int offset = ((intptr_t)p_data & (WORD_SIZE-1)); + s->p = s->p_start = (uint8_t*)p_data - offset; + s->p_end = (uint8_t*)p_data + i_data; + s->i_left = offset ? 8*offset : (WORD_SIZE*8); + s->cur_bits = endian_fix( *(intptr_t*)s->p ); } static inline int bs_pos( bs_t *s ) { - return( 8 * ( s->p - s->p_start ) + 8 - s->i_left ); + return( 8 * (s->p - s->p_start) + (WORD_SIZE*8) - s->i_left ); +} + +/* Write the rest of cur_bits to the bitstream; results in a bitstream no longer 32/64-bit aligned. */ +static inline void bs_flush( bs_t *s ) +{ + *(intptr_t*)s->p = endian_fix( s->cur_bits << s->i_left ); + s->p += WORD_SIZE - s->i_left / 8; + s->i_left = WORD_SIZE*8; } static inline void bs_write( bs_t *s, int i_count, uint32_t i_bits ) { - while( i_count > 0 ) + if( WORD_SIZE == 8 ) + { + s->cur_bits = (s->cur_bits << i_count) | i_bits; + s->i_left -= i_count; + if( s->i_left <= 32 ) + { + *(uint32_t*)s->p = endian_fix( s->cur_bits << s->i_left ); + s->i_left += 32; + s->p += 4; + } + } + else { - if( i_count < 32 ) - i_bits &= (1<i_left ) { - *s->p = (*s->p << i_count) | i_bits; + s->cur_bits = (s->cur_bits << i_count) | i_bits; s->i_left -= i_count; - break; } else { - *s->p = (*s->p << s->i_left) | (i_bits >> (i_count - s->i_left)); i_count -= s->i_left; - s->p++; - s->i_left = 8; + s->cur_bits = (s->cur_bits << s->i_left) | (i_bits >> i_count); + *(uint32_t*)s->p = endian_fix( s->cur_bits ); + s->p += 4; + s->cur_bits = i_bits; + s->i_left = 32 - i_count; } } } +/* Special case to eliminate branch in normal bs_write. */ +/* Golomb never writes an even-size code, so this is only used in slice headers. */ +static inline void bs_write32( bs_t *s, uint32_t i_bits ) +{ + bs_write( s, 16, i_bits >> 16 ); + bs_write( s, 16, i_bits ); +} + static inline void bs_write1( bs_t *s, uint32_t i_bit ) { - *s->p <<= 1; - *s->p |= i_bit; + s->cur_bits <<= 1; + s->cur_bits |= i_bit; s->i_left--; - if( s->i_left == 0 ) + if( s->i_left == WORD_SIZE*8-32 ) { - s->p++; - s->i_left = 8; + *(uint32_t*)s->p = endian_fix32( s->cur_bits ); + s->p += 4; + s->i_left = WORD_SIZE*8; } } static inline void bs_align_0( bs_t *s ) { - if( s->i_left != 8 ) + if( s->i_left&7 ) { - *s->p <<= s->i_left; - s->i_left = 8; - s->p++; + s->cur_bits <<= s->i_left&7; + s->i_left &= ~7; } + bs_flush( s ); } static inline void bs_align_1( bs_t *s ) { - if( s->i_left != 8 ) + if( s->i_left&7 ) { - *s->p <<= s->i_left; - *s->p |= (1 << s->i_left) - 1; - s->i_left = 8; - s->p++; + s->cur_bits <<= s->i_left&7; + s->cur_bits |= (1 << (s->i_left&7)) - 1; + s->i_left &= ~7; } + bs_flush( s ); } -static inline void bs_align( bs_t *s ) -{ - bs_align_0( s ); -} - - /* golomb functions */ -static inline void bs_write_ue( bs_t *s, unsigned int val ) +static const uint8_t i_size0_255[256] = +{ + 1,1,2,2,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5, + 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6, + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, + 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, + 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, + 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, + 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8 +}; + +static inline void bs_write_ue_big( bs_t *s, unsigned int val ) { int i_size = 0; - static const uint8_t i_size0_255[256] = - { - 1,1,2,2,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5, - 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6, - 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, - 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, - 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, - 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, - 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, - 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8 - }; if( val == 0 ) - { bs_write1( s, 1 ); - } else { unsigned int tmp = ++val; @@ -144,89 +170,103 @@ static inline void bs_write_ue( bs_t *s, unsigned int val ) } } +/* Only works on values under 255. */ +static inline void bs_write_ue( bs_t *s, int val ) +{ + if( val == 0 ) + bs_write1( s, 1 ); + else + bs_write( s, 2 * i_size0_255[val+1] - 1, val+1 ); +} + static inline void bs_write_se( bs_t *s, int val ) { - bs_write_ue( s, val <= 0 ? -val * 2 : val * 2 - 1); + int i_size = 0; + val = val <= 0 ? -val * 2 : val * 2 - 1; + + if( val == 0 ) + bs_write1( s, 1 ); + else + { + unsigned int tmp = ++val; + + if( tmp >= 0x100 ) + { + i_size += 8; + tmp >>= 8; + } + i_size += i_size0_255[tmp]; + + bs_write( s, 2 * i_size - 1, val ); + } } static inline void bs_write_te( bs_t *s, int x, int val ) { if( x == 1 ) - { - bs_write1( s, 1&~val ); - } + bs_write1( s, 1^val ); else if( x > 1 ) - { bs_write_ue( s, val ); - } } static inline void bs_rbsp_trailing( bs_t *s ) { bs_write1( s, 1 ); - if( s->i_left != 8 ) - { - bs_write( s, s->i_left, 0x00 ); - } + bs_flush( s ); } +static const uint8_t i_size0_254[255] = +{ + 1, 3, 3, 5, 5, 5, 5, 7, 7, 7, 7, 7, 7, 7, 7, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11, + 11,11,11,11,11,11,11,11,11,13,13,13,13,13,13,13,13,13,13,13,13,13,13, + 13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13, + 13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13, + 13,13,13,13,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15, + 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15, + 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15, + 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15, + 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15, + 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15 +}; + static inline int bs_size_ue( unsigned int val ) { - static const uint8_t i_size0_254[255] = - { - 1, 3, 3, 5, 5, 5, 5, 7, 7, 7, 7, 7, 7, 7, 7, - 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11, - 11,11,11,11,11,11,11,11,11,13,13,13,13,13,13,13,13,13,13,13,13,13,13, - 13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13, - 13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13, - 13,13,13,13,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15, - 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15, - 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15, - 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15, - 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15, - 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15 - }; + return i_size0_254[val]; +} +static inline int bs_size_ue_big( unsigned int val ) +{ if( val < 255 ) - { return i_size0_254[val]; - } else { - int i_size = 0; - val++; - - if( val >= 0x10000 ) - { - i_size += 32; - val = (val >> 16) - 1; - } - if( val >= 0x100 ) - { - i_size += 16; - val = (val >> 8) - 1; - } - return i_size0_254[val] + i_size; + val = (val >> 8) - 1; + return i_size0_254[val] + 16; } } static inline int bs_size_se( int val ) { - return bs_size_ue( val <= 0 ? -val * 2 : val * 2 - 1); + val = val <= 0 ? -val * 2 : val * 2 - 1; + if( val < 255 ) + return i_size0_254[val]; + else + { + val++; + val = (val >> 8) - 1; + return i_size0_254[val] + 16; + } } static inline int bs_size_te( int x, int val ) { if( x == 1 ) - { return 1; - } else if( x > 1 ) - { - return bs_size_ue( val ); - } + return i_size0_254[val]; return 0; } diff --git a/common/osdep.h b/common/osdep.h index 7b8dacb2..960e7db0 100644 --- a/common/osdep.h +++ b/common/osdep.h @@ -133,8 +133,7 @@ #define x264_pthread_cond_wait(c,m) #endif -/* FIXME: long isn't always the native register size (e.g. win64). */ -#define WORD_SIZE sizeof(long) +#define WORD_SIZE sizeof(void*) #if !defined(_WIN64) && !defined(__LP64__) #if defined(_MSC_VER) || defined(__INTEL_COMPILER) @@ -142,4 +141,31 @@ #endif #endif +#ifdef WORDS_BIGENDIAN +#define endian_fix(x) (x) +#elif defined(__GNUC__) && defined(HAVE_MMX) +static ALWAYS_INLINE uint32_t endian_fix32( uint32_t x ) +{ + asm("bswap %0":"+r"(x)); + return x; +} +static ALWAYS_INLINE intptr_t endian_fix( intptr_t x ) +{ + asm("bswap %0":"+r"(x)); + return x; +} +#else +static ALWAYS_INLINE uint32_t endian_fix32( uint32_t x ) +{ + return (x<<24) + ((x<<8)&0xff0000) + ((x>>8)&0xff00) + (x>>24); +} +static ALWAYS_INLINE intptr_t endian_fix( intptr_t x ) +{ + if( WORD_SIZE == 8 ) + return endian_fix32(x>>32) + ((uint64_t)endian_fix32(x)<<32); + else + return endian_fix32(x); +} +#endif + #endif /* X264_OSDEP_H */ diff --git a/encoder/cavlc.c b/encoder/cavlc.c index ffe16ada..057efdd2 100644 --- a/encoder/cavlc.c +++ b/encoder/cavlc.c @@ -147,19 +147,17 @@ static void block_residual_write_cavlc( x264_t *h, bs_t *s, int i_idx, int16_t * if( ( i_level_code >> i_suffix_length ) < 14 ) { - bs_write( s, (i_level_code >> i_suffix_length) + 1, 1 ); - if( i_suffix_length > 0 ) - bs_write( s, i_suffix_length, i_level_code ); + bs_write( s, (i_level_code >> i_suffix_length) + 1 + i_suffix_length, + (1< 0 && ( i_level_code >> i_suffix_length ) == 14 ) { - bs_write( s, 15, 1 ); - bs_write( s, i_suffix_length, i_level_code ); + bs_write( s, 15 + i_suffix_length, + (1<= i_pred ) + i_mode--; + bs_write( s, 4, i_mode ); } } bs_write_ue( s, x264_mb_pred_mode8x8c_fix[ h->mb.i_chroma_pred_mode ] ); diff --git a/encoder/encoder.c b/encoder/encoder.c index e87105d3..327776e7 100644 --- a/encoder/encoder.c +++ b/encoder/encoder.c @@ -39,6 +39,8 @@ #define NALU_OVERHEAD 5 // startcode + NAL type costs 5 bytes per frame +#define bs_write_ue bs_write_ue_big + static void x264_encoder_frame_end( x264_t *h, x264_t *thread_current, x264_nal_t **pp_nal, int *pi_nal, x264_picture_t *pic_out ); diff --git a/encoder/rdo.c b/encoder/rdo.c index edc50766..76bf57be 100644 --- a/encoder/rdo.c +++ b/encoder/rdo.c @@ -43,7 +43,7 @@ static uint16_t cabac_prefix_size[15][128]; #define x264_cabac_encode_decision(c,x,v) x264_cabac_size_decision(c,x,v) #define x264_cabac_encode_terminal(c) x264_cabac_size_decision(c,276,0) #define x264_cabac_encode_bypass(c,v) ((c)->f8_bits_encoded += 256) -#define x264_cabac_encode_ue_bypass(c,e,v) ((c)->f8_bits_encoded += (bs_size_ue(v+(1<f8_bits_encoded += (bs_size_ue_big(v+(1<vui.b_timing_info_present ); if( sps->vui.b_timing_info_present ) { - bs_write( s, 32, sps->vui.i_num_units_in_tick ); - bs_write( s, 32, sps->vui.i_time_scale ); + bs_write32( s, sps->vui.i_num_units_in_tick ); + bs_write32( s, sps->vui.i_time_scale ); bs_write1( s, sps->vui.b_fixed_frame_rate ); }