From: David Conrad
Date: Fri, 21 Aug 2009 03:44:09 +0000 (-0700)
Subject: Fix unaligned accesses in bitstream writer
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=1a072a3a013976a178e0068be021e23b9a0ed59f;p=libx264

Fix unaligned accesses in bitstream writer

Fixes x264 on CPUs with no unaligned access support (e.g. SPARC).
Improves performance marginally on CPUs with penalties for unaligned stores (e.g. some x86).
---

diff --git a/common/bs.h b/common/bs.h
index 68b9f5e3..0c009921 100644
--- a/common/bs.h
+++ b/common/bs.h
@@ -73,21 +73,22 @@ extern vlc_large_t x264_level_token[7][LEVEL_TABLE_SIZE];
 
 static inline void bs_init( bs_t *s, void *p_data, int i_data )
 {
-    int offset = ((intptr_t)p_data & (WORD_SIZE-1));
+    int offset = ((intptr_t)p_data & 3);
     s->p       = s->p_start = (uint8_t*)p_data - offset;
     s->p_end   = (uint8_t*)p_data + i_data;
-    s->i_left  = offset ? 8*offset : (WORD_SIZE*8);
-    s->cur_bits = endian_fix( *(intptr_t*)s->p );
+    s->i_left  = (WORD_SIZE - offset)*8;
+    s->cur_bits = endian_fix32(*(uint32_t *)(s->p));
+    s->cur_bits >>= (4-offset)*8;
 }
 static inline int bs_pos( bs_t *s )
 {
     return( 8 * (s->p - s->p_start) + (WORD_SIZE*8) - s->i_left );
 }
 
-/* Write the rest of cur_bits to the bitstream; results in a bitstream no longer 32/64-bit aligned. */
+/* Write the rest of cur_bits to the bitstream; results in a bitstream no longer 32-bit aligned. */
 static inline void bs_flush( bs_t *s )
 {
-    *(intptr_t*)s->p = endian_fix( s->cur_bits << s->i_left );
+    *(uint32_t*)s->p = endian_fix32( s->cur_bits << (s->i_left&31) );
     s->p += WORD_SIZE - s->i_left / 8;
     s->i_left = WORD_SIZE*8;
 }
@@ -151,21 +152,12 @@ static inline void bs_write1( bs_t *s, uint32_t i_bit )
 
 static inline void bs_align_0( bs_t *s )
 {
-    if( s->i_left&7 )
-    {
-        s->cur_bits <<= s->i_left&7;
-        s->i_left &= ~7;
-    }
+    bs_write( s, s->i_left&7, 0 );
     bs_flush( s );
 }
 static inline void bs_align_1( bs_t *s )
 {
-    if( s->i_left&7 )
-    {
-        s->cur_bits <<= s->i_left&7;
-        s->cur_bits |= (1 << (s->i_left&7)) - 1;
-        s->i_left &= ~7;
-    }
+    bs_write( s, s->i_left&7, (1 << (s->i_left&7)) - 1 );
     bs_flush( s );
 }
 
@@ -245,7 +237,7 @@ static inline void bs_write_te( bs_t *s, int x, int val )
 static inline void bs_rbsp_trailing( bs_t *s )
 {
     bs_write1( s, 1 );
-    bs_flush( s );
+    bs_write( s, s->i_left&7, 0 );
 }
 
 static inline int bs_size_ue( unsigned int val )
diff --git a/encoder/cavlc.c b/encoder/cavlc.c
index 89bf07d7..0d88bfc6 100644
--- a/encoder/cavlc.c
+++ b/encoder/cavlc.c
@@ -298,6 +298,7 @@ void x264_macroblock_write_cavlc( x264_t *h, bs_t *s )
 #if !RDO_SKIP_BS
     if( i_mb_type == I_PCM )
     {
+        uint8_t *p_start = s->p_start;
         bs_write_ue( s, i_mb_i_offset + 25 );
         i_mb_pos_tex = bs_pos( s );
         h->stat.frame.i_mv_bits += i_mb_pos_tex - i_mb_pos_start;
@@ -313,6 +314,9 @@ void x264_macroblock_write_cavlc( x264_t *h, bs_t *s )
             memcpy( s->p + i*8, h->mb.pic.p_fenc[2] + i*FENC_STRIDE, 8 );
         s->p += 64;
 
+        bs_init( s, s->p, s->p_end - s->p );
+        s->p_start = p_start;
+
         /* if PCM is chosen, we need to store reconstructed frame data */
         h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[0], FDEC_STRIDE, h->mb.pic.p_fenc[0], FENC_STRIDE, 16 );
         h->mc.copy[PIXEL_8x8]  ( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE, 8 );
diff --git a/encoder/encoder.c b/encoder/encoder.c
index 3c95e860..caefa669 100644
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -981,6 +981,7 @@ int x264_encoder_headers( x264_t *h, x264_nal_t **pp_nal, int *pi_nal )
         x264_nal_start( h, NAL_PPS, NAL_PRIORITY_HIGHEST );
         x264_pps_write( &h->out.bs, h->pps );
         x264_nal_end( h );
+        bs_flush( &h->out.bs );
     }
     /* now set output*/
     *pi_nal = h->out.i_nal;
@@ -1374,6 +1375,7 @@ static int x264_slice_write( x264_t *h )
             bs_write_ue( &h->out.bs, i_skip ); /* last skip run */
         /* rbsp_slice_trailing_bits */
         bs_rbsp_trailing( &h->out.bs );
+        bs_flush( &h->out.bs );
     }
 
     x264_nal_end( h );