granicus.if.org Git - libx264/commitdiff
faster bs_write
author Fiona Glaser <fiona@x264.com>
Sat, 5 Jul 2008 00:32:32 +0000 (18:32 -0600)
committer Loren Merritt <pengvado@akuvian.org>
Thu, 10 Jul 2008 13:36:23 +0000 (07:36 -0600)
common/bs.h
common/osdep.h
encoder/cavlc.c
encoder/encoder.c
encoder/rdo.c
encoder/set.c

index 88d41f03ba7ab2eb66ac9e4e6fc1a5285fa9ba7c..823471a731abb60c002525b80bb75709f72d5a69 100644 (file)
@@ -1,7 +1,11 @@
 /*****************************************************************************
  * bs.h :
  *****************************************************************************
- * Copyright (C) 2003 Laurent Aimar <fenrir@via.ecp.fr>
+ * Copyright (C) 2003-2008 x264 project
+ *
+ * Authors: Loren Merritt <lorenm@u.washington.edu>
+ *          Fiona Glaser <fiona@x264.com>
+ *          Laurent Aimar <fenrir@via.ecp.fr>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -27,103 +31,125 @@ typedef struct bs_s
     uint8_t *p;
     uint8_t *p_end;
 
+    intptr_t cur_bits;
     int     i_left;    /* i_count number of available bits */
     int     i_bits_encoded; /* RD only */
 } bs_t;
 
 static inline void bs_init( bs_t *s, void *p_data, int i_data )
 {
-    s->p_start = p_data;
-    s->p       = p_data;
-    s->p_end   = s->p + i_data;
-    s->i_left  = 8;
+    int offset = ((intptr_t)p_data & (WORD_SIZE-1));
+    s->p       = s->p_start = (uint8_t*)p_data - offset;
+    s->p_end   = (uint8_t*)p_data + i_data;
+    s->i_left  = offset ? 8*offset : (WORD_SIZE*8);
+    s->cur_bits = endian_fix( *(intptr_t*)s->p );
 }
 static inline int bs_pos( bs_t *s )
 {
-    return( 8 * ( s->p - s->p_start ) + 8 - s->i_left );
+    return( 8 * (s->p - s->p_start) + (WORD_SIZE*8) - s->i_left );
+}
+
+/* Write the rest of cur_bits to the bitstream; results in a bitstream no longer 32/64-bit aligned. */
+static inline void bs_flush( bs_t *s )
+{
+    *(intptr_t*)s->p = endian_fix( s->cur_bits << s->i_left );
+    s->p += WORD_SIZE - s->i_left / 8;
+    s->i_left = WORD_SIZE*8;
 }
 
 static inline void bs_write( bs_t *s, int i_count, uint32_t i_bits )
 {
-    while( i_count > 0 )
+    if( WORD_SIZE == 8 )
+    {
+        s->cur_bits = (s->cur_bits << i_count) | i_bits;
+        s->i_left -= i_count;
+        if( s->i_left <= 32 )
+        {
+            *(uint32_t*)s->p = endian_fix( s->cur_bits << s->i_left );
+            s->i_left += 32;
+            s->p += 4;
+        }
+    }
+    else
     {
-        if( i_count < 32 )
-            i_bits &= (1<<i_count)-1;
         if( i_count < s->i_left )
         {
-            *s->p = (*s->p << i_count) | i_bits;
+            s->cur_bits = (s->cur_bits << i_count) | i_bits;
             s->i_left -= i_count;
-            break;
         }
         else
         {
-            *s->p = (*s->p << s->i_left) | (i_bits >> (i_count - s->i_left));
             i_count -= s->i_left;
-            s->p++;
-            s->i_left = 8;
+            s->cur_bits = (s->cur_bits << s->i_left) | (i_bits >> i_count);
+            *(uint32_t*)s->p = endian_fix( s->cur_bits );
+            s->p += 4;
+            s->cur_bits = i_bits;
+            s->i_left = 32 - i_count;
         }
     }
 }
 
+/* Special case to eliminate branch in normal bs_write: a 32-bit value is written as two 16-bit halves. */
+/* Golomb never writes an even-size code, so this is only needed for fixed 32-bit header fields (e.g. the SPS VUI timing info). */
+static inline void bs_write32( bs_t *s, uint32_t i_bits )
+{
+    bs_write( s, 16, i_bits >> 16 );
+    bs_write( s, 16, i_bits );
+}
+
 static inline void bs_write1( bs_t *s, uint32_t i_bit )
 {
-    *s->p <<= 1;
-    *s->p |= i_bit;
+    s->cur_bits <<= 1;
+    s->cur_bits |= i_bit;
     s->i_left--;
-    if( s->i_left == 0 )
+    if( s->i_left == WORD_SIZE*8-32 )
     {
-        s->p++;
-        s->i_left = 8;
+        *(uint32_t*)s->p = endian_fix32( s->cur_bits );
+        s->p += 4;
+        s->i_left = WORD_SIZE*8;
     }
 }
 
 static inline void bs_align_0( bs_t *s )
 {
-    if( s->i_left != 8 )
+    if( s->i_left&7 )
     {
-        *s->p <<= s->i_left;
-        s->i_left = 8;
-        s->p++;
+        s->cur_bits <<= s->i_left&7;
+        s->i_left &= ~7;
     }
+    bs_flush( s );
 }
 static inline void bs_align_1( bs_t *s )
 {
-    if( s->i_left != 8 )
+    if( s->i_left&7 )
     {
-        *s->p <<= s->i_left;
-        *s->p |= (1 << s->i_left) - 1;
-        s->i_left = 8;
-        s->p++;
+        s->cur_bits <<= s->i_left&7;
+        s->cur_bits |= (1 << (s->i_left&7)) - 1;
+        s->i_left &= ~7;
     }
+    bs_flush( s );
 }
-static inline void bs_align( bs_t *s )
-{
-    bs_align_0( s );
-}
-
-
 
 /* golomb functions */
 
-static inline void bs_write_ue( bs_t *s, unsigned int val )
+static const uint8_t i_size0_255[256] =
+{
+    1,1,2,2,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,
+    6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
+    7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+    7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+    8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
+    8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
+    8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
+    8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8
+};
+
+static inline void bs_write_ue_big( bs_t *s, unsigned int val )
 {
     int i_size = 0;
-    static const uint8_t i_size0_255[256] =
-    {
-        1,1,2,2,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,
-        6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
-        7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
-        7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
-        8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
-        8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
-        8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
-        8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8
-    };
 
     if( val == 0 )
-    {
         bs_write1( s, 1 );
-    }
     else
     {
         unsigned int tmp = ++val;
@@ -144,89 +170,103 @@ static inline void bs_write_ue( bs_t *s, unsigned int val )
     }
 }
 
+/* Only works on values under 255 (0..254): val+1 is used as an index into the 256-entry i_size0_255 table. */
+static inline void bs_write_ue( bs_t *s, int val )
+{
+    if( val == 0 )
+        bs_write1( s, 1 );
+    else
+        bs_write( s, 2 * i_size0_255[val+1] - 1, val+1 );
+}
+
 static inline void bs_write_se( bs_t *s, int val )
 {
-    bs_write_ue( s, val <= 0 ? -val * 2 : val * 2 - 1);
+    int i_size = 0;
+    val = val <= 0 ? -val * 2 : val * 2 - 1;
+
+    if( val == 0 )
+        bs_write1( s, 1 );
+    else
+    {
+        unsigned int tmp = ++val;
+
+        if( tmp >= 0x100 )
+        {
+            i_size += 8;
+            tmp >>= 8;
+        }
+        i_size += i_size0_255[tmp];
+
+        bs_write( s, 2 * i_size - 1, val );
+    }
 }
 
 static inline void bs_write_te( bs_t *s, int x, int val )
 {
     if( x == 1 )
-    {
-        bs_write1( s, 1&~val );
-    }
+        bs_write1( s, 1^val );
     else if( x > 1 )
-    {
         bs_write_ue( s, val );
-    }
 }
 
 static inline void bs_rbsp_trailing( bs_t *s )
 {
     bs_write1( s, 1 );
-    if( s->i_left != 8 )
-    {
-        bs_write( s, s->i_left, 0x00 );
-    }
+    bs_flush( s );
 }
 
+static const uint8_t i_size0_254[255] =
+{
+    1, 3, 3, 5, 5, 5, 5, 7, 7, 7, 7, 7, 7, 7, 7,
+    9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+    11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,
+    11,11,11,11,11,11,11,11,11,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
+    13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
+    13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
+    13,13,13,13,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
+    15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
+    15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
+    15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
+    15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
+    15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15
+};
+
 static inline int bs_size_ue( unsigned int val )
 {
-    static const uint8_t i_size0_254[255] =
-    {
-        1, 3, 3, 5, 5, 5, 5, 7, 7, 7, 7, 7, 7, 7, 7,
-        9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-        11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,
-        11,11,11,11,11,11,11,11,11,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
-        13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
-        13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
-        13,13,13,13,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
-        15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
-        15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
-        15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
-        15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
-        15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15
-    };
+    return i_size0_254[val];
+}
 
+static inline int bs_size_ue_big( unsigned int val )
+{
     if( val < 255 )
-    {
         return i_size0_254[val];
-    }
     else
     {
-        int i_size = 0;
-
         val++;
-
-        if( val >= 0x10000 )
-        {
-            i_size += 32;
-            val = (val >> 16) - 1;
-        }
-        if( val >= 0x100 )
-        {
-            i_size += 16;
-            val = (val >> 8) - 1;
-        }
-        return i_size0_254[val] + i_size;
+        val = (val >> 8) - 1;
+        return i_size0_254[val] + 16;
     }
 }
 
 static inline int bs_size_se( int val )
 {
-    return bs_size_ue( val <= 0 ? -val * 2 : val * 2 - 1);
+    val = val <= 0 ? -val * 2 : val * 2 - 1;
+    if( val < 255 )
+        return i_size0_254[val];
+    else
+    {
+        val++;
+        val = (val >> 8) - 1;
+        return i_size0_254[val] + 16;
+    }
 }
 
 static inline int bs_size_te( int x, int val )
 {
     if( x == 1 )
-    {
         return 1;
-    }
     else if( x > 1 )
-    {
-        return bs_size_ue( val );
-    }
+        return i_size0_254[val];
     return 0;
 }
 
index 7b8dacb211a896cceafbc4ba0a147b76a363a25b..960e7db0a07aee33ba44d1bbc75fd54ed6193787 100644 (file)
 #define x264_pthread_cond_wait(c,m)
 #endif
 
-/* FIXME: long isn't always the native register size (e.g. win64). */
-#define WORD_SIZE sizeof(long)
+#define WORD_SIZE sizeof(void*)
 
 #if !defined(_WIN64) && !defined(__LP64__)
 #if defined(_MSC_VER) || defined(__INTEL_COMPILER)
 #endif
 #endif
 
+#ifdef WORDS_BIGENDIAN
+#define endian_fix(x) (x)
+#elif defined(__GNUC__) && defined(HAVE_MMX)
+static ALWAYS_INLINE uint32_t endian_fix32( uint32_t x )
+{
+    asm("bswap %0":"+r"(x));
+    return x;
+}
+static ALWAYS_INLINE intptr_t endian_fix( intptr_t x )
+{
+    asm("bswap %0":"+r"(x));
+    return x;
+}
+#else
+static ALWAYS_INLINE uint32_t endian_fix32( uint32_t x )
+{
+    return (x<<24) + ((x<<8)&0xff0000) + ((x>>8)&0xff00) + (x>>24);
+}
+static ALWAYS_INLINE intptr_t endian_fix( intptr_t x )
+{
+    if( WORD_SIZE == 8 )
+        return endian_fix32(x>>32) + ((uint64_t)endian_fix32(x)<<32);
+    else
+        return endian_fix32(x);
+}
+#endif
+
 #endif /* X264_OSDEP_H */
index ffe16adab8e7d8726543ab8b0eb203f27dc8f2b4..057efdd287d2aafa26133c42b033fd91c76c7416 100644 (file)
@@ -147,19 +147,17 @@ static void block_residual_write_cavlc( x264_t *h, bs_t *s, int i_idx, int16_t *
 
         if( ( i_level_code >> i_suffix_length ) < 14 )
         {
-            bs_write( s, (i_level_code >> i_suffix_length) + 1, 1 );
-            if( i_suffix_length > 0 )
-                bs_write( s, i_suffix_length, i_level_code );
+            bs_write( s, (i_level_code >> i_suffix_length) + 1 + i_suffix_length,
+                     (1<<i_suffix_length) + (i_level_code & ((1<<i_suffix_length)-1)) );
         }
         else if( i_suffix_length == 0 && i_level_code < 30 )
         {
-            bs_write( s, 15, 1 );
-            bs_write( s, 4, i_level_code - 14 );
+            bs_write( s, 19, (1<<4) + (i_level_code - 14) );
         }
         else if( i_suffix_length > 0 && ( i_level_code >> i_suffix_length ) == 14 )
         {
-            bs_write( s, 15, 1 );
-            bs_write( s, i_suffix_length, i_level_code );
+            bs_write( s, 15 + i_suffix_length,
+                      (1<<i_suffix_length) + (i_level_code & ((1<<i_suffix_length)-1)) );
         }
         else
         {
@@ -192,7 +190,7 @@ static void block_residual_write_cavlc( x264_t *h, bs_t *s, int i_idx, int16_t *
                 }
             }
             bs_write( s, i_level_prefix + 1, 1 );
-            bs_write( s, i_level_prefix - 3, i_level_code );
+            bs_write( s, i_level_prefix - 3, i_level_code & ((1<<(i_level_prefix-3))-1) );
         }
 
         if( i_suffix_length == 0 )
@@ -398,15 +396,9 @@ void x264_macroblock_write_cavlc( x264_t *h, bs_t *s )
             }
             else
             {
-                bs_write1( s, 0 );  /* b_prev_intra4x4_pred_mode */
-                if( i_mode < i_pred )
-                {
-                    bs_write( s, 3, i_mode );
-                }
-                else
-                {
-                    bs_write( s, 3, i_mode - 1 );
-                }
+                if( i_mode >= i_pred )
+                    i_mode--;
+                bs_write( s, 4, i_mode );
             }
         }
         bs_write_ue( s, x264_mb_pred_mode8x8c_fix[ h->mb.i_chroma_pred_mode ] );
index e87105d3d3ac246b0cf9801cb93dbe97ca3081e8..327776e7259ecf6d559cc2b44b293e3e6ad714b1 100644 (file)
@@ -39,6 +39,8 @@
 
 #define NALU_OVERHEAD 5 // startcode + NAL type costs 5 bytes per frame
 
+#define bs_write_ue bs_write_ue_big
+
 static void x264_encoder_frame_end( x264_t *h, x264_t *thread_current,
                                     x264_nal_t **pp_nal, int *pi_nal,
                                     x264_picture_t *pic_out );
index edc50766eecba987d6c3bbb3b7375153761d6be6..76bf57bed985fe9ff8b3c5796cba67c4eca6c008 100644 (file)
@@ -43,7 +43,7 @@ static uint16_t cabac_prefix_size[15][128];
 #define x264_cabac_encode_decision(c,x,v) x264_cabac_size_decision(c,x,v)
 #define x264_cabac_encode_terminal(c)     x264_cabac_size_decision(c,276,0)
 #define x264_cabac_encode_bypass(c,v)     ((c)->f8_bits_encoded += 256)
-#define x264_cabac_encode_ue_bypass(c,e,v) ((c)->f8_bits_encoded += (bs_size_ue(v+(1<<e)-1)-e)<<8)
+#define x264_cabac_encode_ue_bypass(c,e,v) ((c)->f8_bits_encoded += (bs_size_ue_big(v+(1<<e)-1)-e)<<8)
 #define x264_cabac_encode_flush(h,c)
 #define x264_macroblock_write_cabac  x264_macroblock_size_cabac
 #include "cabac.c"
index c3a065d0ed4c0be1a44a384bf515a7a07cb9db35..f47954d0db8532beb6ee139fedf1765bcb5628a9 100644 (file)
@@ -28,6 +28,8 @@
 #include "config.h"
 #endif
 
+#define bs_write_ue bs_write_ue_big
+
 static void transpose( uint8_t *buf, int w )
 {
     int i, j;
@@ -339,8 +341,8 @@ void x264_sps_write( bs_t *s, x264_sps_t *sps )
         bs_write1( s, sps->vui.b_timing_info_present );
         if( sps->vui.b_timing_info_present )
         {
-            bs_write( s, 32, sps->vui.i_num_units_in_tick );
-            bs_write( s, 32, sps->vui.i_time_scale );
+            bs_write32( s, sps->vui.i_num_units_in_tick );
+            bs_write32( s, sps->vui.i_time_scale );
             bs_write1( s, sps->vui.b_fixed_frame_rate );
         }