From 406a40dc41438edac3f60d231eb9196b3d33008f Mon Sep 17 00:00:00 2001
From: Fiona Glaser <fiona@x264.com>
Date: Sat, 27 Dec 2008 21:36:14 -0500
Subject: [PATCH] Much faster CABAC RDO

Since RDO doesn't care about what order bit costs are calculated, merge
sigmap and level coding into the same loop in RDO. This is bit-exact for
4x4dct but slightly incorrect for 8x8dct due to the sigmap containing
duplicated contexts. However, the PSNR penalty of this is extremely small
(~0.001 dB). The speed benefit is about 15% in 4x4dct and 30% in 8x8dct
residual bit cost calculation at QP20. The overall encoding speed benefit
is up to 5%, depending on encoding settings. Also remove an old,
unnecessary CABAC table that hasn't been used for years.

---
 common/cabac.c  |  35 ---------------
 encoder/cabac.c | 116 +++++++++++++++++++++++++++++++++++++++++-------
 2 files changed, 101 insertions(+), 50 deletions(-)

diff --git a/common/cabac.c b/common/cabac.c
index 722451bd..7a2e94dd 100644
--- a/common/cabac.c
+++ b/common/cabac.c
@@ -742,41 +742,6 @@ const uint8_t x264_cabac_renorm_shift[64]= {
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 };
 
-static const uint8_t x264_cabac_probability[128] =
-{
-    FIX8(0.9812), FIX8(0.9802), FIX8(0.9792), FIX8(0.9781),
-    FIX8(0.9769), FIX8(0.9757), FIX8(0.9744), FIX8(0.9730),
-    FIX8(0.9716), FIX8(0.9700), FIX8(0.9684), FIX8(0.9667),
-    FIX8(0.9650), FIX8(0.9631), FIX8(0.9611), FIX8(0.9590),
-    FIX8(0.9568), FIX8(0.9545), FIX8(0.9521), FIX8(0.9495),
-    FIX8(0.9468), FIX8(0.9440), FIX8(0.9410), FIX8(0.9378),
-    FIX8(0.9345), FIX8(0.9310), FIX8(0.9273), FIX8(0.9234),
-    FIX8(0.9193), FIX8(0.9150), FIX8(0.9105), FIX8(0.9057),
-    FIX8(0.9006), FIX8(0.8953), FIX8(0.8897), FIX8(0.8838),
-    FIX8(0.8776), FIX8(0.8710), FIX8(0.8641), FIX8(0.8569),
-    FIX8(0.8492), FIX8(0.8411), FIX8(0.8326), FIX8(0.8237),
-    FIX8(0.8143), FIX8(0.8043), FIX8(0.7938), FIX8(0.7828),
-    FIX8(0.7712), FIX8(0.7590), FIX8(0.7461), FIX8(0.7325),
-    FIX8(0.7182), FIX8(0.7031), FIX8(0.6872), FIX8(0.6705),
-    FIX8(0.6528), FIX8(0.6343), FIX8(0.6147), FIX8(0.5941),
-    FIX8(0.5724), FIX8(0.5495), FIX8(0.5254), FIX8(0.5000),
-    FIX8(0.5000), FIX8(0.4746), FIX8(0.4505), FIX8(0.4276),
-    FIX8(0.4059), FIX8(0.3853), FIX8(0.3657), FIX8(0.3472),
-    FIX8(0.3295), FIX8(0.3128), FIX8(0.2969), FIX8(0.2818),
-    FIX8(0.2675), FIX8(0.2539), FIX8(0.2410), FIX8(0.2288),
-    FIX8(0.2172), FIX8(0.2062), FIX8(0.1957), FIX8(0.1857),
-    FIX8(0.1763), FIX8(0.1674), FIX8(0.1589), FIX8(0.1508),
-    FIX8(0.1431), FIX8(0.1359), FIX8(0.1290), FIX8(0.1224),
-    FIX8(0.1162), FIX8(0.1103), FIX8(0.1047), FIX8(0.0994),
-    FIX8(0.0943), FIX8(0.0895), FIX8(0.0850), FIX8(0.0807),
-    FIX8(0.0766), FIX8(0.0727), FIX8(0.0690), FIX8(0.0655),
-    FIX8(0.0622), FIX8(0.0590), FIX8(0.0560), FIX8(0.0532),
-    FIX8(0.0505), FIX8(0.0479), FIX8(0.0455), FIX8(0.0432),
-    FIX8(0.0410), FIX8(0.0389), FIX8(0.0369), FIX8(0.0350),
-    FIX8(0.0333), FIX8(0.0316), FIX8(0.0300), FIX8(0.0284),
-    FIX8(0.0270), FIX8(0.0256), FIX8(0.0243), FIX8(0.0231),
-    FIX8(0.0219), FIX8(0.0208), FIX8(0.0198), FIX8(0.0187)
-};
 /* -ln2(probability) */
 #define F(a,b) {FIX8(a),FIX8(b)}
 const uint16_t x264_cabac_entropy[128][2] =
diff --git a/encoder/cabac.c b/encoder/cabac.c
index 93aa88ba..4fa74033 100644
--- a/encoder/cabac.c
+++ b/encoder/cabac.c
@@ -636,6 +636,7 @@ static const uint8_t coeff_abs_level_transition[2][8] = {
     { 4, 4, 4, 4, 5, 6, 7, 7 }
 };
 
+#if !RDO_SKIP_BS
 static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int i_ctxBlockCat, int i_idx, int16_t *l, int i_count )
 {
     const int i_ctx_sig = significant_coeff_flag_offset[h->mb.b_interlaced][i_ctxBlockCat];
@@ -692,9 +693,7 @@ static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int i_ctxBl
     if( i == i_last )
     {
         i_coeff_abs_m1[i_coeff] = abs(l[i]) - 1;
-#if !RDO_SKIP_BS
         i_coeff_sign[i_coeff]   = l[i] < 0;
-#endif
         i_coeff++;
     }
 
@@ -711,15 +710,10 @@ static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int i_ctxBl
         {
             x264_cabac_encode_decision( cb, ctx, 1 );
             ctx = coeff_abs_levelgt1_ctx[node_ctx] + i_ctx_level;
-#if RDO_SKIP_BS
-            cb->f8_bits_encoded += cabac_size_unary[i_prefix][cb->state[ctx]];
-            cb->state[ctx] = cabac_transition_unary[i_prefix][cb->state[ctx]];
-#else
             for( i = 0; i < i_prefix - 1; i++ )
                 x264_cabac_encode_decision( cb, ctx, 1 );
             if( i_prefix < 14 )
                 x264_cabac_encode_decision( cb, ctx, 0 );
-#endif
             if( i_prefix >= 14 )
                 x264_cabac_encode_ue_bypass( cb, 0, i_coeff_abs_m1[i_coeff] - 14 );
 
@@ -729,18 +723,110 @@ static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int i_ctxBl
         {
             x264_cabac_encode_decision( cb, ctx, 0 );
             node_ctx = coeff_abs_level_transition[0][node_ctx];
-#if RDO_SKIP_BS
-            x264_cabac_encode_bypass( cb, 0 ); // sign
-#endif
         }
 
-#if !RDO_SKIP_BS
         x264_cabac_encode_bypass( cb, i_coeff_sign[i_coeff] );
-#endif
     } while( i_coeff > 0 );
 }
+#define block_residual_write_cabac_8x8( h, cb, idx, l ) block_residual_write_cabac( h, cb, DCT_LUMA_8x8, idx, l, 64 ) /* bitstream build: 8x8 luma goes through the generic writer with 64 coefficients */
+
+#else
+
+/* RDO-only bit-cost estimate for one residual block: sigmap and level coding are merged into
+ * a single backward pass over the coefficients.  Note that for 8x8dct this is slightly incorrect
+ * because the sigmap is not reversible (contexts are repeated).  However, there is nearly no
+ * quality penalty for this (~0.001 dB) and the speed boost (~30%) is worth it. */
+static void ALWAYS_INLINE block_residual_write_cabac_internal( x264_t *h, x264_cabac_t *cb, int i_ctxBlockCat, int i_idx, int16_t *l, int i_count, int b_8x8 ) /* b_8x8: nonzero selects the 8x8 context-offset tables and skips the coded block flag */
+{
+    const int i_ctx_sig = significant_coeff_flag_offset[h->mb.b_interlaced][i_ctxBlockCat]; /* base context of the significance flags for this block category */
+    const int i_ctx_last = last_coeff_flag_offset[h->mb.b_interlaced][i_ctxBlockCat]; /* base context of the last-coefficient flags */
+    const int i_ctx_level = coeff_abs_level_m1_offset[i_ctxBlockCat]; /* base context of the level (abs-1) bins */
+    const uint8_t *significant_coeff_flag_offset = significant_coeff_flag_offset_8x8[h->mb.b_interlaced]; /* from here on this local shadows the table of the same name indexed above */
+    int i_last, i_coeff_abs_m1, ctx, i_prefix, i, node_ctx;
+
+    if( !b_8x8 ) /* the coded block flag is only costed for non-8x8 calls */
+    {
+        /* coded block flag */
+        ctx = 85 + x264_cabac_mb_cbf_ctxidxinc( h, i_ctxBlockCat, i_idx );
+        if( h->mb.cache.non_zero_count[x264_scan8[i_idx]] )
+            x264_cabac_encode_decision( cb, ctx, 1 );
+        else
+        {
+            x264_cabac_encode_decision( cb, ctx, 0 );
+            return; /* cached non_zero_count is 0: stop after the flag */
+        }
+    }
+
+    i_last = h->quantf.coeff_last[i_ctxBlockCat](l); /* presumably the index of the last nonzero coefficient -- confirm against quantf.coeff_last */
 
+    i_coeff_abs_m1 = abs(l[i_last]) - 1; /* magnitude minus one of the coefficient at i_last */
+    i_prefix = X264_MIN( i_coeff_abs_m1, 14 ); /* prefix is capped at 14; the excess is costed as a bypass suffix below */
+    ctx = coeff_abs_level1_ctx[0] + i_ctx_level; /* index 0: node_ctx has not been advanced yet for the first coefficient costed */
 
+    if( i_last != i_count - 1 ) /* sig/last flags are not costed when i_last is the final position */
+    {
+        x264_cabac_encode_decision( cb, i_ctx_sig + (b_8x8?significant_coeff_flag_offset[i_last]:i_last), 1 );
+        x264_cabac_encode_decision( cb, i_ctx_last + (b_8x8?last_coeff_flag_offset_8x8[i_last]:i_last), 1 );
+    }
+
+    if( i_prefix ) /* magnitude > 1 */
+    {
+        x264_cabac_encode_decision( cb, ctx, 1 );
+        ctx = coeff_abs_levelgt1_ctx[0] + i_ctx_level;
+        cb->f8_bits_encoded += cabac_size_unary[i_prefix][cb->state[ctx]]; /* table lookup instead of per-bin encoding; no separate sign bypass on this path (presumably folded into the table -- confirm) */
+        cb->state[ctx] = cabac_transition_unary[i_prefix][cb->state[ctx]]; /* new context state comes from the matching transition table */
+        if( i_prefix >= 14 )
+            x264_cabac_encode_ue_bypass( cb, 0, i_coeff_abs_m1 - 14 ); /* remainder beyond the capped prefix */
+        node_ctx = coeff_abs_level_transition[1][0];
+    }
+    else /* magnitude == 1 */
+    {
+        x264_cabac_encode_decision( cb, ctx, 0 );
+        node_ctx = coeff_abs_level_transition[0][0];
+        x264_cabac_encode_bypass( cb, 0 ); // sign
+    }
+
+    for( i = i_last-1 ; i >= 0; i-- ) /* remaining positions, highest index first */
+    {
+        if( l[i] )
+        {
+            x264_cabac_encode_decision( cb, i_ctx_sig + (b_8x8?significant_coeff_flag_offset[i]:i), 1 );
+            x264_cabac_encode_decision( cb, i_ctx_last + (b_8x8?last_coeff_flag_offset_8x8[i]:i), 0 ); /* significant, but not the last one */
+            ctx = coeff_abs_level1_ctx[node_ctx] + i_ctx_level;
+
+            if( (unsigned)(l[i]+1) > 2 ) /* l[i] is nonzero here, so this holds exactly when abs(l[i]) > 1 */
+            {
+                i_coeff_abs_m1 = abs(l[i]) - 1;
+                i_prefix = X264_MIN( i_coeff_abs_m1, 14 );
+                x264_cabac_encode_decision( cb, ctx, 1 );
+                ctx = coeff_abs_levelgt1_ctx[node_ctx] + i_ctx_level;
+                cb->f8_bits_encoded += cabac_size_unary[i_prefix][cb->state[ctx]]; /* same table-based prefix cost as for the first coefficient */
+                cb->state[ctx] = cabac_transition_unary[i_prefix][cb->state[ctx]];
+                if( i_prefix >= 14 )
+                    x264_cabac_encode_ue_bypass( cb, 0, i_coeff_abs_m1 - 14 );
+                node_ctx = coeff_abs_level_transition[1][node_ctx];
+            }
+            else
+            {
+                x264_cabac_encode_decision( cb, ctx, 0 );
+                node_ctx = coeff_abs_level_transition[0][node_ctx];
+                x264_cabac_encode_bypass( cb, 0 ); /* sign */
+            }
+        }
+        else /* zero coefficient: only the significance flag is costed */
+            x264_cabac_encode_decision( cb, i_ctx_sig + (b_8x8?significant_coeff_flag_offset[i]:i), 0 );
+    }
+}
+
+static void block_residual_write_cabac_8x8( x264_t *h, x264_cabac_t *cb, int i_idx, int16_t *l ) /* RDO build: cost one 8x8 luma residual block */
+{
+    block_residual_write_cabac_internal( h, cb, DCT_LUMA_8x8, i_idx, l, 64, 1 ); /* fixed category and 64 coefficients; b_8x8=1 selects the 8x8 offset tables and skips the coded block flag */
+}
+static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int i_ctxBlockCat, int i_idx, int16_t *l, int i_count ) /* RDO build: generic entry point, same signature as the bitstream-writing variant */
+{
+    block_residual_write_cabac_internal( h, cb, i_ctxBlockCat, i_idx, l, i_count, 0 ); /* b_8x8=0: the coded block flag is costed and the coefficient index is used directly as the context offset */
+}
+#endif
 
 void x264_macroblock_write_cabac( x264_t *h, x264_cabac_t *cb )
 {
@@ -959,7 +1045,7 @@ void x264_macroblock_write_cabac( x264_t *h, x264_cabac_t *cb )
         {
             for( i = 0; i < 4; i++ )
                 if( h->mb.i_cbp_luma & ( 1 << i ) )
-                    block_residual_write_cabac( h, cb, DCT_LUMA_8x8, i, h->dct.luma8x8[i], 64 );
+                    block_residual_write_cabac_8x8( h, cb, i, h->dct.luma8x8[i] );
         }
         else
         {
@@ -1024,7 +1110,7 @@ static void x264_partition_size_cabac( x264_t *h, x264_cabac_t *cb, int i8, int
         if( h->mb.i_cbp_luma & (1 << i8) )
         {
             if( h->mb.b_transform_8x8 )
-                block_residual_write_cabac( h, cb, DCT_LUMA_8x8, i8, h->dct.luma8x8[i8], 64 );
+                block_residual_write_cabac_8x8( h, cb, i8, h->dct.luma8x8[i8] );
             else
             {
                 int i4;
@@ -1063,7 +1149,7 @@ static void x264_partition_i8x8_size_cabac( x264_t *h, x264_cabac_t *cb, int i8,
     {
         *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4]] = 0x0101;
         *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4+2]] = 0x0101;
-        block_residual_write_cabac( h, cb, DCT_LUMA_8x8, 4*i8, h->dct.luma8x8[i8], 64 );
+        block_residual_write_cabac_8x8( h, cb, 4*i8, h->dct.luma8x8[i8] );
     }
     else
     {
-- 
2.40.0