From c656d68ff441d2925afcb40ccfaf49279fd95656 Mon Sep 17 00:00:00 2001
From: Fiona Glaser <fiona@x264.com>
Date: Sat, 7 Feb 2009 02:27:16 -0800
Subject: [PATCH] Much faster CABAC residual context selection Up to ~17%
 faster CABAC RDO, ~36% faster intra-only CABAC RDO. Up to 7% faster overall
 in extreme cases.

---
 encoder/cabac.c | 121 ++++++++++++++++--------------------------------
 1 file changed, 39 insertions(+), 82 deletions(-)

diff --git a/encoder/cabac.c b/encoder/cabac.c
index acdd4319..fb1d1227 100644
--- a/encoder/cabac.c
+++ b/encoder/cabac.c
@@ -505,92 +505,43 @@ static inline void x264_cabac_mb8x8_mvd( x264_t *h, x264_cabac_t *cb, int i_list
     }
 }
 
-static int x264_cabac_mb_cbf_ctxidxinc( x264_t *h, int i_cat, int i_idx )
+/* i_ctxBlockCat: 0-> DC 16x16  i_idx = 0
+ *                1-> AC 16x16  i_idx = luma4x4idx
+ *                2-> Luma4x4   i_idx = luma4x4idx
+ *                3-> DC Chroma i_idx = iCbCr
+ *                4-> AC Chroma i_idx = 4 * iCbCr + chroma4x4idx
+ *                5-> Luma8x8   i_idx = luma8x8idx
+ */
+
+static int ALWAYS_INLINE x264_cabac_mb_cbf_ctxidxinc( x264_t *h, int i_cat, int i_idx )
 {
-    /* i_ctxBlockCat: 0-> DC 16x16  i_idx = 0
-     *                1-> AC 16x16  i_idx = luma4x4idx
-     *                2-> Luma4x4   i_idx = luma4x4idx
-     *                3-> DC Chroma i_idx = iCbCr
-     *                4-> AC Chroma i_idx = 4 * iCbCr + chroma4x4idx
-     *                5-> Luma8x8   i_idx = luma8x8idx
-     */
-
-    int i_mba_xy = -1;
-    int i_mbb_xy = -1;
-    int i_nza = 0;
-    int i_nzb = 0;
+    int i_nza;
+    int i_nzb;
+    int b_intra = IS_INTRA( h->mb.i_type );
 
     switch( i_cat )
     {
-        case DCT_LUMA_DC:
-            if( h->mb.i_neighbour & MB_LEFT )
-            {
-                i_mba_xy = h->mb.i_mb_xy - 1;
-                i_nza = h->mb.cbp[i_mba_xy] & 0x100;
-            }
-            if( h->mb.i_neighbour & MB_TOP )
-            {
-                i_mbb_xy = h->mb.i_mb_top_xy;
-                i_nzb = h->mb.cbp[i_mbb_xy] & 0x100;
-            }
-            break;
         case DCT_LUMA_AC:
         case DCT_LUMA_4x4:
-            if( i_idx & ~10 ) // block_idx_x > 0
-                i_mba_xy = h->mb.i_mb_xy;
-            else if( h->mb.i_neighbour & MB_LEFT )
-                i_mba_xy = h->mb.i_mb_xy - 1;
-
-            if( i_idx & ~5 ) // block_idx_y > 0
-                i_mbb_xy = h->mb.i_mb_xy;
-            else if( h->mb.i_neighbour & MB_TOP )
-                i_mbb_xy = h->mb.i_mb_top_xy;
-
+        case DCT_CHROMA_AC:
             /* no need to test for skip/pcm */
-            if( i_mba_xy >= 0 )
-                i_nza = h->mb.cache.non_zero_count[x264_scan8[i_idx] - 1];
-            if( i_mbb_xy >= 0 )
-                i_nzb = h->mb.cache.non_zero_count[x264_scan8[i_idx] - 8];
-            break;
+            i_nza = h->mb.cache.non_zero_count[x264_scan8[i_idx] - 1];
+            i_nzb = h->mb.cache.non_zero_count[x264_scan8[i_idx] - 8];
+            i_nza &= 0x7f + (b_intra << 7);
+            i_nzb &= 0x7f + (b_intra << 7);
+            return 4*i_cat + 2*!!i_nzb + !!i_nza;
+        case DCT_LUMA_DC:
+            /* Note: this depends on the exact values of MB_LEFT and MB_TOP enums */
+            i_nza = ((h->mb.cbp[h->mb.i_mb_xy - 1] >> 8) | ~h->mb.i_neighbour) & 1;
+            i_nzb = ((h->mb.cbp[h->mb.i_mb_top_xy] >> 7) | ~h->mb.i_neighbour) & 2;
+            return 4*i_cat + i_nzb + i_nza;
         case DCT_CHROMA_DC:
             /* no need to test skip/pcm */
             i_idx -= 25;
-            if( h->mb.i_neighbour & MB_LEFT )
-            {
-                i_mba_xy = h->mb.i_mb_xy - 1;
-                i_nza = h->mb.cbp[i_mba_xy] & (0x200 << i_idx);
-            }
-            if( h->mb.i_neighbour & MB_TOP )
-            {
-                i_mbb_xy = h->mb.i_mb_top_xy;
-                i_nzb = h->mb.cbp[i_mbb_xy] & (0x200 << i_idx);
-            }
-            break;
-        case DCT_CHROMA_AC:
-            if( i_idx & 1 )
-                i_mba_xy = h->mb.i_mb_xy;
-            else if( h->mb.i_neighbour & MB_LEFT )
-                i_mba_xy = h->mb.i_mb_xy - 1;
-
-            if( i_idx & 2 )
-                i_mbb_xy = h->mb.i_mb_xy;
-            else if( h->mb.i_neighbour & MB_TOP )
-                i_mbb_xy = h->mb.i_mb_top_xy;
-
-            /* no need to test skip/pcm */
-            if( i_mba_xy >= 0 )
-                i_nza = h->mb.cache.non_zero_count[x264_scan8[i_idx] - 1];
-            if( i_mbb_xy >= 0 )
-                i_nzb = h->mb.cache.non_zero_count[x264_scan8[i_idx] - 8];
+            i_nza = h->mb.i_neighbour & MB_LEFT ? (h->mb.cbp[h->mb.i_mb_xy - 1] >> (9 + i_idx)) & 1 : b_intra;
+            i_nzb = h->mb.i_neighbour & MB_TOP  ? (h->mb.cbp[h->mb.i_mb_top_xy] >> (9 + i_idx)) & 1 : b_intra;
+            return 4*i_cat + 2*i_nzb + i_nza;
     }
-
-    if( IS_INTRA( h->mb.i_type ) )
-    {
-        i_nza |= i_mba_xy < 0;
-        i_nzb |= i_mbb_xy < 0;
-    }
-
-    return 4*i_cat + 2*!!i_nzb + !!i_nza;
 }
 
 
@@ -637,7 +588,7 @@ static const uint8_t coeff_abs_level_transition[2][8] = {
 };
 
 #if !RDO_SKIP_BS
-static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int i_ctxBlockCat, int i_idx, int16_t *l, int i_count )
+static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int i_ctxBlockCat, int i_idx, int16_t *l, int i_count, int ctxidxinc )
 {
     const int i_ctx_sig = significant_coeff_flag_offset[h->mb.b_interlaced][i_ctxBlockCat];
     const int i_ctx_last = last_coeff_flag_offset[h->mb.b_interlaced][i_ctxBlockCat];
@@ -655,7 +606,7 @@ static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int i_ctxBl
     if( i_count != 64 )
     {
         /* coded block flag */
-        int ctx = 85 + x264_cabac_mb_cbf_ctxidxinc( h, i_ctxBlockCat, i_idx );
+        int ctx = 85 + ctxidxinc;
         if( h->mb.cache.non_zero_count[x264_scan8[i_idx]] )
             x264_cabac_encode_decision( cb, ctx, 1 );
         else
@@ -736,7 +687,7 @@ static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int i_ctxBl
  * this is slightly incorrect because the sigmap is not reversible
  * (contexts are repeated).  However, there is nearly no quality penalty
  * for this (~0.001db) and the speed boost (~30%) is worth it. */
-static void ALWAYS_INLINE block_residual_write_cabac_internal( x264_t *h, x264_cabac_t *cb, int i_ctxBlockCat, int i_idx, int16_t *l, int i_count, int b_8x8 )
+static void ALWAYS_INLINE block_residual_write_cabac_internal( x264_t *h, x264_cabac_t *cb, int i_ctxBlockCat, int i_idx, int16_t *l, int i_count, int ctxidxinc, int b_8x8 )
 {
     const int i_ctx_sig = significant_coeff_flag_offset[h->mb.b_interlaced][i_ctxBlockCat];
     const int i_ctx_last = last_coeff_flag_offset[h->mb.b_interlaced][i_ctxBlockCat];
@@ -747,7 +698,7 @@ static void ALWAYS_INLINE block_residual_write_cabac_internal( x264_t *h, x264_c
     if( !b_8x8 )
     {
         /* coded block flag */
-        ctx = 85 + x264_cabac_mb_cbf_ctxidxinc( h, i_ctxBlockCat, i_idx );
+        ctx = 85 + ctxidxinc;
         if( h->mb.cache.non_zero_count[x264_scan8[i_idx]] )
             x264_cabac_encode_decision( cb, ctx, 1 );
         else
@@ -820,14 +771,20 @@ static void ALWAYS_INLINE block_residual_write_cabac_internal( x264_t *h, x264_c
 
 static void block_residual_write_cabac_8x8( x264_t *h, x264_cabac_t *cb, int i_idx, int16_t *l )
 {
-    block_residual_write_cabac_internal( h, cb, DCT_LUMA_8x8, i_idx, l, 64, 1 );
+    block_residual_write_cabac_internal( h, cb, DCT_LUMA_8x8, i_idx, l, 64, 0, 1 );
 }
-static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int i_ctxBlockCat, int i_idx, int16_t *l, int i_count )
+static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int i_ctxBlockCat, int i_idx, int16_t *l, int i_count, int ctxidxinc )
 {
-    block_residual_write_cabac_internal( h, cb, i_ctxBlockCat, i_idx, l, i_count, 0 );
+    block_residual_write_cabac_internal( h, cb, i_ctxBlockCat, i_idx, l, i_count, ctxidxinc, 0 );
 }
 #endif
 
+#define block_residual_write_cabac( h, cb, i_ctxBlockCat, i_idx, l, i_count ) \
+{ \
+    int ctxidxinc = x264_cabac_mb_cbf_ctxidxinc( h, i_ctxBlockCat, i_idx); \
+    block_residual_write_cabac( h, cb, i_ctxBlockCat, i_idx, l, i_count, ctxidxinc ); \
+}
+
 void x264_macroblock_write_cabac( x264_t *h, x264_cabac_t *cb )
 {
     const int i_mb_type = h->mb.i_type;
-- 
2.40.0