From: Loren Merritt <pengvado@videolan.org>
Date: Tue, 14 Jun 2005 19:19:52 +0000 (+0000)
Subject: rate-distortion optimized MB types in I- and P-frames (--subme 6)
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=15ecd54fc67e75ccd380a7e36720f1a0c2514f94;p=libx264

rate-distortion optimized MB types in I- and P-frames (--subme 6)


git-svn-id: svn://svn.videolan.org/x264/trunk@262 df754926-b1dd-0310-bc7b-ec298dee348c
---

diff --git a/common/cabac.h b/common/cabac.h
index 945fb17a..db15ee09 100644
--- a/common/cabac.h
+++ b/common/cabac.h
@@ -75,5 +75,9 @@ void x264_cabac_encode_bypass( x264_cabac_t *cb, int b );
 void x264_cabac_encode_terminal( x264_cabac_t *cb, int b );
 void x264_cabac_encode_flush( x264_cabac_t *cb );
 
+static inline int x264_cabac_pos( x264_cabac_t *cb )
+{   /* bs_pos() of the output stream plus cb->i_bits_outstanding (presumably bits not yet written out -- confirm in cabac.c) */
+    return bs_pos( cb->s ) + cb->i_bits_outstanding;
+}
 
 #endif
diff --git a/common/common.h b/common/common.h
index a50867f9..ebd2ec90 100644
--- a/common/common.h
+++ b/common/common.h
@@ -404,6 +404,7 @@ struct x264_t
         } cache;
 
         /* */
+        int     i_qp;       /* current qp */
         int     i_last_qp;  /* last qp */
         int     i_last_dqp; /* last delta qp */
         int     b_variable_qp; /* whether qp is allowed to vary per macroblock */
diff --git a/common/macroblock.c b/common/macroblock.c
index 03cf373b..85a73c7d 100644
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -1485,7 +1485,7 @@ void x264_macroblock_cache_save( x264_t *h )
     if( h->param.b_cabac )
     {
         if( i_mb_type == I_4x4 || i_mb_type == I_16x16 )
-            h->mb.chroma_pred_mode[i_mb_xy] = h->mb.i_chroma_pred_mode;
+            h->mb.chroma_pred_mode[i_mb_xy] = x264_mb_pred_mode8x8c_fix[ h->mb.i_chroma_pred_mode ];
         else
             h->mb.chroma_pred_mode[i_mb_xy] = I_PRED_CHROMA_DC;
 
diff --git a/encoder/analyse.c b/encoder/analyse.c
index ab7507fe..2d0c1c30 100644
--- a/encoder/analyse.c
+++ b/encoder/analyse.c
@@ -1,10 +1,11 @@
 /*****************************************************************************
  * analyse.c: h264 encoder library
  *****************************************************************************
- * Copyright (C) 2003 Laurent Aimar
+ * Copyright (C) 2003 x264 project
  * $Id: analyse.c,v 1.1 2004/06/03 19:27:08 fenrir Exp $
  *
  * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *          Loren Merritt <lorenm@u.washington.edu>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -32,6 +33,8 @@
 #include "macroblock.h"
 #include "me.h"
 #include "ratecontrol.h"
+#include "analyse.h"
+#include "rdo.c"
 
 typedef struct
 {
@@ -69,13 +72,16 @@ typedef struct
 {
     /* conduct the analysis using this lamda and QP */
     int i_lambda;
+    int i_lambda2;
     int i_qp;
     int16_t *p_cost_mv;
+    int b_mbrd;
 
 
     /* I: Intra part */
     /* Take some shortcuts in intra search if intra is deemed unlikely */
     int b_fast_intra;
+    int i_best_satd;
 
     /* Luma part */
     int i_sad_i16x16;
@@ -111,6 +117,7 @@ typedef struct
 
 } x264_mb_analysis_t;
 
+/* lambda = pow(2,qp/6-2) */
 static const int i_qp0_cost_table[52] = {
    1, 1, 1, 1, 1, 1, 1, 1,  /*  0-7 */
    1, 1, 1, 1,              /*  8-11 */
@@ -121,6 +128,19 @@ static const int i_qp0_cost_table[52] = {
   40,45,51,57,64,72,81,91   /* 44-51 */
 };
 
+/* lambda2 indexed by qp: pow(lambda,2) * .9 with lambda = pow(2,qp/6-2), rounded, never below 1 */
+static const int i_qp0_cost2_table[52] = {
+   1,   1,   1,   1,   1,   1, /*  0-5  */
+   1,   1,   1,   1,   1,   1, /*  6-11 */
+   1,   1,   1,   2,   2,   3, /* 12-17 */
+   4,   5,   6,   7,   9,  11, /* 18-23 */
+  14,  18,  23,  29,  36,  46, /* 24-29 */
+  58,  73,  91, 115, 145, 183, /* 30-35 */
+ 230, 290, 366, 461, 581, 731, /* 36-41 */
+ 922,1161,1463,1843,2322,2926, /* 42-47 */
+3686,4645,5852,7373            /* 48-51 */
+};
+
 static const uint8_t block_idx_x[16] = {
     0, 1, 0, 1, 2, 3, 2, 3, 0, 1, 0, 1, 2, 3, 2, 3
 };
@@ -142,6 +162,8 @@ static const int i_sub_mb_p_cost_table[4] = {
     5, 3, 3, 1
 };
 
+static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a );
+
 /* initialize an array of lambda*nbits for all possible mvs */
 static void x264_mb_analyse_load_costs( x264_t *h, x264_mb_analysis_t *a )
 {
@@ -171,12 +193,13 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
     /* conduct the analysis using this lamda and QP */
     a->i_qp = i_qp;
     a->i_lambda = i_qp0_cost_table[i_qp];
+    a->i_lambda2 = i_qp0_cost2_table[i_qp];
+    a->b_mbrd = h->param.analyse.i_subpel_refine >= 6 && h->sh.i_type != SLICE_TYPE_B;
 
     h->mb.i_me_method = h->param.analyse.i_me_method;
     h->mb.i_subpel_refine = h->param.analyse.i_subpel_refine;
     h->mb.b_chroma_me = h->param.analyse.b_chroma_me && h->sh.i_type == SLICE_TYPE_P
                         && h->mb.i_subpel_refine >= 5;
-    a->b_fast_intra = 0;
 
     h->mb.b_transform_8x8 = 0;
 
@@ -186,6 +209,9 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
     a->i_sad_i4x4   =
     a->i_sad_i8x8chroma = COST_MAX;
 
+    a->b_fast_intra = 0;
+    a->i_best_satd = COST_MAX;
+
     /* II: Inter part P/B frame */
     if( h->sh.i_type != SLICE_TYPE_I )
     {
@@ -245,7 +271,8 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
         /* Fast intra decision */
         if( h->mb.i_mb_xy - h->sh.i_first_mb > 4 )
         {
-            if(   IS_INTRA( h->mb.i_mb_type_left )
+            if( a->b_mbrd
+               || IS_INTRA( h->mb.i_mb_type_left )
                || IS_INTRA( h->mb.i_mb_type_top )
                || IS_INTRA( h->mb.i_mb_type_topleft )
                || IS_INTRA( h->mb.i_mb_type_topright )
@@ -382,18 +409,73 @@ static void predict_4x4_mode_available( unsigned int i_neighbour,
     }
 }
 
-static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *res, int i_cost_inter )
+static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a )
+{
+    int i;
+    /* Purpose: choose the cheapest 8x8 chroma intra mode; stores mode and cost in a and the mode in h->mb. */
+    int i_max;
+    int predict_mode[9];
+
+    uint8_t *p_dstc[2], *p_srcc[2];
+    int      i_stride[2];
+    /* A cost below COST_MAX means an earlier call already searched with this analysis struct: nothing to redo. */
+    if( a->i_sad_i8x8chroma < COST_MAX )
+        return;
+
+    /* 8x8 prediction selection for chroma: planes 1 and 2 of the fdec (prediction target) and fenc (source) buffers */
+    p_dstc[0] = h->mb.pic.p_fdec[1];
+    p_dstc[1] = h->mb.pic.p_fdec[2];
+    p_srcc[0] = h->mb.pic.p_fenc[1];
+    p_srcc[1] = h->mb.pic.p_fenc[2];
+
+    i_stride[0] = h->mb.pic.i_stride[1];
+    i_stride[1] = h->mb.pic.i_stride[2];
+    /* Fill predict_mode[0..i_max-1] with the candidate modes for this macroblock's neighbour flags. */
+    predict_8x8chroma_mode_available( h->mb.i_neighbour, predict_mode, &i_max );
+    a->i_sad_i8x8chroma = COST_MAX;
+    for( i = 0; i < i_max; i++ )
+    {
+        int i_sad;
+        int i_mode;
+
+        i_mode = predict_mode[i];
+
+        /* write this mode's prediction into both chroma planes */
+        h->predict_8x8c[i_mode]( p_dstc[0], i_stride[0] );
+        h->predict_8x8c[i_mode]( p_dstc[1], i_stride[1] );
+
+        /* cost = SATD of both planes + lambda * bs_size_ue() of the remapped mode (presumably its Exp-Golomb bit length) */
+        i_sad = h->pixf.satd[PIXEL_8x8]( p_dstc[0], i_stride[0],
+                                         p_srcc[0], i_stride[0] ) +
+                h->pixf.satd[PIXEL_8x8]( p_dstc[1], i_stride[1],
+                                         p_srcc[1], i_stride[1] ) +
+                a->i_lambda * bs_size_ue( x264_mb_pred_mode8x8c_fix[i_mode] );
+
+        /* keep the mode with the lowest cost; on a tie the earlier candidate wins */
+        if( a->i_sad_i8x8chroma > i_sad )
+        {
+            a->i_predict8x8chroma = i_mode;
+            a->i_sad_i8x8chroma   = i_sad;
+        }
+    }
+    /* NOTE: the chroma fdec planes now hold the prediction of the last mode tried, not necessarily the best one. */
+    h->mb.i_chroma_pred_mode = a->i_predict8x8chroma;
+}
+
+static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_cost_inter )
 {
     const unsigned int flags = h->sh.i_type == SLICE_TYPE_I ? h->param.analyse.intra : h->param.analyse.inter;
     const int i_stride = h->mb.pic.i_stride[0];
     uint8_t  *p_src = h->mb.pic.p_fenc[0];
     uint8_t  *p_dst = h->mb.pic.p_fdec[0];
+    int      f8_satd_rd_ratio = 0;
 
     int i, idx;
-
     int i_max;
     int predict_mode[9];
 
+    const int i_satd_thresh = a->i_best_satd * 5/4 + a->i_lambda * 10;
+
     /*---------------- Try all mode and calculate their score ---------------*/
 
     /* 16x16 prediction selection */
@@ -404,34 +486,45 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *res, int i_cos
         int i_mode;
 
         i_mode = predict_mode[i];
-
-        /* we do the prediction */
         h->predict_16x16[i_mode]( p_dst, i_stride );
 
-        /* we calculate the diff and get the square sum of the diff */
         i_sad = h->pixf.satd[PIXEL_16x16]( p_dst, i_stride, p_src, i_stride ) +
-                res->i_lambda * bs_size_ue( x264_mb_pred_mode16x16_fix[i_mode] );
-        /* if i_score is lower it is better */
-        if( res->i_sad_i16x16 > i_sad )
+                a->i_lambda * bs_size_ue( x264_mb_pred_mode16x16_fix[i_mode] );
+        if( a->i_sad_i16x16 > i_sad )
         {
-            res->i_predict16x16 = i_mode;
-            res->i_sad_i16x16     = i_sad;
+            a->i_predict16x16 = i_mode;
+            a->i_sad_i16x16   = i_sad;
         }
     }
-    /* cavlc mb type prefix */
-    if( h->sh.i_type == SLICE_TYPE_B )
-        res->i_sad_i16x16 += res->i_lambda * i_mb_b_cost_table[I_16x16];
 
-    if( res->b_fast_intra )
+    if( a->b_mbrd )
     {
-        if( res->i_sad_i16x16 > 2*i_cost_inter )
+        f8_satd_rd_ratio = ((unsigned)i_cost_inter << 8) / a->i_best_satd + 1;
+        x264_mb_analyse_intra_chroma( h, a );
+        if( h->mb.b_chroma_me )
+            a->i_sad_i16x16 += a->i_sad_i8x8chroma;
+        if( a->i_sad_i16x16 < i_satd_thresh )
+        {
+            h->mb.i_type = I_16x16;
+            h->mb.i_intra16x16_pred_mode = a->i_predict16x16;
+            a->i_sad_i16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
+        }
+        else
+            a->i_sad_i16x16 = a->i_sad_i16x16 * f8_satd_rd_ratio >> 8;
+    }
+    else
+    {
+        if( h->sh.i_type == SLICE_TYPE_B )
+            /* cavlc mb type prefix */
+            a->i_sad_i16x16 += a->i_lambda * i_mb_b_cost_table[I_16x16];
+        if( a->b_fast_intra && a->i_sad_i16x16 > 2*i_cost_inter )
             return;
     }
 
     /* 4x4 prediction selection */
     if( flags & X264_ANALYSE_I4x4 )
     {
-        res->i_sad_i4x4 = 0;
+        a->i_sad_i4x4 = 0;
         for( idx = 0; idx < 16; idx++ )
         {
             uint8_t *p_src_by;
@@ -455,40 +548,51 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *res, int i_cos
                 int i_mode;
 
                 i_mode = predict_mode[i];
-
-                /* we do the prediction */
                 h->predict_4x4[i_mode]( p_dst_by, i_stride );
 
-                /* we calculate diff and get the square sum of the diff */
                 i_sad = h->pixf.satd[PIXEL_4x4]( p_dst_by, i_stride,
-                                                 p_src_by, i_stride );
-
-                i_sad += res->i_lambda * (i_pred_mode == x264_mb_pred_mode4x4_fix[i_mode] ? 1 : 4);
+                                                 p_src_by, i_stride )
+                      + a->i_lambda * (i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) ? 1 : 4);
 
-                /* if i_score is lower it is better */
                 if( i_best > i_sad )
                 {
-                    res->i_predict4x4[x][y] = i_mode;
+                    a->i_predict4x4[x][y] = i_mode;
                     i_best = i_sad;
                 }
             }
-            res->i_sad_i4x4 += i_best;
+            a->i_sad_i4x4 += i_best;
 
             /* we need to encode this block now (for next ones) */
-            h->predict_4x4[res->i_predict4x4[x][y]]( p_dst_by, i_stride );
-            x264_mb_encode_i4x4( h, idx, res->i_qp );
+            h->predict_4x4[a->i_predict4x4[x][y]]( p_dst_by, i_stride );
+            x264_mb_encode_i4x4( h, idx, a->i_qp );
 
-            h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = res->i_predict4x4[x][y];
+            h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[x][y];
+        }
+
+        a->i_sad_i4x4 += a->i_lambda * 24;    /* from JVT (SATD0) */
+        if( a->b_mbrd )
+        {
+            if( h->mb.b_chroma_me )
+                a->i_sad_i4x4 += a->i_sad_i8x8chroma;
+            if( a->i_sad_i4x4 < i_satd_thresh )
+            {
+                h->mb.i_type = I_4x4;
+                a->i_sad_i4x4 = x264_rd_cost_mb( h, a->i_lambda2 );
+            }
+            else
+                a->i_sad_i4x4 = a->i_sad_i4x4 * f8_satd_rd_ratio >> 8;
+        }
+        else
+        {
+            if( h->sh.i_type == SLICE_TYPE_B )
+                a->i_sad_i4x4 += a->i_lambda * i_mb_b_cost_table[I_4x4];
         }
-        res->i_sad_i4x4 += res->i_lambda * 24;    /* from JVT (SATD0) */
-        if( h->sh.i_type == SLICE_TYPE_B )
-            res->i_sad_i4x4 += res->i_lambda * i_mb_b_cost_table[I_4x4];
     }
 
     /* 8x8 prediction selection */
     if( flags & X264_ANALYSE_I8x8 )
     {
-        res->i_sad_i8x8 = 0;
+        a->i_sad_i8x8 = 0;
         for( idx = 0; idx < 4; idx++ )
         {
             uint8_t *p_src_by;
@@ -516,81 +620,43 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *res, int i_cos
 
                 /* could use sa8d, but it doesn't seem worth the speed cost (without mmx at least) */
                 i_sad = h->pixf.satd[PIXEL_8x8]( p_dst_by, i_stride,
-                                                 p_src_by, i_stride );
-
-                i_sad += res->i_lambda * (i_pred_mode == x264_mb_pred_mode4x4_fix[i_mode] ? 1 : 4);
+                                                 p_src_by, i_stride )
+                      + a->i_lambda * (i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) ? 1 : 4);
 
                 if( i_best > i_sad )
                 {
-                    res->i_predict8x8[x][y] = i_mode;
+                    a->i_predict8x8[x][y] = i_mode;
                     i_best = i_sad;
                 }
             }
-            res->i_sad_i8x8 += i_best;
+            a->i_sad_i8x8 += i_best;
 
             /* we need to encode this block now (for next ones) */
-            h->predict_8x8[res->i_predict8x8[x][y]]( p_dst_by, i_stride, h->mb.i_neighbour );
-            x264_mb_encode_i8x8( h, idx, res->i_qp );
+            h->predict_8x8[a->i_predict8x8[x][y]]( p_dst_by, i_stride, h->mb.i_neighbour );
+            x264_mb_encode_i8x8( h, idx, a->i_qp );
 
-            x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, res->i_predict4x4[x][y] );
+            x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[x][y] );
         }
-        // FIXME some bias like in i4x4?
-        if( h->sh.i_type == SLICE_TYPE_B )
-            res->i_sad_i8x8 += res->i_lambda * i_mb_b_cost_table[I_8x8];
-    }
-}
-
-static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *res )
-{
-    int i;
-
-    int i_max;
-    int predict_mode[9];
-
-    uint8_t *p_dstc[2], *p_srcc[2];
-    int      i_stride[2];
 
-    if( res->i_sad_i8x8chroma < COST_MAX )
-        return;
-
-    /* 8x8 prediction selection for chroma */
-    p_dstc[0] = h->mb.pic.p_fdec[1];
-    p_dstc[1] = h->mb.pic.p_fdec[2];
-    p_srcc[0] = h->mb.pic.p_fenc[1];
-    p_srcc[1] = h->mb.pic.p_fenc[2];
-
-    i_stride[0] = h->mb.pic.i_stride[1];
-    i_stride[1] = h->mb.pic.i_stride[2];
-
-    predict_8x8chroma_mode_available( h->mb.i_neighbour, predict_mode, &i_max );
-    res->i_sad_i8x8chroma = COST_MAX;
-    for( i = 0; i < i_max; i++ )
-    {
-        int i_sad;
-        int i_mode;
-
-        i_mode = predict_mode[i];
-
-        /* we do the prediction */
-        h->predict_8x8c[i_mode]( p_dstc[0], i_stride[0] );
-        h->predict_8x8c[i_mode]( p_dstc[1], i_stride[1] );
-
-        /* we calculate the cost */
-        i_sad = h->pixf.satd[PIXEL_8x8]( p_dstc[0], i_stride[0],
-                                         p_srcc[0], i_stride[0] ) +
-                h->pixf.satd[PIXEL_8x8]( p_dstc[1], i_stride[1],
-                                         p_srcc[1], i_stride[1] ) +
-                res->i_lambda * bs_size_ue( x264_mb_pred_mode8x8c_fix[i_mode] );
-
-        /* if i_score is lower it is better */
-        if( res->i_sad_i8x8chroma > i_sad )
+        if( a->b_mbrd )
+        {
+            if( h->mb.b_chroma_me )
+                a->i_sad_i8x8 += a->i_sad_i8x8chroma;
+            if( a->i_sad_i8x8 < i_satd_thresh )
+            {
+                h->mb.i_type = I_8x8;
+                a->i_sad_i8x8 = x264_rd_cost_mb( h, a->i_lambda2 );
+            }
+            else
+                a->i_sad_i8x8 = a->i_sad_i8x8 * f8_satd_rd_ratio >> 8;
+        }
+        else
         {
-            res->i_predict8x8chroma = i_mode;
-            res->i_sad_i8x8chroma   = i_sad;
+            // FIXME some bias like in i4x4?
+            if( h->sh.i_type == SLICE_TYPE_B )
+                a->i_sad_i8x8 += a->i_lambda * i_mb_b_cost_table[I_8x8];
         }
     }
-
-    h->mb.i_chroma_pred_mode = res->i_predict8x8chroma;
 }
 
 #define LOAD_FENC( m, src, xoff, yoff) \
@@ -646,11 +712,22 @@ static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
         h->mb.mvr[0][i_ref][h->mb.i_mb_xy][1] = m.mv[1];
     }
 
-    /* subtract ref cost, so we don't have to add it for the other P types */
-    a->l0.me16x16.cost -= a->i_lambda * bs_size_te( h->sh.i_num_ref_idx_l0_active - 1, a->l0.i_ref );
-
     /* Set global ref, needed for all others modes */
     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref );
+
+    if( a->b_mbrd )
+    {
+        a->i_best_satd = a->l0.me16x16.cost;
+        h->mb.i_type = P_L0;
+        h->mb.i_partition = D_16x16;
+        x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv[0], a->l0.me16x16.mv[1] );
+        a->l0.me16x16.cost = x264_rd_cost_mb( h, a->i_lambda2 );
+    }
+    else
+    {
+        /* subtract ref cost, so we don't have to add it for the other P types */
+        a->l0.me16x16.cost -= a->i_lambda * bs_size_te( h->sh.i_num_ref_idx_l0_active - 1, a->l0.i_ref );
+    }
 }
 
 static void x264_mb_analyse_inter_p8x8( x264_t *h, x264_mb_analysis_t *a )
@@ -693,7 +770,16 @@ static void x264_mb_analyse_inter_p8x8( x264_t *h, x264_mb_analysis_t *a )
     }
 
     a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
-                   a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
+                      a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
+    if( a->b_mbrd )
+    {
+        if( a->i_best_satd > a->l0.i_cost8x8 )
+            a->i_best_satd = a->l0.i_cost8x8;
+        h->mb.i_type = P_8x8;
+        h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
+        h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
+        a->l0.i_cost8x8 = x264_rd_cost_mb( h, a->i_lambda2 );
+    }
 }
 
 static void x264_mb_analyse_inter_p16x8( x264_t *h, x264_mb_analysis_t *a )
@@ -728,6 +814,13 @@ static void x264_mb_analyse_inter_p16x8( x264_t *h, x264_mb_analysis_t *a )
     }
 
     a->l0.i_cost16x8 = a->l0.me16x8[0].cost + a->l0.me16x8[1].cost;
+    if( a->b_mbrd )
+    {
+        if( a->i_best_satd > a->l0.i_cost16x8 )
+            a->i_best_satd = a->l0.i_cost16x8;
+        h->mb.i_type = P_L0;
+        a->l0.i_cost16x8 = x264_rd_cost_mb( h, a->i_lambda2 );
+    }
 }
 
 static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a )
@@ -762,6 +855,13 @@ static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a )
     }
 
     a->l0.i_cost8x16 = a->l0.me8x16[0].cost + a->l0.me8x16[1].cost;
+    if( a->b_mbrd )
+    {
+        if( a->i_best_satd > a->l0.i_cost8x16 )
+            a->i_best_satd = a->l0.i_cost8x16;
+        h->mb.i_type = P_L0;
+        a->l0.i_cost8x16 = x264_rd_cost_mb( h, a->i_lambda2 );
+    }
 }
 
 static int x264_mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a, uint8_t **p_fref, int i8x8, int pixel )
@@ -1345,12 +1445,13 @@ static void x264_mb_analyse_inter_b8x16( x264_t *h, x264_mb_analysis_t *a )
 
 static inline void x264_mb_analyse_transform( x264_t *h )
 {
-    if( h->param.analyse.b_transform_8x8
-        && !IS_INTRA( h->mb.i_type )
-        && x264_mb_transform_8x8_allowed( h ) )
+    h->mb.cache.b_transform_8x8_allowed =
+        h->param.analyse.b_transform_8x8
+        && !IS_INTRA( h->mb.i_type ) && x264_mb_transform_8x8_allowed( h );
+
+    if( h->mb.cache.b_transform_8x8_allowed )
     {
         int i_cost4, i_cost8;
-
         /* FIXME only luma mc is needed */
         x264_mb_mc( h );
 
@@ -1358,11 +1459,33 @@ static inline void x264_mb_analyse_transform( x264_t *h )
                                              h->mb.pic.p_fdec[0], h->mb.pic.i_stride[0] );
         i_cost4 = h->pixf.satd[PIXEL_16x16]( h->mb.pic.p_fenc[0], h->mb.pic.i_stride[0],
                                              h->mb.pic.p_fdec[0], h->mb.pic.i_stride[0] );
+
         h->mb.b_transform_8x8 = i_cost8 < i_cost4;
-        h->mb.cache.b_transform_8x8_allowed = 1;
     }
-    else
-        h->mb.cache.b_transform_8x8_allowed = 0;
+}
+
+static inline void x264_mb_analyse_transform_rd( x264_t *h, x264_mb_analysis_t *a, int *i_cost )
+{
+    h->mb.cache.b_transform_8x8_allowed =
+        h->param.analyse.b_transform_8x8 && x264_mb_transform_8x8_allowed( h );
+    /* RD check of the transform flag: flip b_transform_8x8, re-cost the MB, keep the flip unless it is worse than *i_cost. */
+    if( h->mb.cache.b_transform_8x8_allowed )
+    {
+        int i_cost8; /* RD cost with the transform flag flipped (not necessarily the 8x8 setting) */
+        x264_analyse_update_cache( h, a ); /* load the chosen type's modes/mvs into the cache before costing */
+        h->mb.b_transform_8x8 = !h->mb.b_transform_8x8;
+        /* FIXME only luma is needed, but the score for comparison already includes chroma */
+        i_cost8 = x264_rd_cost_mb( h, a->i_lambda2 );
+
+        if( *i_cost >= i_cost8 ) /* a tie goes to the flipped setting */
+        {
+            if( *i_cost > 0 ) /* guard the division below */
+                a->i_best_satd = (int64_t)a->i_best_satd * i_cost8 / *i_cost; /* scale by the same ratio as the cost change */
+            *i_cost = i_cost8;
+        }
+        else
+            h->mb.b_transform_8x8 = !h->mb.b_transform_8x8; /* flipped setting was worse: restore the flag */
+    }
 }
 
 
@@ -1374,11 +1497,8 @@ void x264_macroblock_analyse( x264_t *h )
     x264_mb_analysis_t analysis;
     int i;
 
-    h->mb.qp[h->mb.i_mb_xy] = x264_ratecontrol_qp(h);
-
-    /* prevent QP from varying too fast. FIXME what's a sane limit? */
-    h->mb.qp[h->mb.i_mb_xy] = x264_clip3( h->mb.qp[h->mb.i_mb_xy],
-                                          h->mb.i_last_qp - 12, h->mb.i_last_qp + 12 );
+    h->mb.i_qp =
+    h->mb.qp[h->mb.i_mb_xy] = x264_ratecontrol_qp( h );
 
     /* init analysis */
     x264_mb_analyse_init( h, &analysis, h->mb.qp[h->mb.i_mb_xy] );
@@ -1424,6 +1544,7 @@ void x264_macroblock_analyse( x264_t *h )
             const unsigned int flags = h->param.analyse.inter;
             int i_type;
             int i_partition;
+            int i_thresh16x8;
 
             x264_mb_analyse_load_costs( h, &analysis );
 
@@ -1443,46 +1564,63 @@ void x264_macroblock_analyse( x264_t *h )
 
                 i_type = P_8x8;
                 i_partition = D_8x8;
-                h->mb.i_sub_partition[0] = D_L0_8x8;
-                h->mb.i_sub_partition[1] = D_L0_8x8;
-                h->mb.i_sub_partition[2] = D_L0_8x8;
-                h->mb.i_sub_partition[3] = D_L0_8x8;
+                h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
+                h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
 
                 i_cost = analysis.l0.i_cost8x8;
 
                 /* Do sub 8x8 */
                 if( flags & X264_ANALYSE_PSUB8x8 )
                 {
+                    int i_cost_bak = i_cost;
+                    int b_sub8x8 = 0;
                     for( i = 0; i < 4; i++ )
                     {
                         x264_mb_analyse_inter_p4x4( h, &analysis, i );
                         if( analysis.l0.i_cost4x4[i] < analysis.l0.me8x8[i].cost )
                         {
-                            int i_cost8x8;
-
+                            int i_cost8x8 = analysis.l0.i_cost4x4[i];
                             h->mb.i_sub_partition[i] = D_L0_4x4;
-                            i_cost8x8 = analysis.l0.i_cost4x4[i];
 
                             x264_mb_analyse_inter_p8x4( h, &analysis, i );
-                            if( analysis.l0.i_cost8x4[i] < analysis.l0.i_cost4x4[i] )
+                            if( analysis.l0.i_cost8x4[i] < i_cost8x8 )
                             {
                                 h->mb.i_sub_partition[i] = D_L0_8x4;
                                 i_cost8x8 = analysis.l0.i_cost8x4[i];
                             }
 
                             x264_mb_analyse_inter_p4x8( h, &analysis, i );
-                            if( analysis.l0.i_cost4x8[i] < analysis.l0.i_cost4x4[i] )
+                            if( analysis.l0.i_cost4x8[i] < i_cost8x8 )
                             {
                                 h->mb.i_sub_partition[i] = D_L0_4x8;
                                 i_cost8x8 = analysis.l0.i_cost4x8[i];
                             }
 
                             i_cost += i_cost8x8 - analysis.l0.me8x8[i].cost;
+                            b_sub8x8 = 1;
+                        }
+                    }
+                    /* TODO: RD per subpartition */
+                    if( b_sub8x8 && analysis.b_mbrd )
+                    {
+                        i_cost = x264_rd_cost_mb( h, analysis.i_lambda2 );
+                        if( i_cost > i_cost_bak )
+                        {
+                            i_cost = i_cost_bak;
+                            h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
+                            h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
                         }
                     }
                 }
+            }
 
-                /* Now do sub 16x8/8x16 */
+            /* Now do 16x8/8x16 */
+            i_thresh16x8 = analysis.l0.me8x8[1].cost_mv + analysis.l0.me8x8[2].cost_mv;
+            if( analysis.b_mbrd )
+                i_thresh16x8 = i_thresh16x8 * analysis.i_lambda2 / analysis.i_lambda;
+            if( ( flags & X264_ANALYSE_PSUB16x16 ) &&
+                analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost + i_thresh16x8 )
+            {
                 x264_mb_analyse_inter_p16x8( h, &analysis );
                 if( analysis.l0.i_cost16x8 < i_cost )
                 {
@@ -1500,28 +1638,33 @@ void x264_macroblock_analyse( x264_t *h )
                 }
             }
 
-            h->mb.i_type = i_type;
             h->mb.i_partition = i_partition;
 
             /* refine qpel */
-            if( h->mb.i_partition == D_16x16 )
+            //FIXME mb_type costs?
+            if( analysis.b_mbrd )
+            {
+                h->mb.i_type = i_type;
+                x264_mb_analyse_transform_rd( h, &analysis, &i_cost );
+            }
+            else if( i_partition == D_16x16 )
             {
                 x264_me_refine_qpel( h, &analysis.l0.me16x16 );
                 i_cost = analysis.l0.me16x16.cost;
             }
-            else if( h->mb.i_partition == D_16x8 )
+            else if( i_partition == D_16x8 )
             {
                 x264_me_refine_qpel( h, &analysis.l0.me16x8[0] );
                 x264_me_refine_qpel( h, &analysis.l0.me16x8[1] );
                 i_cost = analysis.l0.me16x8[0].cost + analysis.l0.me16x8[1].cost;
             }
-            else if( h->mb.i_partition == D_8x16 )
+            else if( i_partition == D_8x16 )
             {
                 x264_me_refine_qpel( h, &analysis.l0.me8x16[0] );
                 x264_me_refine_qpel( h, &analysis.l0.me8x16[1] );
                 i_cost = analysis.l0.me8x16[0].cost + analysis.l0.me8x16[1].cost;
             }
-            else if( h->mb.i_partition == D_8x8 )
+            else if( i_partition == D_8x8 )
             {
                 int i8x8;
                 i_cost = 0;
@@ -1564,10 +1707,10 @@ void x264_macroblock_analyse( x264_t *h )
             }
 
             x264_mb_analyse_intra( h, &analysis, i_cost );
-            if( h->mb.b_chroma_me &&
+            if( h->mb.b_chroma_me && !analysis.b_mbrd &&
                 ( analysis.i_sad_i16x16 < i_cost
-             ||   analysis.i_sad_i8x8 < i_cost
-             ||   analysis.i_sad_i4x4 < i_cost ))
+               || analysis.i_sad_i8x8 < i_cost
+               || analysis.i_sad_i4x4 < i_cost ))
             {
                 x264_mb_analyse_intra_chroma( h, &analysis );
                 analysis.i_sad_i16x16 += analysis.i_sad_i8x8chroma;
@@ -1591,10 +1734,11 @@ void x264_macroblock_analyse( x264_t *h )
 
             if( i_intra_cost < i_cost )
             {
-                h->mb.i_type = i_intra_type;
+                i_type = i_intra_type;
                 i_cost = i_intra_cost;
             }
 
+            h->mb.i_type = i_type;
             h->stat.frame.i_intra_cost += i_intra_cost;
             h->stat.frame.i_inter_cost += i_cost;
         }
@@ -1784,47 +1928,56 @@ void x264_macroblock_analyse( x264_t *h )
         }
     }
 
-    /*-------------------- Update MB from the analysis ----------------------*/
+    x264_analyse_update_cache( h, &analysis );
+
+    if( !analysis.b_mbrd )
+        x264_mb_analyse_transform( h );
+}
+
+/*-------------------- Update MB from the analysis ----------------------*/
+static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a  )
+{
+    int i;
+
     switch( h->mb.i_type )
     {
         case I_4x4:
             for( i = 0; i < 16; i++ )
             {
                 h->mb.cache.intra4x4_pred_mode[x264_scan8[i]] =
-                    analysis.i_predict4x4[block_idx_x[i]][block_idx_y[i]];
+                    a->i_predict4x4[block_idx_x[i]][block_idx_y[i]];
             }
 
-            x264_mb_analyse_intra_chroma( h, &analysis );
+            x264_mb_analyse_intra_chroma( h, a );
             break;
         case I_8x8:
-            h->mb.b_transform_8x8 = 1;
             for( i = 0; i < 4; i++ )
                 x264_macroblock_cache_intra8x8_pred( h, 2*(i&1), 2*(i>>1),
-                    analysis.i_predict8x8[i&1][i>>1] );
+                    a->i_predict8x8[i&1][i>>1] );
 
-            x264_mb_analyse_intra_chroma( h, &analysis );
+            x264_mb_analyse_intra_chroma( h, a );
             break;
         case I_16x16:
-            h->mb.i_intra16x16_pred_mode = analysis.i_predict16x16;
-            x264_mb_analyse_intra_chroma( h, &analysis );
+            h->mb.i_intra16x16_pred_mode = a->i_predict16x16;
+            x264_mb_analyse_intra_chroma( h, a );
             break;
 
         case P_L0:
-            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, analysis.l0.i_ref );
+            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref );
             switch( h->mb.i_partition )
             {
                 case D_16x16:
-                    x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, analysis.l0.me16x16.mv[0], analysis.l0.me16x16.mv[1] );
+                    x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv[0], a->l0.me16x16.mv[1] );
                     break;
 
                 case D_16x8:
-                    x264_macroblock_cache_mv ( h, 0, 0, 4, 2, 0, analysis.l0.me16x8[0].mv[0], analysis.l0.me16x8[0].mv[1] );
-                    x264_macroblock_cache_mv ( h, 0, 2, 4, 2, 0, analysis.l0.me16x8[1].mv[0], analysis.l0.me16x8[1].mv[1] );
+                    x264_macroblock_cache_mv ( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].mv[0], a->l0.me16x8[0].mv[1] );
+                    x264_macroblock_cache_mv ( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].mv[0], a->l0.me16x8[1].mv[1] );
                     break;
 
                 case D_8x16:
-                    x264_macroblock_cache_mv ( h, 0, 0, 2, 4, 0, analysis.l0.me8x16[0].mv[0], analysis.l0.me8x16[0].mv[1] );
-                    x264_macroblock_cache_mv ( h, 2, 0, 2, 4, 0, analysis.l0.me8x16[1].mv[0], analysis.l0.me8x16[1].mv[1] );
+                    x264_macroblock_cache_mv ( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].mv[0], a->l0.me8x16[0].mv[1] );
+                    x264_macroblock_cache_mv ( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].mv[0], a->l0.me8x16[1].mv[1] );
                     break;
 
                 default:
@@ -1834,7 +1987,7 @@ void x264_macroblock_analyse( x264_t *h )
             break;
 
         case P_8x8:
-            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, analysis.l0.i_ref );
+            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref );
             for( i = 0; i < 4; i++ )
             {
                 const int x = 2*(i%2);
@@ -1843,21 +1996,21 @@ void x264_macroblock_analyse( x264_t *h )
                 switch( h->mb.i_sub_partition[i] )
                 {
                     case D_L0_8x8:
-                        x264_macroblock_cache_mv( h, x, y, 2, 2, 0, analysis.l0.me8x8[i].mv[0], analysis.l0.me8x8[i].mv[1] );
+                        x264_macroblock_cache_mv( h, x, y, 2, 2, 0, a->l0.me8x8[i].mv[0], a->l0.me8x8[i].mv[1] );
                         break;
                     case D_L0_8x4:
-                        x264_macroblock_cache_mv( h, x, y+0, 2, 1, 0, analysis.l0.me8x4[i][0].mv[0], analysis.l0.me8x4[i][0].mv[1] );
-                        x264_macroblock_cache_mv( h, x, y+1, 2, 1, 0, analysis.l0.me8x4[i][1].mv[0], analysis.l0.me8x4[i][1].mv[1] );
+                        x264_macroblock_cache_mv( h, x, y+0, 2, 1, 0, a->l0.me8x4[i][0].mv[0], a->l0.me8x4[i][0].mv[1] );
+                        x264_macroblock_cache_mv( h, x, y+1, 2, 1, 0, a->l0.me8x4[i][1].mv[0], a->l0.me8x4[i][1].mv[1] );
                         break;
                     case D_L0_4x8:
-                        x264_macroblock_cache_mv( h, x+0, y, 1, 2, 0, analysis.l0.me4x8[i][0].mv[0], analysis.l0.me4x8[i][0].mv[1] );
-                        x264_macroblock_cache_mv( h, x+1, y, 1, 2, 0, analysis.l0.me4x8[i][1].mv[0], analysis.l0.me4x8[i][1].mv[1] );
+                        x264_macroblock_cache_mv( h, x+0, y, 1, 2, 0, a->l0.me4x8[i][0].mv[0], a->l0.me4x8[i][0].mv[1] );
+                        x264_macroblock_cache_mv( h, x+1, y, 1, 2, 0, a->l0.me4x8[i][1].mv[0], a->l0.me4x8[i][1].mv[1] );
                         break;
                     case D_L0_4x4:
-                        x264_macroblock_cache_mv( h, x+0, y+0, 1, 1, 0, analysis.l0.me4x4[i][0].mv[0], analysis.l0.me4x4[i][0].mv[1] );
-                        x264_macroblock_cache_mv( h, x+1, y+0, 1, 1, 0, analysis.l0.me4x4[i][1].mv[0], analysis.l0.me4x4[i][1].mv[1] );
-                        x264_macroblock_cache_mv( h, x+0, y+1, 1, 1, 0, analysis.l0.me4x4[i][2].mv[0], analysis.l0.me4x4[i][2].mv[1] );
-                        x264_macroblock_cache_mv( h, x+1, y+1, 1, 1, 0, analysis.l0.me4x4[i][3].mv[0], analysis.l0.me4x4[i][3].mv[1] );
+                        x264_macroblock_cache_mv( h, x+0, y+0, 1, 1, 0, a->l0.me4x4[i][0].mv[0], a->l0.me4x4[i][0].mv[1] );
+                        x264_macroblock_cache_mv( h, x+1, y+0, 1, 1, 0, a->l0.me4x4[i][1].mv[0], a->l0.me4x4[i][1].mv[1] );
+                        x264_macroblock_cache_mv( h, x+0, y+1, 1, 1, 0, a->l0.me4x4[i][2].mv[0], a->l0.me4x4[i][2].mv[1] );
+                        x264_macroblock_cache_mv( h, x+1, y+1, 1, 1, 0, a->l0.me4x4[i][3].mv[0], a->l0.me4x4[i][3].mv[1] );
                         break;
                     default:
                         x264_log( h, X264_LOG_ERROR, "internal error\n" );
@@ -1890,7 +2043,7 @@ void x264_macroblock_analyse( x264_t *h )
         case B_8x8:
             /* optimize: cache might not need to be rewritten */
             for( i = 0; i < 4; i++ )
-                x264_mb_cache_mv_b8x8( h, &analysis, i, 1 );
+                x264_mb_cache_mv_b8x8( h, a, i, 1 );
             break;
 
         default: /* the rest of the B types */
@@ -1900,8 +2053,8 @@ void x264_macroblock_analyse( x264_t *h )
                 switch( h->mb.i_type )
                 {
                 case B_L0_L0:
-                    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, analysis.l0.i_ref );
-                    x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, analysis.l0.me16x16.mv[0], analysis.l0.me16x16.mv[1] );
+                    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref );
+                    x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv[0], a->l0.me16x16.mv[1] );
 
                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, -1 );
                     x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 1,  0, 0 );
@@ -1912,33 +2065,31 @@ void x264_macroblock_analyse( x264_t *h )
                     x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0,  0, 0 );
                     x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 0,  0, 0 );
 
-                    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, analysis.l1.i_ref );
-                    x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 1, analysis.l1.me16x16.mv[0], analysis.l1.me16x16.mv[1] );
+                    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.i_ref );
+                    x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 1, a->l1.me16x16.mv[0], a->l1.me16x16.mv[1] );
                     break;
                 case B_BI_BI:
-                    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, analysis.l0.i_ref );
-                    x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, analysis.l0.me16x16.mv[0], analysis.l0.me16x16.mv[1] );
+                    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref );
+                    x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv[0], a->l0.me16x16.mv[1] );
 
-                    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, analysis.l1.i_ref );
-                    x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 1, analysis.l1.me16x16.mv[0], analysis.l1.me16x16.mv[1] );
+                    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.i_ref );
+                    x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 1, a->l1.me16x16.mv[0], a->l1.me16x16.mv[1] );
                     break;
                 }
                 break;
             case D_16x8:
-                x264_mb_cache_mv_b16x8( h, &analysis, 0, 1 );
-                x264_mb_cache_mv_b16x8( h, &analysis, 1, 1 );
+                x264_mb_cache_mv_b16x8( h, a, 0, 1 );
+                x264_mb_cache_mv_b16x8( h, a, 1, 1 );
                 break;
             case D_8x16:
-                x264_mb_cache_mv_b8x16( h, &analysis, 0, 1 );
-                x264_mb_cache_mv_b8x16( h, &analysis, 1, 1 );
+                x264_mb_cache_mv_b8x16( h, a, 0, 1 );
+                x264_mb_cache_mv_b8x16( h, a, 1, 1 );
                 break;
             default:
                 x264_log( h, X264_LOG_ERROR, "internal error (invalid MB type)\n" );
                 break;
             }
     }
-
-    x264_mb_analyse_transform( h );
 }
 
 #include "slicetype_decision.c"
diff --git a/encoder/cabac.c b/encoder/cabac.c
index 9eec50c1..2e17cc02 100644
--- a/encoder/cabac.c
+++ b/encoder/cabac.c
@@ -44,39 +44,39 @@ static const uint8_t block_idx_xy[4][4] =
     { 5, 7, 13, 15}
 };
 
-static inline void x264_cabac_mb_type_intra( x264_t *h, int i_mb_type,
+static inline void x264_cabac_mb_type_intra( x264_t *h, x264_cabac_t *cb, int i_mb_type,
                     int ctx0, int ctx1, int ctx2, int ctx3, int ctx4, int ctx5 )
 {
     if( i_mb_type == I_4x4 || i_mb_type == I_8x8 )
     {
-        x264_cabac_encode_decision( &h->cabac, ctx0, 0 );
+        x264_cabac_encode_decision( cb, ctx0, 0 );
     }
     else if( i_mb_type == I_PCM )
     {
-        x264_cabac_encode_decision( &h->cabac, ctx0, 1 );
-        x264_cabac_encode_terminal( &h->cabac,       1 );
+        x264_cabac_encode_decision( cb, ctx0, 1 );
+        x264_cabac_encode_terminal( cb,       1 );
     }
     else
     {
-        x264_cabac_encode_decision( &h->cabac, ctx0, 1 );
-        x264_cabac_encode_terminal( &h->cabac,       0 );
+        x264_cabac_encode_decision( cb, ctx0, 1 );
+        x264_cabac_encode_terminal( cb,       0 );
 
-        x264_cabac_encode_decision( &h->cabac, ctx1, ( h->mb.i_cbp_luma == 0 ? 0 : 1 ));
+        x264_cabac_encode_decision( cb, ctx1, ( h->mb.i_cbp_luma == 0 ? 0 : 1 ));
         if( h->mb.i_cbp_chroma == 0 )
         {
-            x264_cabac_encode_decision( &h->cabac, ctx2, 0 );
+            x264_cabac_encode_decision( cb, ctx2, 0 );
         }
         else
         {
-            x264_cabac_encode_decision( &h->cabac, ctx2, 1 );
-            x264_cabac_encode_decision( &h->cabac, ctx3, ( h->mb.i_cbp_chroma == 1 ? 0 : 1 ) );
+            x264_cabac_encode_decision( cb, ctx2, 1 );
+            x264_cabac_encode_decision( cb, ctx3, ( h->mb.i_cbp_chroma == 1 ? 0 : 1 ) );
         }
-        x264_cabac_encode_decision( &h->cabac, ctx4, ( (h->mb.i_intra16x16_pred_mode / 2) ? 1 : 0 ));
-        x264_cabac_encode_decision( &h->cabac, ctx5, ( (h->mb.i_intra16x16_pred_mode % 2) ? 1 : 0 ));
+        x264_cabac_encode_decision( cb, ctx4, ( (h->mb.i_intra16x16_pred_mode / 2) ? 1 : 0 ));
+        x264_cabac_encode_decision( cb, ctx5, ( (h->mb.i_intra16x16_pred_mode % 2) ? 1 : 0 ));
     }
 }
 
-static void x264_cabac_mb_type( x264_t *h )
+static void x264_cabac_mb_type( x264_t *h, x264_cabac_t *cb )
 {
     const int i_mb_type = h->mb.i_type;
 
@@ -92,7 +92,7 @@ static void x264_cabac_mb_type( x264_t *h )
             ctx++;
         }
 
-        x264_cabac_mb_type_intra( h, i_mb_type, 3+ctx, 3+3, 3+4, 3+5, 3+6, 3+7 );
+        x264_cabac_mb_type_intra( h, cb, i_mb_type, 3+ctx, 3+3, 3+4, 3+5, 3+6, 3+7 );
     }
     else if( h->sh.i_type == SLICE_TYPE_P )
     {
@@ -101,36 +101,36 @@ static void x264_cabac_mb_type( x264_t *h )
         {
             if( h->mb.i_partition == D_16x16 )
             {
-                x264_cabac_encode_decision( &h->cabac, 14, 0 );
-                x264_cabac_encode_decision( &h->cabac, 15, 0 );
-                x264_cabac_encode_decision( &h->cabac, 16, 0 );
+                x264_cabac_encode_decision( cb, 14, 0 );
+                x264_cabac_encode_decision( cb, 15, 0 );
+                x264_cabac_encode_decision( cb, 16, 0 );
             }
             else if( h->mb.i_partition == D_16x8 )
             {
-                x264_cabac_encode_decision( &h->cabac, 14, 0 );
-                x264_cabac_encode_decision( &h->cabac, 15, 1 );
-                x264_cabac_encode_decision( &h->cabac, 17, 1 );
+                x264_cabac_encode_decision( cb, 14, 0 );
+                x264_cabac_encode_decision( cb, 15, 1 );
+                x264_cabac_encode_decision( cb, 17, 1 );
             }
             else if( h->mb.i_partition == D_8x16 )
             {
-                x264_cabac_encode_decision( &h->cabac, 14, 0 );
-                x264_cabac_encode_decision( &h->cabac, 15, 1 );
-                x264_cabac_encode_decision( &h->cabac, 17, 0 );
+                x264_cabac_encode_decision( cb, 14, 0 );
+                x264_cabac_encode_decision( cb, 15, 1 );
+                x264_cabac_encode_decision( cb, 17, 0 );
             }
         }
         else if( i_mb_type == P_8x8 )
         {
-            x264_cabac_encode_decision( &h->cabac, 14, 0 );
-            x264_cabac_encode_decision( &h->cabac, 15, 0 );
-            x264_cabac_encode_decision( &h->cabac, 16, 1 );
+            x264_cabac_encode_decision( cb, 14, 0 );
+            x264_cabac_encode_decision( cb, 15, 0 );
+            x264_cabac_encode_decision( cb, 16, 1 );
         }
         else /* intra */
         {
             /* prefix */
-            x264_cabac_encode_decision( &h->cabac, 14, 1 );
+            x264_cabac_encode_decision( cb, 14, 1 );
 
             /* suffix */
-            x264_cabac_mb_type_intra( h, i_mb_type, 17+0, 17+1, 17+2, 17+2, 17+3, 17+3 );
+            x264_cabac_mb_type_intra( h, cb, i_mb_type, 17+0, 17+1, 17+2, 17+2, 17+3, 17+3 );
         }
     }
     else if( h->sh.i_type == SLICE_TYPE_B )
@@ -147,31 +147,31 @@ static void x264_cabac_mb_type( x264_t *h )
 
         if( i_mb_type == B_DIRECT )
         {
-            x264_cabac_encode_decision( &h->cabac, 27+ctx, 0 );
+            x264_cabac_encode_decision( cb, 27+ctx, 0 );
         }
         else if( i_mb_type == B_8x8 )
         {
-            x264_cabac_encode_decision( &h->cabac, 27+ctx, 1 );
-            x264_cabac_encode_decision( &h->cabac, 27+3,   1 );
-            x264_cabac_encode_decision( &h->cabac, 27+4,   1 );
+            x264_cabac_encode_decision( cb, 27+ctx, 1 );
+            x264_cabac_encode_decision( cb, 27+3,   1 );
+            x264_cabac_encode_decision( cb, 27+4,   1 );
 
-            x264_cabac_encode_decision( &h->cabac, 27+5,   1 );
-            x264_cabac_encode_decision( &h->cabac, 27+5,   1 );
-            x264_cabac_encode_decision( &h->cabac, 27+5,   1 );
+            x264_cabac_encode_decision( cb, 27+5,   1 );
+            x264_cabac_encode_decision( cb, 27+5,   1 );
+            x264_cabac_encode_decision( cb, 27+5,   1 );
         }
         else if( IS_INTRA( i_mb_type ) )
         {
             /* prefix */
-            x264_cabac_encode_decision( &h->cabac, 27+ctx, 1 );
-            x264_cabac_encode_decision( &h->cabac, 27+3,   1 );
-            x264_cabac_encode_decision( &h->cabac, 27+4,   1 );
+            x264_cabac_encode_decision( cb, 27+ctx, 1 );
+            x264_cabac_encode_decision( cb, 27+3,   1 );
+            x264_cabac_encode_decision( cb, 27+4,   1 );
 
-            x264_cabac_encode_decision( &h->cabac, 27+5,   1 );
-            x264_cabac_encode_decision( &h->cabac, 27+5,   0 );
-            x264_cabac_encode_decision( &h->cabac, 27+5,   1 );
+            x264_cabac_encode_decision( cb, 27+5,   1 );
+            x264_cabac_encode_decision( cb, 27+5,   0 );
+            x264_cabac_encode_decision( cb, 27+5,   1 );
 
             /* suffix */
-            x264_cabac_mb_type_intra( h, i_mb_type, 32+0, 32+1, 32+2, 32+2, 32+3, 32+3 );
+            x264_cabac_mb_type_intra( h, cb, i_mb_type, 32+0, 32+1, 32+2, 32+2, 32+3, 32+3 );
         }
         else
         {
@@ -233,12 +233,12 @@ static void x264_cabac_mb_type( x264_t *h )
                     return;
             }
 
-            x264_cabac_encode_decision( &h->cabac, 27+ctx,                         i_mb_bits[idx][0] );
-            x264_cabac_encode_decision( &h->cabac, 27+3,                           i_mb_bits[idx][1] );
-            x264_cabac_encode_decision( &h->cabac, 27+(i_mb_bits[idx][1] != 0 ? 4 : 5), i_mb_bits[idx][2] );
+            x264_cabac_encode_decision( cb, 27+ctx, i_mb_bits[idx][0] );
+            x264_cabac_encode_decision( cb, 27+3,   i_mb_bits[idx][1] );
+            x264_cabac_encode_decision( cb, 27+(i_mb_bits[idx][1] != 0 ? 4 : 5), i_mb_bits[idx][2] );
             for( i = 3; i < i_mb_len[idx]; i++ )
             {
-                x264_cabac_encode_decision( &h->cabac, 27+5,                       i_mb_bits[idx][i] );
+                x264_cabac_encode_decision( cb, 27+5, i_mb_bits[idx][i] );
             }
         }
     }
@@ -248,30 +248,30 @@ static void x264_cabac_mb_type( x264_t *h )
     }
 }
 
-static void x264_cabac_mb_intra4x4_pred_mode( x264_t *h, int i_pred, int i_mode )
+static void x264_cabac_mb_intra4x4_pred_mode( x264_cabac_t *cb, int i_pred, int i_mode )
 {
     if( i_pred == i_mode )
     {
         /* b_prev_intra4x4_pred_mode */
-        x264_cabac_encode_decision( &h->cabac, 68, 1 );
+        x264_cabac_encode_decision( cb, 68, 1 );
     }
     else
     {
         /* b_prev_intra4x4_pred_mode */
-        x264_cabac_encode_decision( &h->cabac, 68, 0 );
+        x264_cabac_encode_decision( cb, 68, 0 );
         if( i_mode > i_pred  )
         {
             i_mode--;
         }
-        x264_cabac_encode_decision( &h->cabac, 69, (i_mode     )&0x01 );
-        x264_cabac_encode_decision( &h->cabac, 69, (i_mode >> 1)&0x01 );
-        x264_cabac_encode_decision( &h->cabac, 69, (i_mode >> 2)&0x01 );
+        x264_cabac_encode_decision( cb, 69, (i_mode     )&0x01 );
+        x264_cabac_encode_decision( cb, 69, (i_mode >> 1)&0x01 );
+        x264_cabac_encode_decision( cb, 69, (i_mode >> 2)&0x01 );
     }
 }
 
-static void x264_cabac_mb_intra_chroma_pred_mode( x264_t *h )
+static void x264_cabac_mb_intra_chroma_pred_mode( x264_t *h, x264_cabac_t *cb )
 {
-    const int i_mode  = h->mb.i_chroma_pred_mode;
+    const int i_mode = x264_mb_pred_mode8x8c_fix[ h->mb.i_chroma_pred_mode ];
     int       ctx = 0;
 
     /* No need to test for I4x4 or I_16x16 as cache_save handle that */
@@ -286,20 +286,20 @@ static void x264_cabac_mb_intra_chroma_pred_mode( x264_t *h )
 
     if( i_mode == 0 )
     {
-        x264_cabac_encode_decision( &h->cabac, 64 + ctx, 0 );
+        x264_cabac_encode_decision( cb, 64 + ctx, 0 );
     }
     else
     {
-        x264_cabac_encode_decision( &h->cabac, 64 + ctx, 1 );
-        x264_cabac_encode_decision( &h->cabac, 64 + 3, ( i_mode == 1 ? 0 : 1 ) );
+        x264_cabac_encode_decision( cb, 64 + ctx, 1 );
+        x264_cabac_encode_decision( cb, 64 + 3, ( i_mode == 1 ? 0 : 1 ) );
         if( i_mode > 1 )
         {
-            x264_cabac_encode_decision( &h->cabac, 64 + 3, ( i_mode == 2 ? 0 : 1 ) );
+            x264_cabac_encode_decision( cb, 64 + 3, ( i_mode == 2 ? 0 : 1 ) );
         }
     }
 }
 
-static void x264_cabac_mb_cbp_luma( x264_t *h )
+static void x264_cabac_mb_cbp_luma( x264_t *h, x264_cabac_t *cb )
 {
     /* TODO: clean up and optimize */
     int i8x8;
@@ -341,11 +341,11 @@ static void x264_cabac_mb_cbp_luma( x264_t *h )
             }
         }
 
-        x264_cabac_encode_decision( &h->cabac, 73 + ctx, (h->mb.i_cbp_luma >> i8x8)&0x01 );
+        x264_cabac_encode_decision( cb, 73 + ctx, (h->mb.i_cbp_luma >> i8x8)&0x01 );
     }
 }
 
-static void x264_cabac_mb_cbp_chroma( x264_t *h )
+static void x264_cabac_mb_cbp_chroma( x264_t *h, x264_cabac_t *cb )
 {
     int cbp_a = -1;
     int cbp_b = -1;
@@ -367,21 +367,21 @@ static void x264_cabac_mb_cbp_chroma( x264_t *h )
     if( cbp_b > 0 ) ctx += 2;
     if( h->mb.i_cbp_chroma == 0 )
     {
-        x264_cabac_encode_decision( &h->cabac, 77 + ctx, 0 );
+        x264_cabac_encode_decision( cb, 77 + ctx, 0 );
     }
     else
     {
-        x264_cabac_encode_decision( &h->cabac, 77 + ctx, 1 );
+        x264_cabac_encode_decision( cb, 77 + ctx, 1 );
 
         ctx = 4;
         if( cbp_a == 2 ) ctx++;
         if( cbp_b == 2 ) ctx += 2;
-        x264_cabac_encode_decision( &h->cabac, 77 + ctx, h->mb.i_cbp_chroma > 1 ? 1 : 0 );
+        x264_cabac_encode_decision( cb, 77 + ctx, h->mb.i_cbp_chroma > 1 ? 1 : 0 );
     }
 }
 
 /* TODO check it with != qp per mb */
-static void x264_cabac_mb_qp_delta( x264_t *h )
+static void x264_cabac_mb_qp_delta( x264_t *h, x264_cabac_t *cb )
 {
     int i_mbn_xy = h->mb.i_mb_xy - 1;
     int i_dqp = h->mb.qp[h->mb.i_mb_xy] - h->mb.i_last_qp;
@@ -397,14 +397,14 @@ static void x264_cabac_mb_qp_delta( x264_t *h )
 
     while( val > 0 )
     {
-        x264_cabac_encode_decision( &h->cabac,  60 + ctx, 1 );
+        x264_cabac_encode_decision( cb, 60 + ctx, 1 );
         if( ctx < 2 )
             ctx = 2;
         else
             ctx = 3;
         val--;
     }
-    x264_cabac_encode_decision( &h->cabac,  60 + ctx, 0 );
+    x264_cabac_encode_decision( cb, 60 + ctx, 0 );
 }
 
 void x264_cabac_mb_skip( x264_t *h, int b_skip )
@@ -426,54 +426,54 @@ void x264_cabac_mb_skip( x264_t *h, int b_skip )
         x264_cabac_encode_decision( &h->cabac, 24 + ctx, b_skip ? 1 : 0 );
 }
 
-static inline void x264_cabac_mb_sub_p_partition( x264_t *h, int i_sub )
+static inline void x264_cabac_mb_sub_p_partition( x264_cabac_t *cb, int i_sub )
 {
     if( i_sub == D_L0_8x8 )
     {
-        x264_cabac_encode_decision( &h->cabac, 21, 1 );
+        x264_cabac_encode_decision( cb, 21, 1 );
     }
     else if( i_sub == D_L0_8x4 )
     {
-        x264_cabac_encode_decision( &h->cabac, 21, 0 );
-        x264_cabac_encode_decision( &h->cabac, 22, 0 );
+        x264_cabac_encode_decision( cb, 21, 0 );
+        x264_cabac_encode_decision( cb, 22, 0 );
     }
     else if( i_sub == D_L0_4x8 )
     {
-        x264_cabac_encode_decision( &h->cabac, 21, 0 );
-        x264_cabac_encode_decision( &h->cabac, 22, 1 );
-        x264_cabac_encode_decision( &h->cabac, 23, 1 );
+        x264_cabac_encode_decision( cb, 21, 0 );
+        x264_cabac_encode_decision( cb, 22, 1 );
+        x264_cabac_encode_decision( cb, 23, 1 );
     }
     else if( i_sub == D_L0_4x4 )
     {
-        x264_cabac_encode_decision( &h->cabac, 21, 0 );
-        x264_cabac_encode_decision( &h->cabac, 22, 1 );
-        x264_cabac_encode_decision( &h->cabac, 23, 0 );
+        x264_cabac_encode_decision( cb, 21, 0 );
+        x264_cabac_encode_decision( cb, 22, 1 );
+        x264_cabac_encode_decision( cb, 23, 0 );
     }
 }
 
-static inline void x264_cabac_mb_sub_b_partition( x264_t *h, int i_sub )
+static inline void x264_cabac_mb_sub_b_partition( x264_cabac_t *cb, int i_sub )
 {
 #define WRITE_SUB_3(a,b,c) {\
-        x264_cabac_encode_decision( &h->cabac, 36, a );\
-        x264_cabac_encode_decision( &h->cabac, 37, b );\
-        x264_cabac_encode_decision( &h->cabac, 39, c );\
+        x264_cabac_encode_decision( cb, 36, a );\
+        x264_cabac_encode_decision( cb, 37, b );\
+        x264_cabac_encode_decision( cb, 39, c );\
     }
 #define WRITE_SUB_5(a,b,c,d,e) {\
-        x264_cabac_encode_decision( &h->cabac, 36, a );\
-        x264_cabac_encode_decision( &h->cabac, 37, b );\
-        x264_cabac_encode_decision( &h->cabac, 38, c );\
-        x264_cabac_encode_decision( &h->cabac, 39, d );\
-        x264_cabac_encode_decision( &h->cabac, 39, e );\
+        x264_cabac_encode_decision( cb, 36, a );\
+        x264_cabac_encode_decision( cb, 37, b );\
+        x264_cabac_encode_decision( cb, 38, c );\
+        x264_cabac_encode_decision( cb, 39, d );\
+        x264_cabac_encode_decision( cb, 39, e );\
     }
 #define WRITE_SUB_6(a,b,c,d,e,f) {\
         WRITE_SUB_5(a,b,c,d,e)\
-        x264_cabac_encode_decision( &h->cabac, 39, f );\
+        x264_cabac_encode_decision( cb, 39, f );\
     }
 
     switch( i_sub )
     {
         case D_DIRECT_8x8:
-            x264_cabac_encode_decision( &h->cabac, 36, 0 );
+            x264_cabac_encode_decision( cb, 36, 0 );
             break;
         case D_L0_8x8: WRITE_SUB_3(1,0,0); break;
         case D_L1_8x8: WRITE_SUB_3(1,0,1); break;
@@ -490,13 +490,13 @@ static inline void x264_cabac_mb_sub_b_partition( x264_t *h, int i_sub )
     }
 }
 
-static inline void x264_cabac_mb_transform_size( x264_t *h )
+static inline void x264_cabac_mb_transform_size( x264_t *h, x264_cabac_t *cb )
 {
     int ctx = 399 + h->mb.cache.i_neighbour_transform_size;
-    x264_cabac_encode_decision( &h->cabac, ctx, h->mb.b_transform_8x8 );
+    x264_cabac_encode_decision( cb, ctx, h->mb.b_transform_8x8 );
 }
 
-static inline void x264_cabac_mb_ref( x264_t *h, int i_list, int idx )
+static inline void x264_cabac_mb_ref( x264_t *h, x264_cabac_t *cb, int i_list, int idx )
 {
     const int i8 = x264_scan8[idx];
     const int i_refa = h->mb.cache.ref[i_list][i8 - 1];
@@ -511,7 +511,7 @@ static inline void x264_cabac_mb_ref( x264_t *h, int i_list, int idx )
 
     while( i_ref > 0 )
     {
-        x264_cabac_encode_decision( &h->cabac, 54 + ctx, 1 );
+        x264_cabac_encode_decision( cb, 54 + ctx, 1 );
         if( ctx < 4 )
             ctx = 4;
         else
@@ -519,12 +519,12 @@ static inline void x264_cabac_mb_ref( x264_t *h, int i_list, int idx )
 
         i_ref--;
     }
-    x264_cabac_encode_decision( &h->cabac, 54 + ctx, 0 );
+    x264_cabac_encode_decision( cb, 54 + ctx, 0 );
 }
 
 
-static inline void  x264_cabac_mb_mvd_cpn( x264_t *h, int i_list, int idx, int l, int mvd )
+static inline void  x264_cabac_mb_mvd_cpn( x264_t *h, x264_cabac_t *cb, int i_list, int idx, int l, int mvd )
 {
     const int amvd = abs( h->mb.cache.mvd[i_list][x264_scan8[idx] - 1][l] ) +
                      abs( h->mb.cache.mvd[i_list][x264_scan8[idx] - 8][l] );
@@ -544,7 +544,7 @@ static inline void  x264_cabac_mb_mvd_cpn( x264_t *h, int i_list, int idx, int l
 
     for( i = 0; i < i_prefix; i++ )
     {
-        x264_cabac_encode_decision( &h->cabac, ctxbase + ctx, 1 );
+        x264_cabac_encode_decision( cb, ctxbase + ctx, 1 );
         if( ctx < 3 )
             ctx = 3;
         else if( ctx < 6 )
@@ -552,7 +552,7 @@ static inline void  x264_cabac_mb_mvd_cpn( x264_t *h, int i_list, int idx, int l
     }
     if( i_prefix < 9 )
     {
-        x264_cabac_encode_decision( &h->cabac, ctxbase + ctx, 0 );
+        x264_cabac_encode_decision( cb, ctxbase + ctx, 0 );
     }
 
     if( i_prefix >= 9 )
@@ -562,25 +562,25 @@ static inline void  x264_cabac_mb_mvd_cpn( x264_t *h, int i_list, int idx, int l
 
         while( i_suffix >= (1<<k) )
         {
-            x264_cabac_encode_bypass( &h->cabac, 1 );
+            x264_cabac_encode_bypass( cb, 1 );
             i_suffix -= 1 << k;
             k++;
         }
-        x264_cabac_encode_bypass( &h->cabac, 0 );
+        x264_cabac_encode_bypass( cb, 0 );
         while( k-- )
         {
-            x264_cabac_encode_bypass( &h->cabac, (i_suffix >> k)&0x01 );
+            x264_cabac_encode_bypass( cb, (i_suffix >> k)&0x01 );
         }
     }
 
     /* sign */
     if( mvd > 0 )
-        x264_cabac_encode_bypass( &h->cabac, 0 );
+        x264_cabac_encode_bypass( cb, 0 );
     else if( mvd < 0 )
-        x264_cabac_encode_bypass( &h->cabac, 1 );
+        x264_cabac_encode_bypass( cb, 1 );
 }
 
-static inline void  x264_cabac_mb_mvd( x264_t *h, int i_list, int idx, int width, int height )
+static inline void x264_cabac_mb_mvd( x264_t *h, x264_cabac_t *cb, int i_list, int idx, int width, int height )
 {
     int mvp[2];
     int mdx, mdy;
@@ -591,14 +591,14 @@ static inline void  x264_cabac_mb_mvd( x264_t *h, int i_list, int idx, int width
     mdy = h->mb.cache.mv[i_list][x264_scan8[idx]][1] - mvp[1];
 
     /* encode */
-    x264_cabac_mb_mvd_cpn( h, i_list, idx, 0, mdx );
-    x264_cabac_mb_mvd_cpn( h, i_list, idx, 1, mdy );
+    x264_cabac_mb_mvd_cpn( h, cb, i_list, idx, 0, mdx );
+    x264_cabac_mb_mvd_cpn( h, cb, i_list, idx, 1, mdy );
 
     /* save value */
     x264_macroblock_cache_mvd( h, block_idx_x[idx], block_idx_y[idx], width, height, i_list, mdx, mdy );
 }
 
-static inline void x264_cabac_mb8x8_mvd( x264_t *h, int i_list )
+static inline void x264_cabac_mb8x8_mvd( x264_t *h, x264_cabac_t *cb, int i_list )
 {
     int i;
     for( i = 0; i < 4; i++ )
@@ -613,27 +613,27 @@ static inline void x264_cabac_mb8x8_mvd( x264_t *h, int i_list )
             case D_L0_8x8:
             case D_L1_8x8:
             case D_BI_8x8:
-                x264_cabac_mb_mvd( h, i_list, 4*i, 2, 2 );
+                x264_cabac_mb_mvd( h, cb, i_list, 4*i, 2, 2 );
                 break;
             case D_L0_8x4:
             case D_L1_8x4:
             case D_BI_8x4:
-                x264_cabac_mb_mvd( h, i_list, 4*i+0, 2, 1 );
-                x264_cabac_mb_mvd( h, i_list, 4*i+2, 2, 1 );
+                x264_cabac_mb_mvd( h, cb, i_list, 4*i+0, 2, 1 );
+                x264_cabac_mb_mvd( h, cb, i_list, 4*i+2, 2, 1 );
                 break;
             case D_L0_4x8:
             case D_L1_4x8:
             case D_BI_4x8:
-                x264_cabac_mb_mvd( h, i_list, 4*i+0, 1, 2 );
-                x264_cabac_mb_mvd( h, i_list, 4*i+1, 1, 2 );
+                x264_cabac_mb_mvd( h, cb, i_list, 4*i+0, 1, 2 );
+                x264_cabac_mb_mvd( h, cb, i_list, 4*i+1, 1, 2 );
                 break;
             case D_L0_4x4:
             case D_L1_4x4:
             case D_BI_4x4:
-                x264_cabac_mb_mvd( h, i_list, 4*i+0, 1, 1 );
-                x264_cabac_mb_mvd( h, i_list, 4*i+1, 1, 1 );
-                x264_cabac_mb_mvd( h, i_list, 4*i+2, 1, 1 );
-                x264_cabac_mb_mvd( h, i_list, 4*i+3, 1, 1 );
+                x264_cabac_mb_mvd( h, cb, i_list, 4*i+0, 1, 1 );
+                x264_cabac_mb_mvd( h, cb, i_list, 4*i+1, 1, 1 );
+                x264_cabac_mb_mvd( h, cb, i_list, 4*i+2, 1, 1 );
+                x264_cabac_mb_mvd( h, cb, i_list, 4*i+3, 1, 1 );
                 break;
         }
     }
@@ -758,7 +758,7 @@ static int x264_cabac_mb_cbf_ctxidxinc( x264_t *h, int i_cat, int i_idx )
 }
 
 
-static void block_residual_write_cabac( x264_t *h, int i_ctxBlockCat, int i_idx, int *l, int i_count )
+static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int i_ctxBlockCat, int i_idx, int *l, int i_count )
 {
     static const int significant_coeff_flag_offset[6] = { 0, 15, 29, 44, 47, 297 };
     static const int last_significant_coeff_flag_offset[6] = { 0, 15, 29, 44, 47, 251 };
@@ -809,7 +809,7 @@ static void block_residual_write_cabac( x264_t *h, int i_ctxBlockCat, int i_idx,
     if( i_count != 64 )
     {
         /* coded block flag */
-        x264_cabac_encode_decision( &h->cabac, 85 + x264_cabac_mb_cbf_ctxidxinc( h, i_ctxBlockCat, i_idx ), i_coeff != 0 );
+        x264_cabac_encode_decision( cb, 85 + x264_cabac_mb_cbf_ctxidxinc( h, i_ctxBlockCat, i_idx ), i_coeff != 0 );
         if( i_coeff == 0 )
             return;
     }
@@ -829,12 +829,12 @@ static void block_residual_write_cabac( x264_t *h, int i_ctxBlockCat, int i_idx,
 
         if( l[i] != 0 )
         {
-            x264_cabac_encode_decision( &h->cabac, 105 + significant_coeff_flag_offset[i_ctxBlockCat] + i_sig_ctxIdxInc, 1 );
-            x264_cabac_encode_decision( &h->cabac, 166 + last_significant_coeff_flag_offset[i_ctxBlockCat] + i_last_ctxIdxInc, i == i_last ? 1 : 0 );
+            x264_cabac_encode_decision( cb, 105 + significant_coeff_flag_offset[i_ctxBlockCat] + i_sig_ctxIdxInc, 1 );
+            x264_cabac_encode_decision( cb, 166 + last_significant_coeff_flag_offset[i_ctxBlockCat] + i_last_ctxIdxInc, i == i_last ? 1 : 0 );
         }
         else
         {
-            x264_cabac_encode_decision( &h->cabac, 105 + significant_coeff_flag_offset[i_ctxBlockCat] + i_sig_ctxIdxInc, 0 );
+            x264_cabac_encode_decision( cb, 105 + significant_coeff_flag_offset[i_ctxBlockCat] + i_sig_ctxIdxInc, 0 );
         }
         if( i == i_last )
         {
@@ -855,17 +855,17 @@ static void block_residual_write_cabac( x264_t *h, int i_ctxBlockCat, int i_idx,
         i_ctxIdxInc = (i_abslevelgt1 != 0 ? 0 : X264_MIN( 4, i_abslevel1 + 1 )) + coeff_abs_level_m1_offset[i_ctxBlockCat];
         if( i_prefix == 0 )
         {
-            x264_cabac_encode_decision( &h->cabac,  227 + i_ctxIdxInc, 0 );
+            x264_cabac_encode_decision( cb, 227 + i_ctxIdxInc, 0 );
         }
         else
         {
             int j;
-            x264_cabac_encode_decision( &h->cabac,  227 + i_ctxIdxInc, 1 );
+            x264_cabac_encode_decision( cb, 227 + i_ctxIdxInc, 1 );
             i_ctxIdxInc = 5 + X264_MIN( 4, i_abslevelgt1 ) + coeff_abs_level_m1_offset[i_ctxBlockCat];
             for( j = 0; j < i_prefix - 1; j++ )
-                x264_cabac_encode_decision( &h->cabac,  227 + i_ctxIdxInc, 1 );
+                x264_cabac_encode_decision( cb, 227 + i_ctxIdxInc, 1 );
             if( i_prefix < 14 )
-                x264_cabac_encode_decision( &h->cabac,  227 + i_ctxIdxInc, 0 );
+                x264_cabac_encode_decision( cb, 227 + i_ctxIdxInc, 0 );
         }
         /* suffix */
         if( i_coeff_abs_m1[i] >= 14 )
@@ -875,17 +875,17 @@ static void block_residual_write_cabac( x264_t *h, int i_ctxBlockCat, int i_idx,
 
             while( i_suffix >= (1<<k) )
             {
-                x264_cabac_encode_bypass( &h->cabac, 1 );
+                x264_cabac_encode_bypass( cb, 1 );
                 i_suffix -= 1 << k;
                 k++;
             }
-            x264_cabac_encode_bypass( &h->cabac, 0 );
+            x264_cabac_encode_bypass( cb, 0 );
             while( k-- )
-                x264_cabac_encode_bypass( &h->cabac, (i_suffix >> k)&0x01 );
+                x264_cabac_encode_bypass( cb, (i_suffix >> k)&0x01 );
         }
 
         /* write sign */
-        x264_cabac_encode_bypass( &h->cabac, i_coeff_sign[i] );
+        x264_cabac_encode_bypass( cb, i_coeff_sign[i] );
 
         if( i_coeff_abs_m1[i] == 0 )
             i_abslevel1++;
@@ -896,17 +896,19 @@ static void block_residual_write_cabac( x264_t *h, int i_ctxBlockCat, int i_idx,
 
 
-void x264_macroblock_write_cabac( x264_t *h, bs_t *s )
+void x264_macroblock_write_cabac( x264_t *h, x264_cabac_t *cb )
 {
+    bs_t *s = cb->s;
     const int i_mb_type = h->mb.i_type;
-    const int i_mb_pos_start = bs_pos( s );
-    int       i_mb_pos_tex;
+    const int i_mb_pos_start = x264_cabac_pos( cb );
+    int       i_mb_pos_tex = 0;
+    const int b_update_stats = (cb == &h->cabac);
 
     int i_list;
     int i;
 
     /* Write the MB type */
-    x264_cabac_mb_type( h );
+    x264_cabac_mb_type( h, cb );
 
     /* PCM special block type UNTESTED */
     if( i_mb_type == I_PCM )
@@ -933,14 +935,14 @@ void x264_macroblock_write_cabac( x264_t *h, bs_t *s )
             const int y = 8 * h->mb.i_mb_y + (i / 8);
             bs_write( s, 8, h->fenc->plane[2][y*h->mb.pic.i_stride[2]+x] );
         }
-        x264_cabac_encode_init( &h->cabac, s );
+        x264_cabac_encode_init( cb, s );
         return;
     }
 
     if( IS_INTRA( i_mb_type ) )
     {
         if( h->pps->b_transform_8x8_mode && i_mb_type != I_16x16 )
-            x264_cabac_mb_transform_size( h );
+            x264_cabac_mb_transform_size( h, cb );
 
         if( i_mb_type != I_16x16 )
         {
@@ -949,11 +951,11 @@ void x264_macroblock_write_cabac( x264_t *h, bs_t *s )
             {
                 const int i_pred = x264_mb_predict_intra4x4_mode( h, i );
                 const int i_mode = h->mb.cache.intra4x4_pred_mode[x264_scan8[i]];
-                x264_cabac_mb_intra4x4_pred_mode( h, i_pred, i_mode );
+                x264_cabac_mb_intra4x4_pred_mode( cb, i_pred, i_mode );
             }
         }
 
-        x264_cabac_mb_intra_chroma_pred_mode( h );
+        x264_cabac_mb_intra_chroma_pred_mode( h, cb );
     }
     else if( i_mb_type == P_L0 )
     {
@@ -961,57 +963,57 @@ void x264_macroblock_write_cabac( x264_t *h, bs_t *s )
         {
             if( h->sh.i_num_ref_idx_l0_active > 1 )
             {
-                x264_cabac_mb_ref( h, 0, 0 );
+                x264_cabac_mb_ref( h, cb, 0, 0 );
             }
-            x264_cabac_mb_mvd( h, 0, 0, 4, 4 );
+            x264_cabac_mb_mvd( h, cb, 0, 0, 4, 4 );
         }
         else if( h->mb.i_partition == D_16x8 )
         {
             if( h->sh.i_num_ref_idx_l0_active > 1 )
             {
-                x264_cabac_mb_ref( h, 0, 0 );
-                x264_cabac_mb_ref( h, 0, 8 );
+                x264_cabac_mb_ref( h, cb, 0, 0 );
+                x264_cabac_mb_ref( h, cb, 0, 8 );
             }
-            x264_cabac_mb_mvd( h, 0, 0, 4, 2 );
-            x264_cabac_mb_mvd( h, 0, 8, 4, 2 );
+            x264_cabac_mb_mvd( h, cb, 0, 0, 4, 2 );
+            x264_cabac_mb_mvd( h, cb, 0, 8, 4, 2 );
         }
         else if( h->mb.i_partition == D_8x16 )
         {
             if( h->sh.i_num_ref_idx_l0_active > 1 )
             {
-                x264_cabac_mb_ref( h, 0, 0 );
-                x264_cabac_mb_ref( h, 0, 4 );
+                x264_cabac_mb_ref( h, cb, 0, 0 );
+                x264_cabac_mb_ref( h, cb, 0, 4 );
             }
-            x264_cabac_mb_mvd( h, 0, 0, 2, 4 );
-            x264_cabac_mb_mvd( h, 0, 4, 2, 4 );
+            x264_cabac_mb_mvd( h, cb, 0, 0, 2, 4 );
+            x264_cabac_mb_mvd( h, cb, 0, 4, 2, 4 );
         }
     }
     else if( i_mb_type == P_8x8 )
     {
         /* sub mb type */
-        x264_cabac_mb_sub_p_partition( h, h->mb.i_sub_partition[0] );
-        x264_cabac_mb_sub_p_partition( h, h->mb.i_sub_partition[1] );
-        x264_cabac_mb_sub_p_partition( h, h->mb.i_sub_partition[2] );
-        x264_cabac_mb_sub_p_partition( h, h->mb.i_sub_partition[3] );
+        x264_cabac_mb_sub_p_partition( cb, h->mb.i_sub_partition[0] );
+        x264_cabac_mb_sub_p_partition( cb, h->mb.i_sub_partition[1] );
+        x264_cabac_mb_sub_p_partition( cb, h->mb.i_sub_partition[2] );
+        x264_cabac_mb_sub_p_partition( cb, h->mb.i_sub_partition[3] );
 
         /* ref 0 */
         if( h->sh.i_num_ref_idx_l0_active > 1 )
         {
-            x264_cabac_mb_ref( h, 0, 0 );
-            x264_cabac_mb_ref( h, 0, 4 );
-            x264_cabac_mb_ref( h, 0, 8 );
-            x264_cabac_mb_ref( h, 0, 12 );
+            x264_cabac_mb_ref( h, cb, 0, 0 );
+            x264_cabac_mb_ref( h, cb, 0, 4 );
+            x264_cabac_mb_ref( h, cb, 0, 8 );
+            x264_cabac_mb_ref( h, cb, 0, 12 );
         }
 
-        x264_cabac_mb8x8_mvd( h, 0 );
+        x264_cabac_mb8x8_mvd( h, cb, 0 );
     }
     else if( i_mb_type == B_8x8 )
     {
         /* sub mb type */
-        x264_cabac_mb_sub_b_partition( h, h->mb.i_sub_partition[0] );
-        x264_cabac_mb_sub_b_partition( h, h->mb.i_sub_partition[1] );
-        x264_cabac_mb_sub_b_partition( h, h->mb.i_sub_partition[2] );
-        x264_cabac_mb_sub_b_partition( h, h->mb.i_sub_partition[3] );
+        x264_cabac_mb_sub_b_partition( cb, h->mb.i_sub_partition[0] );
+        x264_cabac_mb_sub_b_partition( cb, h->mb.i_sub_partition[1] );
+        x264_cabac_mb_sub_b_partition( cb, h->mb.i_sub_partition[2] );
+        x264_cabac_mb_sub_b_partition( cb, h->mb.i_sub_partition[3] );
 
         /* ref */
         for( i_list = 0; i_list < 2; i_list++ )
@@ -1020,11 +1022,11 @@ void x264_macroblock_write_cabac( x264_t *h, bs_t *s )
                 continue;
             for( i = 0; i < 4; i++ )
                 if( x264_mb_partition_listX_table[i_list][ h->mb.i_sub_partition[i] ] )
-                    x264_cabac_mb_ref( h, i_list, 4*i );
+                    x264_cabac_mb_ref( h, cb, i_list, 4*i );
         }
 
-        x264_cabac_mb8x8_mvd( h, 0 );
-        x264_cabac_mb8x8_mvd( h, 1 );
+        x264_cabac_mb8x8_mvd( h, cb, 0 );
+        x264_cabac_mb8x8_mvd( h, cb, 1 );
     }
     else if( i_mb_type != B_DIRECT )
     {
@@ -1046,17 +1048,17 @@ void x264_macroblock_write_cabac( x264_t *h, bs_t *s )
             {
                 if( h->mb.i_partition == D_16x16 )
                 {
-                    if( b_list[i_list][0] ) x264_cabac_mb_ref( h, i_list, 0 );
+                    if( b_list[i_list][0] ) x264_cabac_mb_ref( h, cb, i_list, 0 );
                 }
                 else if( h->mb.i_partition == D_16x8 )
                 {
-                    if( b_list[i_list][0] ) x264_cabac_mb_ref( h, i_list, 0 );
-                    if( b_list[i_list][1] ) x264_cabac_mb_ref( h, i_list, 8 );
+                    if( b_list[i_list][0] ) x264_cabac_mb_ref( h, cb, i_list, 0 );
+                    if( b_list[i_list][1] ) x264_cabac_mb_ref( h, cb, i_list, 8 );
                 }
                 else if( h->mb.i_partition == D_8x16 )
                 {
-                    if( b_list[i_list][0] ) x264_cabac_mb_ref( h, i_list, 0 );
-                    if( b_list[i_list][1] ) x264_cabac_mb_ref( h, i_list, 4 );
+                    if( b_list[i_list][0] ) x264_cabac_mb_ref( h, cb, i_list, 0 );
+                    if( b_list[i_list][1] ) x264_cabac_mb_ref( h, cb, i_list, 4 );
                 }
             }
         }
@@ -1064,78 +1066,84 @@ void x264_macroblock_write_cabac( x264_t *h, bs_t *s )
         {
             if( h->mb.i_partition == D_16x16 )
             {
-                if( b_list[i_list][0] ) x264_cabac_mb_mvd( h, i_list, 0, 4, 4 );
+                if( b_list[i_list][0] ) x264_cabac_mb_mvd( h, cb, i_list, 0, 4, 4 );
             }
             else if( h->mb.i_partition == D_16x8 )
             {
-                if( b_list[i_list][0] ) x264_cabac_mb_mvd( h, i_list, 0, 4, 2 );
-                if( b_list[i_list][1] ) x264_cabac_mb_mvd( h, i_list, 8, 4, 2 );
+                if( b_list[i_list][0] ) x264_cabac_mb_mvd( h, cb, i_list, 0, 4, 2 );
+                if( b_list[i_list][1] ) x264_cabac_mb_mvd( h, cb, i_list, 8, 4, 2 );
             }
             else if( h->mb.i_partition == D_8x16 )
             {
-                if( b_list[i_list][0] ) x264_cabac_mb_mvd( h, i_list, 0, 2, 4 );
-                if( b_list[i_list][1] ) x264_cabac_mb_mvd( h, i_list, 4, 2, 4 );
+                if( b_list[i_list][0] ) x264_cabac_mb_mvd( h, cb, i_list, 0, 2, 4 );
+                if( b_list[i_list][1] ) x264_cabac_mb_mvd( h, cb, i_list, 4, 2, 4 );
             }
         }
     }
 
-    i_mb_pos_tex = bs_pos( s );
-    h->stat.frame.i_hdr_bits += i_mb_pos_tex - i_mb_pos_start;
+    if( b_update_stats )
+    {
+        i_mb_pos_tex = x264_cabac_pos( cb );
+        h->stat.frame.i_hdr_bits += i_mb_pos_tex - i_mb_pos_start;
+    }
 
     if( i_mb_type != I_16x16 )
     {
-        x264_cabac_mb_cbp_luma( h );
-        x264_cabac_mb_cbp_chroma( h );
+        x264_cabac_mb_cbp_luma( h, cb );
+        x264_cabac_mb_cbp_chroma( h, cb );
     }
 
     if( h->mb.cache.b_transform_8x8_allowed && h->mb.i_cbp_luma && !IS_INTRA(i_mb_type) )
     {
-        x264_cabac_mb_transform_size( h );
+        x264_cabac_mb_transform_size( h, cb );
     }
 
     if( h->mb.i_cbp_luma > 0 || h->mb.i_cbp_chroma > 0 || i_mb_type == I_16x16 )
     {
-        x264_cabac_mb_qp_delta( h );
+        x264_cabac_mb_qp_delta( h, cb );
 
         /* write residual */
         if( i_mb_type == I_16x16 )
         {
             /* DC Luma */
-            block_residual_write_cabac( h, 0, 0, h->dct.luma16x16_dc, 16 );
+            block_residual_write_cabac( h, cb, 0, 0, h->dct.luma16x16_dc, 16 );
 
             /* AC Luma */
             if( h->mb.i_cbp_luma != 0 )
                 for( i = 0; i < 16; i++ )
-                    block_residual_write_cabac( h, 1, i, h->dct.block[i].residual_ac, 15 );
+                    block_residual_write_cabac( h, cb, 1, i, h->dct.block[i].residual_ac, 15 );
         }
         else if( h->mb.b_transform_8x8 )
         {
             for( i = 0; i < 4; i++ )
                 if( h->mb.i_cbp_luma & ( 1 << i ) )
-                    block_residual_write_cabac( h, 5, i, h->dct.luma8x8[i], 64 );
+                    block_residual_write_cabac( h, cb, 5, i, h->dct.luma8x8[i], 64 );
         }
         else
         {
             for( i = 0; i < 16; i++ )
                 if( h->mb.i_cbp_luma & ( 1 << ( i / 4 ) ) )
-                    block_residual_write_cabac( h, 2, i, h->dct.block[i].luma4x4, 16 );
+                    block_residual_write_cabac( h, cb, 2, i, h->dct.block[i].luma4x4, 16 );
         }
 
         if( h->mb.i_cbp_chroma &0x03 )    /* Chroma DC residual present */
         {
-            block_residual_write_cabac( h, 3, 0, h->dct.chroma_dc[0], 4 );
-            block_residual_write_cabac( h, 3, 1, h->dct.chroma_dc[1], 4 );
+            block_residual_write_cabac( h, cb, 3, 0, h->dct.chroma_dc[0], 4 );
+            block_residual_write_cabac( h, cb, 3, 1, h->dct.chroma_dc[1], 4 );
         }
         if( h->mb.i_cbp_chroma&0x02 ) /* Chroma AC residual present */
         {
             for( i = 0; i < 8; i++ )
-                block_residual_write_cabac( h, 4, i, h->dct.block[16+i].residual_ac, 15 );
+                block_residual_write_cabac( h, cb, 4, i, h->dct.block[16+i].residual_ac, 15 );
         }
     }
 
-    if( IS_INTRA( i_mb_type ) )
-        h->stat.frame.i_itex_bits += bs_pos(s) - i_mb_pos_tex;
-    else
-        h->stat.frame.i_ptex_bits += bs_pos(s) - i_mb_pos_tex;
+    if( b_update_stats )
+    {
+        if( IS_INTRA( i_mb_type ) )
+            h->stat.frame.i_itex_bits += x264_cabac_pos( cb ) - i_mb_pos_tex;
+        else
+            h->stat.frame.i_ptex_bits += x264_cabac_pos( cb ) - i_mb_pos_tex;
+    }
 }
 
diff --git a/encoder/cavlc.c b/encoder/cavlc.c
index 31cbd906..dc442f09 100644
--- a/encoder/cavlc.c
+++ b/encoder/cavlc.c
@@ -450,13 +450,13 @@ void x264_macroblock_write_cavlc( x264_t *h, bs_t *s )
                 }
             }
         }
-        bs_write_ue( s, h->mb.i_chroma_pred_mode );
+        bs_write_ue( s, x264_mb_pred_mode8x8c_fix[ h->mb.i_chroma_pred_mode ] );
     }
     else if( i_mb_type == I_16x16 )
     {
         bs_write_ue( s, i_mb_i_offset + 1 + h->mb.i_intra16x16_pred_mode +
                         h->mb.i_cbp_chroma * 4 + ( h->mb.i_cbp_luma == 0 ? 0 : 12 ) );
-        bs_write_ue( s, h->mb.i_chroma_pred_mode );
+        bs_write_ue( s, x264_mb_pred_mode8x8c_fix[ h->mb.i_chroma_pred_mode ] );
     }
     else if( i_mb_type == P_L0 )
     {
diff --git a/encoder/encoder.c b/encoder/encoder.c
index a69ea666..7456df65 100644
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -412,7 +412,7 @@ static int x264_validate_parameters( x264_t *h )
         h->param.analyse.i_me_range = 4;
     if( h->param.analyse.i_me_range > 16 && h->param.analyse.i_me_method <= X264_ME_HEX )
         h->param.analyse.i_me_range = 16;
-    h->param.analyse.i_subpel_refine = x264_clip3( h->param.analyse.i_subpel_refine, 1, 5 );
+    h->param.analyse.i_subpel_refine = x264_clip3( h->param.analyse.i_subpel_refine, 1, 6 );
     if( !(h->param.analyse.inter & X264_ANALYSE_PSUB16x16) )
         h->param.analyse.inter &= ~X264_ANALYSE_PSUB8x8;
     if( !h->param.analyse.b_transform_8x8 )
@@ -944,7 +944,7 @@ static int x264_slice_write( x264_t *h )
             {
                 if( h->sh.i_type != SLICE_TYPE_I )
                     x264_cabac_mb_skip( h, 0 );
-                x264_macroblock_write_cabac( h, &h->out.bs );
+                x264_macroblock_write_cabac( h, &h->cabac );
             }
         }
         else
diff --git a/encoder/macroblock.c b/encoder/macroblock.c
index 6768dac4..e47d0827 100644
--- a/encoder/macroblock.c
+++ b/encoder/macroblock.c
@@ -587,7 +587,7 @@ void x264_macroblock_encode_pskip( x264_t *h )
 void x264_macroblock_encode( x264_t *h )
 {
     int i_cbp_dc = 0;
-    int i_qscale;
+    int i_qp = h->mb.i_qp;
     int i;
 
     if( h->mb.i_type == P_SKIP )
@@ -604,23 +604,22 @@ void x264_macroblock_encode( x264_t *h )
         return;
     }
 
-    /* quantification scale */
-    i_qscale = h->mb.qp[h->mb.i_mb_xy];
-
     if( h->mb.i_type == I_16x16 )
     {
         const int i_mode = h->mb.i_intra16x16_pred_mode;
+        h->mb.b_transform_8x8 = 0;
         /* do the right prediction */
         h->predict_16x16[i_mode]( h->mb.pic.p_fdec[0], h->mb.pic.i_stride[0] );
 
         /* encode the 16x16 macroblock */
-        x264_mb_encode_i16x16( h, i_qscale );
+        x264_mb_encode_i16x16( h, i_qp );
 
         /* fix the pred mode value */
         h->mb.i_intra16x16_pred_mode = x264_mb_pred_mode16x16_fix[i_mode];
     }
     else if( h->mb.i_type == I_8x8 )
     {
+        h->mb.b_transform_8x8 = 1;
         for( i = 0; i < 4; i++ )
         {
             const int i_dst = h->mb.pic.i_stride[0];
@@ -628,12 +627,13 @@ void x264_macroblock_encode( x264_t *h )
             int      i_mode = h->mb.cache.intra4x4_pred_mode[x264_scan8[4*i]];
 
             h->predict_8x8[i_mode]( p_dst, i_dst, h->mb.i_neighbour8[i] );
-            x264_mb_encode_i8x8( h, i, i_qscale );
+            x264_mb_encode_i8x8( h, i, i_qp );
             h->mb.cache.intra4x4_pred_mode[x264_scan8[4*i]] = x264_mb_pred_mode4x4_fix(i_mode);
         }
     }
     else if( h->mb.i_type == I_4x4 )
     {
+        h->mb.b_transform_8x8 = 0;
         for( i = 0; i < 16; i++ )
         {
             const int i_dst = h->mb.pic.i_stride[0];
@@ -641,7 +641,7 @@ void x264_macroblock_encode( x264_t *h )
             int      i_mode = h->mb.cache.intra4x4_pred_mode[x264_scan8[i]];
 
             h->predict_4x4[i_mode]( p_dst, i_dst );
-            x264_mb_encode_i4x4( h, i, i_qscale );
+            x264_mb_encode_i4x4( h, i, i_qp );
             h->mb.cache.intra4x4_pred_mode[x264_scan8[i]] = x264_mb_pred_mode4x4_fix(i_mode);
         }
     }
@@ -664,9 +664,9 @@ void x264_macroblock_encode( x264_t *h )
             {
                 int i_decimate_8x8;
 
-                quant_8x8( dct8x8[idx], i_qscale, 0 );
+                quant_8x8( dct8x8[idx], i_qp, 0 );
                 scan_zigzag_8x8full( h->dct.luma8x8[idx], dct8x8[idx] );
-                x264_mb_dequant_8x8( dct8x8[idx], i_qscale );
+                x264_mb_dequant_8x8( dct8x8[idx], i_qp );
 
                 i_decimate_8x8 = x264_mb_decimate_score( h->dct.luma8x8[idx], 64 );
                 i_decimate_mb += i_decimate_8x8;
@@ -699,9 +699,9 @@ void x264_macroblock_encode( x264_t *h )
                 {
                     idx = i8x8 * 4 + i4x4;
 
-                    quant_4x4( dct4x4[idx], i_qscale, 0 );
+                    quant_4x4( dct4x4[idx], i_qp, 0 );
                     scan_zigzag_4x4full( h->dct.block[idx].luma4x4, dct4x4[idx] );
-                    x264_mb_dequant_4x4( dct4x4[idx], i_qscale );
+                    x264_mb_dequant_4x4( dct4x4[idx], i_qp );
 
                     i_decimate_8x8 += x264_mb_decimate_score( h->dct.block[idx].luma4x4, 16 );
                 }
@@ -733,20 +733,16 @@ void x264_macroblock_encode( x264_t *h )
     }
 
     /* encode chroma */
-    i_qscale = i_chroma_qp_table[x264_clip3( i_qscale + h->pps->i_chroma_qp_index_offset, 0, 51 )];
+    i_qp = i_chroma_qp_table[x264_clip3( i_qp + h->pps->i_chroma_qp_index_offset, 0, 51 )];
     if( IS_INTRA( h->mb.i_type ) )
     {
         const int i_mode = h->mb.i_chroma_pred_mode;
-        /* do the right prediction */
         h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1], h->mb.pic.i_stride[1] );
         h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2], h->mb.pic.i_stride[2] );
-
-        /* fix the pred mode value */
-        h->mb.i_chroma_pred_mode = x264_mb_pred_mode8x8c_fix[i_mode];
     }
 
     /* encode the 8x8 blocks */
-    x264_mb_encode_8x8_chroma( h, !IS_INTRA( h->mb.i_type ), i_qscale );
+    x264_mb_encode_8x8_chroma( h, !IS_INTRA( h->mb.i_type ), i_qp );
 
     /* Calculate the Luma/Chroma patern and non_zero_count */
     h->mb.i_cbp_luma = 0x00;
@@ -841,7 +837,7 @@ void x264_macroblock_encode( x264_t *h )
         {
             h->mb.i_type = P_SKIP;
             h->mb.qp[h->mb.i_mb_xy] = h->mb.i_last_qp;  /* Needed */
-            /* XXX qp reset may have issues when used in RD instead of the real encode*/
+            /* XXX qp reset may have issues when used in RD instead of the real encode */
         }
     }
 
@@ -868,16 +864,13 @@ int x264_macroblock_probe_skip( x264_t *h, int b_bidir )
     DECLARE_ALIGNED( int16_t, dct2x2[2][2], 16 );
     DECLARE_ALIGNED( int,     dctscan[16], 16 );
 
-    int i_qp;
+    int i_qp = h->mb.i_qp;
     int mvp[2];
     int ch;
 
     int i8x8, i4x4;
     int i_decimate_mb;
 
-    /* quantization scale */
-    i_qp = h->mb.qp[h->mb.i_mb_xy];
-
     if( !b_bidir )
     {
         /* Get the MV */
diff --git a/encoder/macroblock.h b/encoder/macroblock.h
index a16bcf10..9f8f661c 100644
--- a/encoder/macroblock.h
+++ b/encoder/macroblock.h
@@ -34,7 +34,7 @@ static inline int x264_macroblock_probe_bskip( x264_t *h )
     { return x264_macroblock_probe_skip( h, 1 ); }
 
 void x264_macroblock_encode      ( x264_t *h );
-void x264_macroblock_write_cabac ( x264_t *h, bs_t *s );
+void x264_macroblock_write_cabac ( x264_t *h, x264_cabac_t *cb );
 void x264_macroblock_write_cavlc ( x264_t *h, bs_t *s );
 
 void x264_cabac_mb_skip( x264_t *h, int b_skip );
diff --git a/encoder/me.c b/encoder/me.c
index 147dd159..c5084048 100644
--- a/encoder/me.c
+++ b/encoder/me.c
@@ -39,7 +39,8 @@ static const int subpel_iterations[][4] =
     {1,2,0,0},
     {0,2,1,0},
     {0,2,1,1},
-    {0,2,1,2}};
+    {0,2,1,2},
+    {0,0,2,3}};
 
 static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_iters );
 
diff --git a/encoder/rdo.c b/encoder/rdo.c
new file mode 100644
index 00000000..b9902421
--- /dev/null
+++ b/encoder/rdo.c
@@ -0,0 +1,62 @@
+/*****************************************************************************
+ * rdo.c: h264 encoder library (rate-distortion optimization)
+ *****************************************************************************
+ * Copyright (C) 2005 x264 project
+ *
+ * Authors: Loren Merritt <lorenm@u.washington.edu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+static int x264_rd_cost_mb( x264_t *h, int i_lambda2 ) /* returns i_ssd + i_bits * i_lambda2 for the current macroblock */
+{
+    // back up i_type and b_transform_8x8: x264_macroblock_encode may turn the MB into a skip and may rewrite the transform flag
+    int i_type_bak = h->mb.i_type;
+    int b_transform_bak = h->mb.b_transform_8x8;
+    int i_ssd;  /* distortion: summed SSD between p_fenc and p_fdec over planes 0, 1 and 2 */
+    int i_bits; /* rate: number of bits a trial write of this macroblock takes */
+
+    x264_macroblock_encode( h ); /* must run before measuring; both the SSD and the trial write below use its results */
+
+    i_ssd = h->pixf.ssd[PIXEL_16x16]( h->mb.pic.p_fenc[0], h->mb.pic.i_stride[0],
+                                      h->mb.pic.p_fdec[0], h->mb.pic.i_stride[0] )
+          + h->pixf.ssd[PIXEL_8x8](   h->mb.pic.p_fenc[1], h->mb.pic.i_stride[1],
+                                      h->mb.pic.p_fdec[1], h->mb.pic.i_stride[1] )
+          + h->pixf.ssd[PIXEL_8x8](   h->mb.pic.p_fenc[2], h->mb.pic.i_stride[2],
+                                      h->mb.pic.p_fdec[2], h->mb.pic.i_stride[2] );
+
+    if( IS_SKIP( h->mb.i_type ) )
+    {
+        i_bits = 1; /* a skipped macroblock is costed as a flat 1 bit; nothing is written */
+    }
+    else if( h->param.b_cabac )
+    {
+        x264_cabac_t cabac_tmp = h->cabac; /* trial write goes through copies of the coder and bitstream state */
+        bs_t bs_tmp = h->out.bs;
+        cabac_tmp.s = &bs_tmp; /* point the copied coder at the copied bitstream state, not at h->out.bs */
+        x264_macroblock_write_cabac( h, &cabac_tmp ); /* cb != &h->cabac, so the writer skips its frame-stat updates */
+        i_bits = x264_cabac_pos( &cabac_tmp ) - x264_cabac_pos( &h->cabac ); /* how far the copy advanced past the real coder */
+    }
+    else
+    {
+        bs_t bs_tmp = h->out.bs; /* same idea for CAVLC: write through a copy of the bitstream state */
+        x264_macroblock_write_cavlc( h, &bs_tmp );
+        i_bits = bs_pos( &bs_tmp ) - bs_pos( &h->out.bs );
+    }
+    h->mb.i_type = i_type_bak; /* restore the two fields saved at the top */
+    h->mb.b_transform_8x8 = b_transform_bak;
+
+    return i_ssd + i_bits * i_lambda2;
+}
diff --git a/x264.c b/x264.c
index 89e22470..4b70c4b6 100644
--- a/x264.c
+++ b/x264.c
@@ -227,7 +227,8 @@ static void Help( x264_param_t *defaults )
              "                                  - umh: uneven multi-hexagon search\n"
              "                                  - esa: exhaustive search (slow)\n"
              "      --merange <integer>     Maximum motion vector search range [%d]\n"
-             "  -m, --subme <integer>       Subpixel motion estimation quality: 1=fast, 5=best. [%d]\n"
+             "  -m, --subme <integer>       Subpixel motion estimation and partition\n"
+             "                                  decision quality: 1=fast, 6=best. [%d]\n"
              "      --no-chroma-me          Ignore chroma in motion estimation\n"
              "  -8, --8x8dct                Adaptive spatial transform size\n"
              "\n"
diff --git a/x264.h b/x264.h
index 14a11d21..eef21e02 100644
--- a/x264.h
+++ b/x264.h
@@ -26,7 +26,7 @@
 
 #include <stdarg.h>
 
-#define X264_BUILD 29
+#define X264_BUILD 30
 
 /* x264_t:
  *      opaque handler for decoder and encoder */