From 3b66f690bd8a7d1417cedf98aec0df2702338bb2 Mon Sep 17 00:00:00 2001
From: Loren Merritt <pengvado@videolan.org>
Date: Mon, 20 Mar 2006 23:00:52 +0000
Subject: [PATCH] RD subpel motion estimation (--subme 7)

git-svn-id: svn://svn.videolan.org/x264/trunk@476 df754926-b1dd-0310-bc7b-ec298dee348c
---
 common/macroblock.c  | 125 ++++++++++++-------------
 common/macroblock.h  |   1 +
 encoder/analyse.c    | 170 +++++++++++++++++++++++++++++++++-
 encoder/cabac.c      | 162 +++++++++++++++++++++++++--------
 encoder/cavlc.c      | 211 +++++++++++++++++++++++++++++--------------
 encoder/encoder.c    |   3 +-
 encoder/macroblock.c |  85 +++++++++++++++++
 encoder/macroblock.h |   2 +
 encoder/me.c         |  96 ++++++++++++++++++++
 encoder/me.h         |   2 +
 encoder/rdo.c        |  86 ++++++++++++++++++
 x264.c               |   2 +-
 12 files changed, 772 insertions(+), 173 deletions(-)

diff --git a/common/macroblock.c b/common/macroblock.c
index 1f14c6b6..1507cf57 100644
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -687,6 +687,69 @@ static void x264_mb_mc_direct8x8( x264_t *h, int x, int y )
     }
 }
 
+void x264_mb_mc_8x8( x264_t *h, int i8 ) /* runs the x264_mb_mc_* helper(s) selected by h->mb.i_sub_partition[i8] for 8x8 block i8 */
+{
+    const int x = 2*(i8&1); /* 0 for even i8, 2 for odd i8 */
+    const int y = 2*(i8>>1); /* 0 for i8 = 0..1, 2 for i8 = 2..3 */
+    switch( h->mb.i_sub_partition[i8] ) /* no default: any value not listed below does nothing */
+    {
+        case D_L0_8x8: /* one call of size 2,2 -- presumably (w,h) in 4-pixel units, going by the helper name; TODO confirm */
+            x264_mb_mc_0xywh( h, x, y, 2, 2 );
+            break;
+        case D_L0_8x4: /* two 2,1 calls, at y+0 and y+1 */
+            x264_mb_mc_0xywh( h, x, y+0, 2, 1 );
+            x264_mb_mc_0xywh( h, x, y+1, 2, 1 );
+            break;
+        case D_L0_4x8: /* two 1,2 calls, at x+0 and x+1 */
+            x264_mb_mc_0xywh( h, x+0, y, 1, 2 );
+            x264_mb_mc_0xywh( h, x+1, y, 1, 2 );
+            break;
+        case D_L0_4x4: /* four 1,1 calls: both x offsets at y+0, then both at y+1 */
+            x264_mb_mc_0xywh( h, x+0, y+0, 1, 1 );
+            x264_mb_mc_0xywh( h, x+1, y+0, 1, 1 );
+            x264_mb_mc_0xywh( h, x+0, y+1, 1, 1 );
+            x264_mb_mc_0xywh( h, x+1, y+1, 1, 1 );
+            break;
+        case D_L1_8x8: /* the D_L1_* cases repeat the D_L0_* layouts with x264_mb_mc_1xywh */
+            x264_mb_mc_1xywh( h, x, y, 2, 2 );
+            break;
+        case D_L1_8x4:
+            x264_mb_mc_1xywh( h, x, y+0, 2, 1 );
+            x264_mb_mc_1xywh( h, x, y+1, 2, 1 );
+            break;
+        case D_L1_4x8:
+            x264_mb_mc_1xywh( h, x+0, y, 1, 2 );
+            x264_mb_mc_1xywh( h, x+1, y, 1, 2 );
+            break;
+        case D_L1_4x4:
+            x264_mb_mc_1xywh( h, x+0, y+0, 1, 1 );
+            x264_mb_mc_1xywh( h, x+1, y+0, 1, 1 );
+            x264_mb_mc_1xywh( h, x+0, y+1, 1, 1 );
+            x264_mb_mc_1xywh( h, x+1, y+1, 1, 1 );
+            break;
+        case D_BI_8x8: /* the D_BI_* cases repeat the same layouts with x264_mb_mc_01xywh */
+            x264_mb_mc_01xywh( h, x, y, 2, 2 );
+            break;
+        case D_BI_8x4:
+            x264_mb_mc_01xywh( h, x, y+0, 2, 1 );
+            x264_mb_mc_01xywh( h, x, y+1, 2, 1 );
+            break;
+        case D_BI_4x8:
+            x264_mb_mc_01xywh( h, x+0, y, 1, 2 );
+            x264_mb_mc_01xywh( h, x+1, y, 1, 2 );
+            break;
+        case D_BI_4x4:
+            x264_mb_mc_01xywh( h, x+0, y+0, 1, 1 );
+            x264_mb_mc_01xywh( h, x+1, y+0, 1, 1 );
+            x264_mb_mc_01xywh( h, x+0, y+1, 1, 1 );
+            x264_mb_mc_01xywh( h, x+1, y+1, 1, 1 );
+            break;
+        case D_DIRECT_8x8: /* the direct helper takes only the block origin, no size */
+            x264_mb_mc_direct8x8( h, x, y );
+            break;
+    }
+}
+
 void x264_mb_mc( x264_t *h )
 {
     if( h->mb.i_type == P_L0 )
@@ -710,67 +773,7 @@ void x264_mb_mc( x264_t *h )
     {
         int i;
         for( i = 0; i < 4; i++ )
-        {
-            const int x = 2*(i%2);
-            const int y = 2*(i/2);
-            switch( h->mb.i_sub_partition[i] )
-            {
-                case D_L0_8x8:
-                    x264_mb_mc_0xywh( h, x, y, 2, 2 );
-                    break;
-                case D_L0_8x4:
-                    x264_mb_mc_0xywh( h, x, y+0, 2, 1 );
-                    x264_mb_mc_0xywh( h, x, y+1, 2, 1 );
-                    break;
-                case D_L0_4x8:
-                    x264_mb_mc_0xywh( h, x+0, y, 1, 2 );
-                    x264_mb_mc_0xywh( h, x+1, y, 1, 2 );
-                    break;
-                case D_L0_4x4:
-                    x264_mb_mc_0xywh( h, x+0, y+0, 1, 1 );
-                    x264_mb_mc_0xywh( h, x+1, y+0, 1, 1 );
-                    x264_mb_mc_0xywh( h, x+0, y+1, 1, 1 );
-                    x264_mb_mc_0xywh( h, x+1, y+1, 1, 1 );
-                    break;
-                case D_L1_8x8:
-                    x264_mb_mc_1xywh( h, x, y, 2, 2 );
-                    break;
-                case D_L1_8x4:
-                    x264_mb_mc_1xywh( h, x, y+0, 2, 1 );
-                    x264_mb_mc_1xywh( h, x, y+1, 2, 1 );
-                    break;
-                case D_L1_4x8:
-                    x264_mb_mc_1xywh( h, x+0, y, 1, 2 );
-                    x264_mb_mc_1xywh( h, x+1, y, 1, 2 );
-                    break;
-                case D_L1_4x4:
-                    x264_mb_mc_1xywh( h, x+0, y+0, 1, 1 );
-                    x264_mb_mc_1xywh( h, x+1, y+0, 1, 1 );
-                    x264_mb_mc_1xywh( h, x+0, y+1, 1, 1 );
-                    x264_mb_mc_1xywh( h, x+1, y+1, 1, 1 );
-                    break;
-                case D_BI_8x8:
-                    x264_mb_mc_01xywh( h, x, y, 2, 2 );
-                    break;
-                case D_BI_8x4:
-                    x264_mb_mc_01xywh( h, x, y+0, 2, 1 );
-                    x264_mb_mc_01xywh( h, x, y+1, 2, 1 );
-                    break;
-                case D_BI_4x8:
-                    x264_mb_mc_01xywh( h, x+0, y, 1, 2 );
-                    x264_mb_mc_01xywh( h, x+1, y, 1, 2 );
-                    break;
-                case D_BI_4x4:
-                    x264_mb_mc_01xywh( h, x+0, y+0, 1, 1 );
-                    x264_mb_mc_01xywh( h, x+1, y+0, 1, 1 );
-                    x264_mb_mc_01xywh( h, x+0, y+1, 1, 1 );
-                    x264_mb_mc_01xywh( h, x+1, y+1, 1, 1 );
-                    break;
-                case D_DIRECT_8x8:
-                    x264_mb_mc_direct8x8( h, x, y );
-                    break;
-            }
-        }
+            x264_mb_mc_8x8( h, i );
     }
     else if( h->mb.i_type == B_SKIP || h->mb.i_type == B_DIRECT )
     {
diff --git a/common/macroblock.h b/common/macroblock.h
index 6ca6492b..6a54dc7d 100644
--- a/common/macroblock.h
+++ b/common/macroblock.h
@@ -263,6 +263,7 @@ void x264_mb_encode_i4x4( x264_t *h, int idx, int i_qscale );
 void x264_mb_encode_i8x8( x264_t *h, int idx, int i_qscale );
 
 void x264_mb_mc( x264_t *h );
+void x264_mb_mc_8x8( x264_t *h, int i8 );
 
 
 static inline void x264_macroblock_cache_ref( x264_t *h, int x, int y, int width, int height, int i_list, int ref )
diff --git a/encoder/analyse.c b/encoder/analyse.c
index e1a00d3b..af35e26a 100644
--- a/encoder/analyse.c
+++ b/encoder/analyse.c
@@ -660,6 +660,136 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_cost_
     }
 }
 
+static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a ) /* re-picks the intra prediction mode(s) of the current h->mb.i_type by x264_rd_cost_*; winners are stored in a */
+{
+    uint8_t  *p_src = h->mb.pic.p_fenc[0]; /* only used to derive p_src_by, which is never read below */
+    uint8_t  *p_dst = h->mb.pic.p_fdec[0];
+
+    int i, idx, x, y;
+    int i_max, i_sad, i_best, i_mode; /* i_sad holds the x264_rd_cost_* result, despite the name */
+    int i_pred_mode; /* NOTE(review): assigned in the 4x4 and 8x8 branches but never read */
+    int predict_mode[9];
+
+    if( h->mb.i_type == I_16x16 )
+    {
+        int old_pred_mode = a->i_predict16x16;
+        i_best = a->i_sad_i16x16; /* cost to beat; NOTE(review): presumably already comparable to x264_rd_cost_mb output -- confirm */
+        predict_16x16_mode_available( h->mb.i_neighbour, predict_mode, &i_max );
+        for( i = 0; i < i_max; i++ )
+        {
+            if( predict_mode[i] == old_pred_mode )
+                continue; /* the current choice is not re-evaluated */
+            h->mb.i_intra16x16_pred_mode = predict_mode[i];
+            i_sad = x264_rd_cost_mb( h, a->i_lambda2 );
+            if( i_best > i_sad )
+            {
+                a->i_predict16x16 = predict_mode[i];
+                i_best = i_sad;
+            }
+        }
+    } /* h->mb.i_intra16x16_pred_mode is left at the last mode tried; it is not reset to the winner here */
+    else if( h->mb.i_type == I_4x4 )
+    {
+        for( idx = 0; idx < 16; idx++ )
+        {
+            uint32_t pels[4]; /* the 4 rows of the best candidate; written only inside the mode loop, so relies on i_max >= 1 */
+            int i_nnz = 0;
+            uint8_t *p_src_by;
+            uint8_t *p_dst_by;
+            i_best = COST_MAX; /* unlike the 16x16 branch, every available mode is evaluated from scratch */
+
+            i_pred_mode= x264_mb_predict_intra4x4_mode( h, idx );
+            x = block_idx_x[idx];
+            y = block_idx_y[idx];
+
+            p_src_by = p_src + 4*x + 4*y*FENC_STRIDE;
+            p_dst_by = p_dst + 4*x + 4*y*FDEC_STRIDE;
+            predict_4x4_mode_available( h->mb.i_neighbour4[idx], predict_mode, &i_max );
+
+            if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP ) /* MB_TOP set, MB_TOPRIGHT clear */
+                /* emulate missing topright samples */
+                *(uint32_t*) &p_dst_by[4 - FDEC_STRIDE] = p_dst_by[3 - FDEC_STRIDE] * 0x01010101U; /* copies the byte above-right of the block's last column into the next 4 bytes */
+
+            for( i = 0; i < i_max; i++ )
+            {
+                i_mode = predict_mode[i];
+                h->predict_4x4[i_mode]( p_dst_by );
+
+                i_sad = x264_rd_cost_i4x4( h, a->i_lambda2, idx, i_mode );
+
+                if( i_best > i_sad )
+                {
+                    a->i_predict4x4[x][y] = i_mode;
+                    i_best = i_sad;
+                    pels[0] = *(uint32_t*)(p_dst_by+0*FDEC_STRIDE); /* snapshot of the block at p_dst_by as left by x264_rd_cost_i4x4 */
+                    pels[1] = *(uint32_t*)(p_dst_by+1*FDEC_STRIDE);
+                    pels[2] = *(uint32_t*)(p_dst_by+2*FDEC_STRIDE);
+                    pels[3] = *(uint32_t*)(p_dst_by+3*FDEC_STRIDE);
+                    i_nnz = h->mb.cache.non_zero_count[x264_scan8[idx]];
+                }
+            }
+
+            *(uint32_t*)(p_dst_by+0*FDEC_STRIDE) = pels[0]; /* put back the winner's pixels and nnz, overwriting the last candidate's */
+            *(uint32_t*)(p_dst_by+1*FDEC_STRIDE) = pels[1];
+            *(uint32_t*)(p_dst_by+2*FDEC_STRIDE) = pels[2];
+            *(uint32_t*)(p_dst_by+3*FDEC_STRIDE) = pels[3];
+            h->mb.cache.non_zero_count[x264_scan8[idx]] = i_nnz;
+
+            h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[x][y]; /* publish the winner before the next block is processed */
+        }
+    }
+    else if( h->mb.i_type == I_8x8 )
+    {
+        for( idx = 0; idx < 4; idx++ )
+        {
+            uint64_t pels_h = 0; /* bottom row (row 7) of the best candidate */
+            uint8_t pels_v[7]; /* column 7, rows 0..6, of the best candidate; only used when idx is even */
+            int i_nnz[3]; /* nnz of 4x4 blocks 4*idx+1..4*idx+3; written only inside the mode loop, so relies on i_max >= 1 */
+            uint8_t *p_src_by;
+            uint8_t *p_dst_by;
+            int j;
+            i_best = COST_MAX;
+
+            i_pred_mode= x264_mb_predict_intra4x4_mode( h, 4*idx );
+            x = idx&1;
+            y = idx>>1;
+
+            p_src_by = p_src + 8*x + 8*y*FENC_STRIDE;
+            p_dst_by = p_dst + 8*x + 8*y*FDEC_STRIDE;
+            predict_4x4_mode_available( h->mb.i_neighbour8[idx], predict_mode, &i_max ); /* the 4x4 availability helper is reused with the 8x8 neighbour flags */
+            for( i = 0; i < i_max; i++ )
+            {
+                i_mode = predict_mode[i];
+                h->predict_8x8[i_mode]( p_dst_by, h->mb.i_neighbour8[idx] );
+
+                i_sad = x264_rd_cost_i8x8( h, a->i_lambda2, idx, i_mode );
+
+                if( i_best > i_sad )
+                {
+                    a->i_predict8x8[x][y] = i_mode;
+                    i_best = i_sad;
+
+                    pels_h = *(uint64_t*)(p_dst_by+7*FDEC_STRIDE); /* only the last row and (for even idx) the last column are saved -- presumably all that later blocks predict from; TODO confirm */
+                    if( !(idx&1) )
+                        for( j=0; j<7; j++ )
+                            pels_v[j] = p_dst_by[7+j*FDEC_STRIDE];
+                    for( j=0; j<3; j++ )
+                        i_nnz[j] = h->mb.cache.non_zero_count[x264_scan8[4*idx+j+1]];
+                }
+            }
+
+            *(uint64_t*)(p_dst_by+7*FDEC_STRIDE) = pels_h; /* restore the winner's saved edge pixels and nnz */
+            if( !(idx&1) )
+                for( j=0; j<7; j++ )
+                    p_dst_by[7+j*FDEC_STRIDE] = pels_v[j];
+            for( j=0; j<3; j++ )
+                h->mb.cache.non_zero_count[x264_scan8[4*idx+j+1]] = i_nnz[j];
+
+            x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[x][y] );
+        }
+    }
+}
+
 #define LOAD_FENC( m, src, xoff, yoff) \
     (m)->i_stride[0] = h->mb.pic.i_stride[0]; \
     (m)->i_stride[1] = h->mb.pic.i_stride[1]; \
@@ -1805,6 +1935,9 @@ void x264_macroblock_analyse( x264_t *h )
         }
         if( analysis.i_sad_i8x8 < i_cost )
             h->mb.i_type = I_8x8;
+
+        if( h->mb.i_subpel_refine >= 7 )
+            x264_intra_rd_refine( h, &analysis );
     }
     else if( h->sh.i_type == SLICE_TYPE_P )
     {
@@ -1859,8 +1992,6 @@ void x264_macroblock_analyse( x264_t *h )
             if( ( flags & X264_ANALYSE_PSUB16x16 ) &&
                 analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost )
             {
-                int i;
-
                 i_type = P_8x8;
                 i_partition = D_8x8;
                 h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
@@ -2041,6 +2172,41 @@ void x264_macroblock_analyse( x264_t *h )
             h->mb.i_type = i_type;
             h->stat.frame.i_intra_cost += i_intra_cost;
             h->stat.frame.i_inter_cost += i_cost;
+
+            if( h->mb.i_subpel_refine >= 7 )
+            {
+                if( IS_INTRA( h->mb.i_type ) )
+                {
+                    x264_intra_rd_refine( h, &analysis );
+                }
+                else if( i_partition == D_16x16 )
+                {
+                    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, analysis.l0.me16x16.i_ref );
+                    x264_me_refine_qpel_rd( h, &analysis.l0.me16x16, analysis.i_lambda2, 0 );
+                }
+                else if( i_partition == D_16x8 )
+                {
+                    x264_macroblock_cache_ref( h, 0, 0, 4, 2, 0, analysis.l0.me16x8[0].i_ref );
+                    x264_macroblock_cache_ref( h, 0, 2, 4, 2, 0, analysis.l0.me16x8[1].i_ref );
+                    x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[0], analysis.i_lambda2, 0 );
+                    x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[1], analysis.i_lambda2, 2 );
+                }
+                else if( i_partition == D_8x16 )
+                {
+                    x264_macroblock_cache_ref( h, 0, 0, 2, 4, 0, analysis.l0.me8x16[0].i_ref );
+                    x264_macroblock_cache_ref( h, 2, 0, 2, 4, 0, analysis.l0.me8x16[1].i_ref );
+                    x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[0], analysis.i_lambda2, 0 );
+                    x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[1], analysis.i_lambda2, 1 );
+                }
+                else if( i_partition == D_8x8 )
+                {
+                    int i8x8;
+                    x264_analyse_update_cache( h, &analysis );
+                    for( i8x8 = 0; i8x8 < 4; i8x8++ )
+                         if( h->mb.i_sub_partition[i8x8] == D_L0_8x8 )
+                             x264_me_refine_qpel_rd( h, &analysis.l0.me8x8[i8x8], analysis.i_lambda2, i8x8 );
+                }
+            }
         }
     }
     else if( h->sh.i_type == SLICE_TYPE_B )
diff --git a/encoder/cabac.c b/encoder/cabac.c
index a9caa73b..0f19548f 100644
--- a/encoder/cabac.c
+++ b/encoder/cabac.c
@@ -500,7 +500,7 @@ static inline void x264_cabac_mb_ref( x264_t *h, x264_cabac_t *cb, int i_list, i
 
 
 
-static inline void  x264_cabac_mb_mvd_cpn( x264_t *h, x264_cabac_t *cb, int i_list, int idx, int l, int mvd )
+static inline void x264_cabac_mb_mvd_cpn( x264_t *h, x264_cabac_t *cb, int i_list, int idx, int l, int mvd )
 {
     const int amvd = abs( h->mb.cache.mvd[i_list][x264_scan8[idx] - 1][l] ) +
                      abs( h->mb.cache.mvd[i_list][x264_scan8[idx] - 8][l] );
@@ -556,44 +556,38 @@ static inline void x264_cabac_mb_mvd( x264_t *h, x264_cabac_t *cb, int i_list, i
     x264_macroblock_cache_mvd( h, block_idx_x[idx], block_idx_y[idx], width, height, i_list, mdx, mdy );
 }
 
-static inline void x264_cabac_mb8x8_mvd( x264_t *h, x264_cabac_t *cb, int i_list )
+static inline void x264_cabac_mb8x8_mvd( x264_t *h, x264_cabac_t *cb, int i_list, int i ) /* handles the single 8x8 block i; the loop over all four blocks now lives in the callers */
 {
-    int i;
-    for( i = 0; i < 4; i++ )
-    {
-        if( !x264_mb_partition_listX_table[i_list][ h->mb.i_sub_partition[i] ] )
-        {
-            continue;
-        }
+    if( !x264_mb_partition_listX_table[i_list][ h->mb.i_sub_partition[i] ] )
+        return; /* the table has no entry for this sub-partition type in i_list: nothing to code */
 
-        switch( h->mb.i_sub_partition[i] )
-        {
-            case D_L0_8x8:
-            case D_L1_8x8:
-            case D_BI_8x8:
-                x264_cabac_mb_mvd( h, cb, i_list, 4*i, 2, 2 );
-                break;
-            case D_L0_8x4:
-            case D_L1_8x4:
-            case D_BI_8x4:
-                x264_cabac_mb_mvd( h, cb, i_list, 4*i+0, 2, 1 );
-                x264_cabac_mb_mvd( h, cb, i_list, 4*i+2, 2, 1 );
-                break;
-            case D_L0_4x8:
-            case D_L1_4x8:
-            case D_BI_4x8:
-                x264_cabac_mb_mvd( h, cb, i_list, 4*i+0, 1, 2 );
-                x264_cabac_mb_mvd( h, cb, i_list, 4*i+1, 1, 2 );
-                break;
-            case D_L0_4x4:
-            case D_L1_4x4:
-            case D_BI_4x4:
-                x264_cabac_mb_mvd( h, cb, i_list, 4*i+0, 1, 1 );
-                x264_cabac_mb_mvd( h, cb, i_list, 4*i+1, 1, 1 );
-                x264_cabac_mb_mvd( h, cb, i_list, 4*i+2, 1, 1 );
-                x264_cabac_mb_mvd( h, cb, i_list, 4*i+3, 1, 1 );
-                break;
-        }
+    switch( h->mb.i_sub_partition[i] ) /* the sub-partition shape decides how many mvds are coded and at which 4x4 indices */
+    {
+        case D_L0_8x8:
+        case D_L1_8x8:
+        case D_BI_8x8:
+            x264_cabac_mb_mvd( h, cb, i_list, 4*i, 2, 2 ); /* one mvd; last two arguments are width, height */
+            break;
+        case D_L0_8x4:
+        case D_L1_8x4:
+        case D_BI_8x4:
+            x264_cabac_mb_mvd( h, cb, i_list, 4*i+0, 2, 1 ); /* two mvds, at indices 4*i+0 and 4*i+2 */
+            x264_cabac_mb_mvd( h, cb, i_list, 4*i+2, 2, 1 );
+            break;
+        case D_L0_4x8:
+        case D_L1_4x8:
+        case D_BI_4x8:
+            x264_cabac_mb_mvd( h, cb, i_list, 4*i+0, 1, 2 ); /* two mvds, at indices 4*i+0 and 4*i+1 */
+            x264_cabac_mb_mvd( h, cb, i_list, 4*i+1, 1, 2 );
+            break;
+        case D_L0_4x4:
+        case D_L1_4x4:
+        case D_BI_4x4:
+            x264_cabac_mb_mvd( h, cb, i_list, 4*i+0, 1, 1 ); /* four mvds, one per 4x4 index 4*i+0..4*i+3 */
+            x264_cabac_mb_mvd( h, cb, i_list, 4*i+1, 1, 1 );
+            x264_cabac_mb_mvd( h, cb, i_list, 4*i+2, 1, 1 );
+            x264_cabac_mb_mvd( h, cb, i_list, 4*i+3, 1, 1 );
+            break;
     }
 }
 
@@ -912,7 +906,8 @@ void x264_macroblock_write_cabac( x264_t *h, x264_cabac_t *cb )
             x264_cabac_mb_ref( h, cb, 0, 12 );
         }
 
-        x264_cabac_mb8x8_mvd( h, cb, 0 );
+        for( i = 0; i < 4; i++ )
+            x264_cabac_mb8x8_mvd( h, cb, 0, i );
     }
     else if( i_mb_type == B_8x8 )
     {
@@ -932,8 +927,10 @@ void x264_macroblock_write_cabac( x264_t *h, x264_cabac_t *cb )
                     x264_cabac_mb_ref( h, cb, i_list, 4*i );
         }
 
-        x264_cabac_mb8x8_mvd( h, cb, 0 );
-        x264_cabac_mb8x8_mvd( h, cb, 1 );
+        for( i = 0; i < 4; i++ )
+            x264_cabac_mb8x8_mvd( h, cb, 0, i );
+        for( i = 0; i < 4; i++ )
+            x264_cabac_mb8x8_mvd( h, cb, 1, i );
     }
     else if( i_mb_type != B_DIRECT )
     {
@@ -1052,3 +1049,88 @@ void x264_macroblock_write_cabac( x264_t *h, x264_cabac_t *cb )
 #endif
 }
 
+#ifdef RDO_SKIP_BS
+/*****************************************************************************
+ * RD only; doesn't generate a valid bitstream: codes one partition's
+ * sub-type/ref/mvd, its luma residual and its chroma AC into cb, but doesn't
+ * write cbp or chroma dc (I don't know how much this matters).
+ * Works on all partition sizes except 16x16; for sub8x8, call once per 8x8 block.
+ *****************************************************************************/
+void x264_partition_size_cabac( x264_t *h, x264_cabac_t *cb, int i8, int i_pixel )
+{
+    const int i_mb_type = h->mb.i_type;
+    int j;
+
+    if( i_mb_type == P_8x8 )
+    {
+        x264_cabac_mb_sub_p_partition( cb, h->mb.i_sub_partition[i8] );
+        if( h->sh.i_num_ref_idx_l0_active > 1 ) /* ref is only coded when more than one l0 reference is active */
+            x264_cabac_mb_ref( h, cb, 0, 4*i8 );
+        x264_cabac_mb8x8_mvd( h, cb, 0, i8 );
+    }
+    else if( i_mb_type == P_L0 )
+    {
+        if( h->sh.i_num_ref_idx_l0_active > 1 )
+            x264_cabac_mb_ref( h, cb, 0, 4*i8 );
+        if( h->mb.i_partition == D_16x8 )
+            x264_cabac_mb_mvd( h, cb, 0, 4*i8, 4, 2 );
+        else //8x16 (every other i_partition value, D_16x16 included, also lands here)
+            x264_cabac_mb_mvd( h, cb, 0, 4*i8, 2, 4 );
+    }
+    else if( i_mb_type == B_8x8 )
+    {
+        x264_cabac_mb_sub_b_partition( cb, h->mb.i_sub_partition[i8] );
+
+        if( h->sh.i_num_ref_idx_l0_active > 1
+            && x264_mb_partition_listX_table[0][ h->mb.i_sub_partition[i8] ] ) /* per list: ref only if the sub-partition has a table entry for that list */
+            x264_cabac_mb_ref( h, cb, 0, 4*i8 );
+        if( h->sh.i_num_ref_idx_l1_active > 1
+            && x264_mb_partition_listX_table[1][ h->mb.i_sub_partition[i8] ] )
+            x264_cabac_mb_ref( h, cb, 1, 4*i8 );
+
+        x264_cabac_mb8x8_mvd( h, cb, 0, i8 );
+        x264_cabac_mb8x8_mvd( h, cb, 1, i8 );
+    }
+    else
+    {
+        x264_log(h, X264_LOG_ERROR, "invalid/unhandled mb_type\n" );
+        return; /* nothing further is coded for other mb types */
+    }
+
+    for( j = (i_pixel < PIXEL_8x8); j >= 0; j-- ) /* two passes when i_pixel < PIXEL_8x8 (presumably 16x8 / 8x16 -- TODO confirm enum order), otherwise one */
+    {
+        if( h->mb.i_cbp_luma & (1 << i8) ) /* luma is coded only when this 8x8 block's cbp bit is set */
+        {
+            if( h->mb.b_transform_8x8 )
+                block_residual_write_cabac( h, cb, DCT_LUMA_8x8, i8, h->dct.luma8x8[i8], 64 ); /* NOTE(review): index is i8 here but 4*i8 in x264_partition_i8x8_size_cabac -- confirm which one is intended */
+            else
+            {
+                int i4;
+                for( i4 = 0; i4 < 4; i4++ )
+                    block_residual_write_cabac( h, cb, DCT_LUMA_4x4, i4+i8*4, h->dct.block[i4+i8*4].luma4x4, 16 );
+            }
+        }
+
+        block_residual_write_cabac( h, cb, DCT_CHROMA_AC, i8,   h->dct.block[16+i8  ].residual_ac, 15 ); /* chroma AC is coded without checking h->mb.i_cbp_chroma */
+        block_residual_write_cabac( h, cb, DCT_CHROMA_AC, i8+4, h->dct.block[16+i8+4].residual_ac, 15 );
+
+        i8 += x264_pixel_size[i_pixel].h >> 3; /* step to the partition's second 8x8 block: +1 when .h is 8, +2 when .h is 16 */
+    }
+}
+
+/* Built only under RDO_SKIP_BS: feeds cb the prediction mode of intra 8x8 block i8 and its 64-entry luma list. */
+static void x264_partition_i8x8_size_cabac( x264_t *h, x264_cabac_t *cb, int i8, int i_mode )
+{
+    const int predicted = x264_mb_predict_intra4x4_mode( h, 4*i8 );
+    x264_cabac_mb_intra4x4_pred_mode( cb, predicted, x264_mb_pred_mode4x4_fix( i_mode ) );
+    block_residual_write_cabac( h, cb, DCT_LUMA_8x8, 4*i8, h->dct.luma8x8[i8], 64 );
+}
+
+/* Built only under RDO_SKIP_BS: feeds cb the prediction mode of intra 4x4 block i4 and its 16-entry luma list. */
+static void x264_partition_i4x4_size_cabac( x264_t *h, x264_cabac_t *cb, int i4, int i_mode )
+{
+    const int predicted = x264_mb_predict_intra4x4_mode( h, i4 );
+    x264_cabac_mb_intra4x4_pred_mode( cb, predicted, x264_mb_pred_mode4x4_fix( i_mode ) );
+    block_residual_write_cabac( h, cb, DCT_LUMA_4x4, i4, h->dct.block[i4].luma4x4, 16 );
+}
+#endif
diff --git a/encoder/cavlc.c b/encoder/cavlc.c
index fc662907..27a98e92 100644
--- a/encoder/cavlc.c
+++ b/encoder/cavlc.c
@@ -267,79 +267,56 @@ static void cavlc_qp_delta( x264_t *h, bs_t *s )
     bs_write_se( s, i_dqp );
 }
 
-static void x264_sub_mb_mv_write_cavlc( x264_t *h, bs_t *s, int i_list )
+static void cavlc_mb_mvd( x264_t *h, bs_t *s, int i_list, int idx, int width ) /* writes one mv difference (x then y) for block idx of list i_list */
 {
-    int i;
-    for( i = 0; i < 4; i++ )
-    {
-        int mvp[2];
+    int mvp[2]; /* filled by x264_mb_predict_mv: [0] is subtracted from mv component 0, [1] from component 1 */
+    x264_mb_predict_mv( h, i_list, idx, width, mvp );
+    bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[idx]][0] - mvp[0] ); /* cached mv minus its prediction */
+    bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[idx]][1] - mvp[1] );
+}
 
-        if( !x264_mb_partition_listX_table[i_list][ h->mb.i_sub_partition[i] ] )
-        {
-            continue;
-        }
-
-        switch( h->mb.i_sub_partition[i] )
-        {
-            case D_L0_8x8:
-            case D_L1_8x8:
-            case D_BI_8x8:
-                x264_mb_predict_mv( h, i_list, 4*i, 2, mvp );
-                bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[4*i]][0] - mvp[0] );
-                bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[4*i]][1] - mvp[1] );
-                break;
-            case D_L0_8x4:
-            case D_L1_8x4:
-            case D_BI_8x4:
-                x264_mb_predict_mv( h, i_list, 4*i+0, 2, mvp );
-                bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[4*i]][0] - mvp[0] );
-                bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[4*i]][1] - mvp[1] );
-
-                x264_mb_predict_mv( h, i_list, 4*i+2, 2, mvp );
-                bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[4*i+2]][0] - mvp[0] );
-                bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[4*i+2]][1] - mvp[1] );
-                break;
-            case D_L0_4x8:
-            case D_L1_4x8:
-            case D_BI_4x8:
-                x264_mb_predict_mv( h, i_list, 4*i+0, 1, mvp );
-                bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[4*i]][0] - mvp[0] );
-                bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[4*i]][1] - mvp[1] );
-
-                x264_mb_predict_mv( h, i_list, 4*i+1, 1, mvp );
-                bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[4*i+1]][0] - mvp[0] );
-                bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[4*i+1]][1] - mvp[1] );
-                break;
-            case D_L0_4x4:
-            case D_L1_4x4:
-            case D_BI_4x4:
-                x264_mb_predict_mv( h, i_list, 4*i+0, 1, mvp );
-                bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[4*i]][0] - mvp[0] );
-                bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[4*i]][1] - mvp[1] );
-
-                x264_mb_predict_mv( h, i_list, 4*i+1, 1, mvp );
-                bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[4*i+1]][0] - mvp[0] );
-                bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[4*i+1]][1] - mvp[1] );
-
-                x264_mb_predict_mv( h, i_list, 4*i+2, 1, mvp );
-                bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[4*i+2]][0] - mvp[0] );
-                bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[4*i+2]][1] - mvp[1] );
-
-                x264_mb_predict_mv( h, i_list, 4*i+3, 1, mvp );
-                bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[4*i+3]][0] - mvp[0] );
-                bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[4*i+3]][1] - mvp[1] );
-                break;
-        }
+static void cavlc_mb8x8_mvd( x264_t *h, bs_t *s, int i_list, int i ) /* writes the mvds of 8x8 block i for list i_list via cavlc_mb_mvd */
+{
+    if( !x264_mb_partition_listX_table[i_list][ h->mb.i_sub_partition[i] ] )
+        return; /* the table has no entry for this sub-partition type in i_list: nothing to write */
+
+    switch( h->mb.i_sub_partition[i] ) /* same index layout as x264_cabac_mb8x8_mvd; only a width is passed here */
+    {
+        case D_L0_8x8:
+        case D_L1_8x8:
+        case D_BI_8x8:
+            cavlc_mb_mvd( h, s, i_list, 4*i, 2 ); /* one mvd, width 2 */
+            break;
+        case D_L0_8x4:
+        case D_L1_8x4:
+        case D_BI_8x4:
+            cavlc_mb_mvd( h, s, i_list, 4*i+0, 2 ); /* two mvds, at indices 4*i+0 and 4*i+2 */
+            cavlc_mb_mvd( h, s, i_list, 4*i+2, 2 );
+            break;
+        case D_L0_4x8:
+        case D_L1_4x8:
+        case D_BI_4x8:
+            cavlc_mb_mvd( h, s, i_list, 4*i+0, 1 ); /* two mvds, at indices 4*i+0 and 4*i+1 */
+            cavlc_mb_mvd( h, s, i_list, 4*i+1, 1 );
+            break;
+        case D_L0_4x4:
+        case D_L1_4x4:
+        case D_BI_4x4:
+            cavlc_mb_mvd( h, s, i_list, 4*i+0, 1 ); /* four mvds, one per 4x4 index 4*i+0..4*i+3 */
+            cavlc_mb_mvd( h, s, i_list, 4*i+1, 1 );
+            cavlc_mb_mvd( h, s, i_list, 4*i+2, 1 );
+            cavlc_mb_mvd( h, s, i_list, 4*i+3, 1 );
+            break;
     }
 }
 
-static void x264_macroblock_luma_write_cavlc( x264_t *h, bs_t *s )
+static inline void x264_macroblock_luma_write_cavlc( x264_t *h, bs_t *s, int i8start, int i8end )
 {
     int i8, i4, i;
     if( h->mb.b_transform_8x8 )
     {
         /* shuffle 8x8 dct coeffs into 4x4 lists */
-        for( i8 = 0; i8 < 4; i8++ )
+        for( i8 = i8start; i8 <= i8end; i8++ )
             if( h->mb.i_cbp_luma & (1 << i8) )
                 for( i4 = 0; i4 < 4; i4++ )
                 {
@@ -350,7 +327,7 @@ static void x264_macroblock_luma_write_cavlc( x264_t *h, bs_t *s )
                 }
     }
 
-    for( i8 = 0; i8 < 4; i8++ )
+    for( i8 = i8start; i8 <= i8end; i8++ )
         if( h->mb.i_cbp_luma & (1 << i8) )
             for( i4 = 0; i4 < 4; i4++ )
                 block_residual_write_cavlc( h, s, i4+i8*4, h->dct.block[i4+i8*4].luma4x4, 16 );
@@ -541,7 +518,8 @@ void x264_macroblock_write_cavlc( x264_t *h, bs_t *s )
             bs_write_te( s, h->sh.i_num_ref_idx_l0_active - 1, h->mb.cache.ref[0][x264_scan8[12]] );
         }
 
-        x264_sub_mb_mv_write_cavlc( h, s, 0 );
+        for( i = 0; i < 4; i++ )
+            cavlc_mb8x8_mvd( h, s, 0, i );
     }
     else if( i_mb_type == B_8x8 )
     {
@@ -568,8 +546,10 @@ void x264_macroblock_write_cavlc( x264_t *h, bs_t *s )
             }
         }
         /* mvd */
-        x264_sub_mb_mv_write_cavlc( h, s, 0 );
-        x264_sub_mb_mv_write_cavlc( h, s, 1 );
+        for( i = 0; i < 4; i++ )
+            cavlc_mb8x8_mvd( h, s, 0, i );
+        for( i = 0; i < 4; i++ )
+            cavlc_mb8x8_mvd( h, s, 1, i );
     }
     else if( i_mb_type != B_DIRECT )
     {
@@ -702,7 +682,7 @@ void x264_macroblock_write_cavlc( x264_t *h, bs_t *s )
     else if( h->mb.i_cbp_luma != 0 || h->mb.i_cbp_chroma != 0 )
     {
         cavlc_qp_delta( h, s );
-        x264_macroblock_luma_write_cavlc( h, s );
+        x264_macroblock_luma_write_cavlc( h, s, 0, 3 );
     }
     if( h->mb.i_cbp_chroma != 0 )
     {
@@ -721,3 +701,98 @@ void x264_macroblock_write_cavlc( x264_t *h, bs_t *s )
         h->stat.frame.i_ptex_bits += bs_pos(s) - i_mb_pos_tex;
 #endif
 }
+
+#ifdef RDO_SKIP_BS
+/*****************************************************************************
+ * RD only; doesn't generate a valid bitstream: writes one partition's
+ * sub-type/ref/mvd, luma residual and chroma AC to a local bs_t and returns
+ * its i_bits_encoded. Doesn't write cbp or chroma dc (I don't know how much
+ * this matters). All partition sizes except 16x16; for sub8x8, call once per 8x8.
+ *****************************************************************************/
+int x264_partition_size_cavlc( x264_t *h, int i8, int i_pixel )
+{
+    bs_t s; /* only i_bits_encoded is initialised; presumably the RDO_SKIP_BS bs_write_* variants touch nothing else -- TODO confirm */
+    const int i_mb_type = h->mb.i_type;
+    int j;
+
+    s.i_bits_encoded = 0;
+
+    if( i_mb_type == P_8x8 )
+    {
+        bs_write_ue( &s, sub_mb_type_p_to_golomb[ h->mb.i_sub_partition[i8] ] );
+        if( h->sh.i_num_ref_idx_l0_active > 1 ) /* ref is only written when more than one l0 reference is active */
+            bs_write_te( &s, h->sh.i_num_ref_idx_l0_active - 1, h->mb.cache.ref[0][x264_scan8[4*i8]] );
+        cavlc_mb8x8_mvd( h, &s, 0, i8 );
+    }
+    else if( i_mb_type == P_L0 )
+    {
+        if( h->sh.i_num_ref_idx_l0_active > 1 )
+            bs_write_te( &s, h->sh.i_num_ref_idx_l0_active - 1, h->mb.cache.ref[0][x264_scan8[4*i8]] );
+        if( h->mb.i_partition == D_16x8 )
+            cavlc_mb_mvd( h, &s, 0, 4*i8, 4 );
+        else //8x16 (every other i_partition value, D_16x16 included, also lands here)
+            cavlc_mb_mvd( h, &s, 0, 4*i8, 2 );
+    }
+    else if( i_mb_type == B_8x8 )
+    {
+        bs_write_ue( &s, sub_mb_type_b_to_golomb[ h->mb.i_sub_partition[i8] ] );
+
+        if( h->sh.i_num_ref_idx_l0_active > 1
+            && x264_mb_partition_listX_table[0][ h->mb.i_sub_partition[i8] ] ) /* per list: ref only if the sub-partition has a table entry for that list */
+            bs_write_te( &s, h->sh.i_num_ref_idx_l0_active - 1, h->mb.cache.ref[0][x264_scan8[4*i8]] );
+        if( h->sh.i_num_ref_idx_l1_active > 1
+            && x264_mb_partition_listX_table[1][ h->mb.i_sub_partition[i8] ] )
+            bs_write_te( &s, h->sh.i_num_ref_idx_l1_active - 1, h->mb.cache.ref[1][x264_scan8[4*i8]] );
+
+        cavlc_mb8x8_mvd( h, &s, 0, i8 );
+        cavlc_mb8x8_mvd( h, &s, 1, i8 );
+    }
+    else
+    {
+        x264_log(h, X264_LOG_ERROR, "invalid/unhandled mb_type\n" );
+        return 0; /* other mb types are reported as costing 0 bits */
+    }
+
+    for( j = (i_pixel < PIXEL_8x8); j >= 0; j-- ) /* two passes when i_pixel < PIXEL_8x8 (presumably 16x8 / 8x16 -- TODO confirm enum order), otherwise one */
+    {
+        x264_macroblock_luma_write_cavlc( h, &s, i8, i8 ); /* start == end: luma of this one 8x8 block only */
+
+        block_residual_write_cavlc( h, &s, i8,   h->dct.block[16+i8  ].residual_ac, 15 ); /* NOTE(review): luma calls pass the h->dct.block[] index; here i8 / i8+4 is passed for block[16+i8] / block[16+i8+4] -- confirm the expected chroma index */
+        block_residual_write_cavlc( h, &s, i8+4, h->dct.block[16+i8+4].residual_ac, 15 );
+
+        i8 += x264_pixel_size[i_pixel].h >> 3; /* step to the partition's second 8x8 block: +1 when .h is 8, +2 when .h is 16 */
+    }
+
+    return s.i_bits_encoded;
+}
+
+/* Returns 1 when i_mode, after x264_mb_pred_mode4x4_fix, equals the mode predicted for block i4; otherwise 4. */
+static int cavlc_intra4x4_pred_size( x264_t *h, int i4, int i_mode )
+{
+    const int predicted = x264_mb_predict_intra4x4_mode( h, i4 );
+    const int coded = x264_mb_pred_mode4x4_fix( i_mode );
+    return predicted == coded ? 1 : 4;
+}
+
+static int x264_partition_i8x8_size_cavlc( x264_t *h, int i8, int i_mode ) /* returns h->out.bs.i_bits_encoded after the mode and residual of 8x8 block i8 */
+{
+    int sub, k;
+    h->out.bs.i_bits_encoded = cavlc_intra4x4_pred_size( h, 4*i8, i_mode ); /* the count restarts from the mode cost */
+    for( sub = 0; sub < 4; sub++ )
+    {
+        const int idx = 4*i8 + sub; /* 4x4 list that receives every 4th 8x8 coefficient, starting at offset sub */
+        for( k = 0; k < 16; k++ )
+            h->dct.block[idx].luma4x4[k] = h->dct.luma8x8[i8][sub + 4*k];
+        h->mb.cache.non_zero_count[x264_scan8[idx]] = array_non_zero_count( h->dct.block[idx].luma4x4, 16 );
+        block_residual_write_cavlc( h, &h->out.bs, idx, h->dct.block[idx].luma4x4, 16 );
+    }
+    return h->out.bs.i_bits_encoded;
+}
+
+static int x264_partition_i4x4_size_cavlc( x264_t *h, int i4, int i_mode ) /* returns h->out.bs.i_bits_encoded after the mode and residual of 4x4 block i4 */
+{
+    h->out.bs.i_bits_encoded = cavlc_intra4x4_pred_size( h, i4, i_mode ); /* overwrites the shared h->out.bs counter: 1 or 4 */
+    block_residual_write_cavlc( h, &h->out.bs, i4, h->dct.block[i4].luma4x4, 16 ); /* presumably adds the residual's bits to the same counter -- TODO confirm */
+    return h->out.bs.i_bits_encoded;
+}
+#endif
diff --git a/encoder/encoder.c b/encoder/encoder.c
index 97f6e363..59a049c8 100644
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -357,6 +357,7 @@ static int x264_validate_parameters( x264_t *h )
         h->param.analyse.i_trellis = 0;
         h->param.analyse.b_fast_pskip = 0;
         h->param.analyse.i_noise_reduction = 0;
+        h->param.analyse.i_subpel_refine = x264_clip3( h->param.analyse.i_subpel_refine, 1, 6 );
     }
 
     if( ( h->param.i_width % 16 || h->param.i_height % 16 ) && !h->mb.b_lossless )
@@ -394,7 +395,7 @@ static int x264_validate_parameters( x264_t *h )
         h->param.analyse.i_me_range = 4;
     if( h->param.analyse.i_me_range > 16 && h->param.analyse.i_me_method <= X264_ME_HEX )
         h->param.analyse.i_me_range = 16;
-    h->param.analyse.i_subpel_refine = x264_clip3( h->param.analyse.i_subpel_refine, 1, 6 );
+    h->param.analyse.i_subpel_refine = x264_clip3( h->param.analyse.i_subpel_refine, 1, 7 );
     h->param.analyse.b_bframe_rdo = h->param.analyse.b_bframe_rdo && h->param.analyse.i_subpel_refine >= 6;
     h->param.analyse.b_mixed_references = h->param.analyse.b_mixed_references && h->param.i_frame_reference > 1;
     h->param.analyse.inter &= X264_ANALYSE_PSUB16x16|X264_ANALYSE_PSUB8x8|X264_ANALYSE_BSUB16x16|
diff --git a/encoder/macroblock.c b/encoder/macroblock.c
index 3cc6716a..15220e38 100644
--- a/encoder/macroblock.c
+++ b/encoder/macroblock.c
@@ -849,3 +849,88 @@ void x264_denoise_dct( x264_t *h, int16_t *dct )
         }
     }
 }
+
+/*****************************************************************************
+ * RD only: encodes luma 8x8 block i8 and the matching 4x4 of both chroma planes.
+ * 4 calls don't make up for one macroblock_encode; chroma dc isn't transformed.
+ *****************************************************************************/
+void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
+{
+    int i_qp = h->mb.i_qp;
+    uint8_t *p_fenc = h->mb.pic.p_fenc[0] + (i8&1)*8 + (i8>>1)*8*FENC_STRIDE; /* i8&1 = column, i8>>1 = row of the 8x8 */
+    uint8_t *p_fdec = h->mb.pic.p_fdec[0] + (i8&1)*8 + (i8>>1)*8*FDEC_STRIDE;
+    int i_decimate_8x8 = 0;
+    int nnz8x8 = 1; /* cleared below when the luma 8x8 gets zeroed */
+    int ch;
+
+    x264_mb_mc_8x8( h, i8 );
+
+    if( h->mb.b_transform_8x8 )
+    { /* a single 8x8 transform */
+        int16_t dct8x8[8][8];
+        h->dctf.sub8x8_dct8( dct8x8, p_fenc, FENC_STRIDE, p_fdec, FDEC_STRIDE );
+
+        quant_8x8( h, dct8x8, h->quant8_mf[CQM_8PY], i_qp, 0 );
+        scan_zigzag_8x8full( h->dct.luma8x8[i8], dct8x8 );
+        i_decimate_8x8 = x264_mb_decimate_score( h->dct.luma8x8[i8], 64 );
+
+        if( i_decimate_8x8 < 4 ) /* low decimate score: zero the whole 8x8 */
+        {
+            memset( h->dct.luma8x8[i8], 0, sizeof(h->dct.luma8x8[i8]) );
+            nnz8x8 = 0;
+        }
+        if( nnz8x8 ) /* otherwise add the dequantized residual back onto p_fdec */
+        {
+            h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[CQM_8PY], i_qp );
+            h->dctf.add8x8_idct8( p_fdec, FDEC_STRIDE, dct8x8 );
+        }
+    }
+    else
+    { /* four 4x4 transforms; their decimate scores are summed over the 8x8 */
+        int i4, idx;
+        int16_t dct4x4[4][4][4];
+        h->dctf.sub8x8_dct( dct4x4, p_fenc, FENC_STRIDE, p_fdec, FDEC_STRIDE );
+
+        for( i4 = 0; i4 < 4; i4++ )
+        {
+            idx = i8 * 4 + i4; /* index into h->dct.block[] */
+
+            quant_4x4( h, dct4x4[i4], h->quant4_mf[CQM_4PY], i_qp, 0 );
+            scan_zigzag_4x4full( h->dct.block[idx].luma4x4, dct4x4[i4] );
+            i_decimate_8x8 += x264_mb_decimate_score( h->dct.block[idx].luma4x4, 16 );
+        }
+
+        if( i_decimate_8x8 < 4 ) /* same threshold as the 8x8 transform path */
+        {
+            memset( &h->dct.block[i8*4], 0, 4 * sizeof(*h->dct.block) ); /* clears all four h->dct.block entries of this 8x8 */
+            nnz8x8 = 0;
+        }
+        if( nnz8x8 )
+        {
+            for( i4 = 0; i4 < 4; i4++ )
+                 h->quantf.dequant_4x4( dct4x4[i4], h->dequant4_mf[CQM_4PY], i_qp );
+            h->dctf.add8x8_idct( p_fdec, FDEC_STRIDE, dct4x4 );
+        }
+    }
+
+    i_qp = i_chroma_qp_table[x264_clip3( i_qp + h->pps->i_chroma_qp_index_offset, 0, 51 )]; /* from here on i_qp is the chroma qp */
+
+    for( ch = 0; ch < 2; ch++ ) /* planes 1 and 2: one 4x4 block each */
+    {
+        int16_t dct4x4[4][4];
+        p_fenc = h->mb.pic.p_fenc[1+ch] + (i8&1)*4 + (i8>>1)*4*FENC_STRIDE;
+        p_fdec = h->mb.pic.p_fdec[1+ch] + (i8&1)*4 + (i8>>1)*4*FDEC_STRIDE;
+
+        h->dctf.sub4x4_dct( dct4x4, p_fenc, FENC_STRIDE, p_fdec, FDEC_STRIDE );
+        quant_4x4( h, dct4x4, h->quant4_mf[CQM_4PC], i_qp, 0 );
+        scan_zigzag_4x4( h->dct.block[16+i8+ch*4].residual_ac, dct4x4 );
+        h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[CQM_4PC], i_qp ); /* no decimation test here: chroma is always reconstructed */
+        h->dctf.add4x4_idct( p_fdec, FDEC_STRIDE, dct4x4 );
+    }
+
+    if( nnz8x8 ) /* keep bit i8 of the luma cbp in sync with nnz8x8 */
+        h->mb.i_cbp_luma |= (1 << i8);
+    else
+        h->mb.i_cbp_luma &= ~(1 << i8);
+    h->mb.i_cbp_chroma = 0x02; /* set unconditionally, whatever the chroma coefs are */
+}
diff --git a/encoder/macroblock.h b/encoder/macroblock.h
index f856fd3b..e324bbb0 100644
--- a/encoder/macroblock.h
+++ b/encoder/macroblock.h
@@ -39,6 +39,8 @@ void x264_macroblock_encode      ( x264_t *h );
 void x264_macroblock_write_cabac ( x264_t *h, x264_cabac_t *cb );
 void x264_macroblock_write_cavlc ( x264_t *h, bs_t *s );
 
+void x264_macroblock_encode_p8x8( x264_t *h, int i8 ); /* RD only: one luma 8x8 + its chroma 4x4s */
+
 void x264_cabac_mb_skip( x264_t *h, int b_skip );
 
 void x264_quant_4x4_trellis( x264_t *h, int16_t dct[4][4], int i_quant_cat,
diff --git a/encoder/me.c b/encoder/me.c
index 822baa0c..66bfdfc9 100644
--- a/encoder/me.c
+++ b/encoder/me.c
@@ -39,6 +39,7 @@ static const int subpel_iterations[][4] =
     {0,2,1,0},
     {0,2,1,1},
     {0,2,1,2},
+    {0,0,2,2},
     {0,0,2,2}};
 
 static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_iters, int *p_halfpel_thresh, int b_refine_qpel );
@@ -712,3 +713,98 @@ int x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_weight
     m1->mv[1] = bm1y;
     return bcost;
 }
+
+#define COST_MV_RD( mx, my, dir ) \
+{ /* RD-test candidate mv (mx,my); reads and updates the locals of x264_me_refine_qpel_rd */ \
+    if( (dir^1) != odir && (dir<0 || !p_visited[(mx)+(my)*16]) ) /* dir<0: neither consult nor mark the visited map */ \
+    { \
+        int cost; \
+        cache_mv[0] = cache_mv2[0] = mx; /* candidate goes into the mv cache: x264_rd_cost_part takes no mv argument */ \
+        cache_mv[1] = cache_mv2[1] = my; \
+        cost = x264_rd_cost_part( h, i_lambda2, i8, m->i_pixel ); \
+        if( cost < bcost ) \
+        {                  \
+            bcost = cost;  \
+            bmx = mx;      \
+            bmy = my;      \
+        } /* NOTE(review): bdir is not updated here, so the caller's odir stays -1 and the odir test never rejects -- confirm intent */ \
+        if(dir>=0) p_visited[(mx)+(my)*16] = 1; \
+    } \
+}
+/* Subpel refinement of m->mv for partition i8 by RD cost; updates m->mv, m->cost and the mv/mvd caches. */
+void x264_me_refine_qpel_rd( x264_t *h, x264_me_t *m, int i_lambda2, int i8 )
+{
+    // don't have to fill the whole mv cache rectangle
+    static const int pixel_mv_offs[] = { 0, 4, 4*8, 0 }; // indexed by m->i_pixel: int16_t offset of the 2nd cache entry COST_MV_RD writes
+    int16_t *cache_mv = h->mb.cache.mv[0][x264_scan8[i8*4]];
+    int16_t *cache_mv2 = cache_mv + pixel_mv_offs[m->i_pixel];
+    const int bw = x264_pixel_size[m->i_pixel].w>>2; // partition width/height divided by 4
+    const int bh = x264_pixel_size[m->i_pixel].h>>2;
+
+    int bcost = m->i_pixel == PIXEL_16x16 ? m->cost : COST_MAX; // 16x16 starts from the incoming m->cost; others are costed below
+    int bmx = m->mv[0];
+    int bmy = m->mv[1];
+    int omx, omy, i;
+    int odir = -1, bdir;
+
+    int visited[16*13] = {0}; // only need 13x13, but 16 is more convenient
+    int *p_visited = &visited[6+6*16]; // centre cell; the search moves at most 2*2 + 2*1 = 6 either way
+
+    if( m->i_pixel != PIXEL_16x16 )
+    {
+        COST_MV_RD( bmx, bmy, -1 );
+        x264_mb_predict_mv( h, 0, i8*4, bw, m->mvp ); // overwrites m->mvp
+    }
+
+    /* check the predicted mv */
+    if( bmx != m->mvp[0] || bmy != m->mvp[1] )
+        COST_MV_RD( m->mvp[0], m->mvp[1], -1 );
+
+    /* mark mv and mvp as visited */
+    p_visited[0] = 1;
+    p_visited -= bmx + bmy*16; // rebase so the map is indexed by absolute (mx,my); the pointer itself may leave visited[]
+    {
+        int mx = bmx ^ m->mv[0] ^ m->mvp[0]; // bm is either mv or mvp here, so the xor yields the other one
+        int my = bmy ^ m->mv[1] ^ m->mvp[1];
+        if( abs(mx-bmx) < 7 && abs(my-bmy) < 7 ) // only if it falls inside the 13x13 map
+            p_visited[mx + my*16] = 1;
+    }
+
+    /* hpel */
+    bdir = -1;
+    for( i = 0; i < 2; i++ ) // at most two rounds, step 2
+    {
+         omx = bmx;
+         omy = bmy;
+         odir = bdir;
+         COST_MV_RD( omx, omy - 2, 0 );
+         COST_MV_RD( omx, omy + 2, 1 );
+         COST_MV_RD( omx - 2, omy, 2 );
+         COST_MV_RD( omx + 2, omy, 3 );
+         if( bmx == omx && bmy == omy ) // no neighbour beat the centre
+            break;
+    }
+
+    /* qpel */
+    bdir = -1;
+    for( i = 0; i < 2; i++ ) // at most two rounds, step 1
+    {
+         omx = bmx;
+         omy = bmy;
+         odir = bdir;
+         COST_MV_RD( omx, omy - 1, 0 );
+         COST_MV_RD( omx, omy + 1, 1 );
+         COST_MV_RD( omx - 1, omy, 2 );
+         COST_MV_RD( omx + 1, omy, 3 );
+         if( bmx == omx && bmy == omy ) // no neighbour beat the centre
+            break;
+    }
+
+    m->cost = bcost;
+    m->mv[0] = bmx;
+    m->mv[1] = bmy;
+
+    x264_macroblock_cache_mv ( h, 2*(i8&1), i8&2, bw, bh, 0, bmx, bmy ); // store the winner for the bw x bh partition
+    x264_macroblock_cache_mvd( h, 2*(i8&1), i8&2, bw, bh, 0, bmx - m->mvp[0], bmy - m->mvp[1] );
+}
+
diff --git a/encoder/me.h b/encoder/me.h
index 03678c82..8c640a97 100644
--- a/encoder/me.h
+++ b/encoder/me.h
@@ -52,6 +52,8 @@ static inline void x264_me_search( x264_t *h, x264_me_t *m, int (*mvc)[2], int i
     { x264_me_search_ref( h, m, mvc, i_mvc, NULL ); }
 
 void x264_me_refine_qpel( x264_t *h, x264_me_t *m );
+void x264_me_refine_qpel_rd( x264_t *h, x264_me_t *m, int i_lambda2, int i8 ); /* subpel refinement by RD cost */
 int x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_weight );
+int x264_rd_cost_part( x264_t *h, int i_lambda2, int i8, int i_pixel ); /* defined in rdo.c */
 
 #endif
diff --git a/encoder/rdo.c b/encoder/rdo.c
index ba1397d6..480d5ec4 100644
--- a/encoder/rdo.c
+++ b/encoder/rdo.c
@@ -57,6 +57,12 @@ static int ssd_mb( x264_t *h )
                                      h->mb.pic.p_fdec[2], FDEC_STRIDE );
 }
 
+static int ssd_plane( x264_t *h, int size, int p, int x, int y )
+{   /* ssd[size] of fenc against fdec in plane p, block origin at pixel (x,y) */
+    const int o_enc = x + y*FENC_STRIDE, o_dec = x + y*FDEC_STRIDE;
+    return h->pixf.ssd[size]( h->mb.pic.p_fenc[p]+o_enc, FENC_STRIDE, h->mb.pic.p_fdec[p]+o_dec, FDEC_STRIDE );
+}
+
 static int x264_rd_cost_mb( x264_t *h, int i_lambda2 )
 {
     int b_transform_bak = h->mb.b_transform_8x8;
@@ -91,6 +97,86 @@ static int x264_rd_cost_mb( x264_t *h, int i_lambda2 )
     return i_ssd + i_bits;
 }
 
+int x264_rd_cost_part( x264_t *h, int i_lambda2, int i8, int i_pixel )
+{
+    /* RD cost (ssd + lambda2-weighted size) of the partition of shape i_pixel that starts at 8x8 block i8. */
+    const int lx = (i8&1)*8, ly = (i8>>1)*8;   /* plane-0 origin of block i8 */
+    x264_cabac_t cabac_tmp;
+    int i_dist;
+
+    /* 16x16 is the whole macroblock: use the mb cost, restoring i_type afterwards */
+    if( i_pixel == PIXEL_16x16 )
+    {
+        const int i_type_saved = h->mb.i_type;
+        const int i_mb_cost = x264_rd_cost_mb( h, i_lambda2 );
+        h->mb.i_type = i_type_saved;
+        return i_mb_cost;
+    }
+
+    /* encode block i8, plus its right (16x8) or lower (8x16) neighbour */
+    x264_macroblock_encode_p8x8( h, i8 );
+    if( i_pixel == PIXEL_16x8 )
+        x264_macroblock_encode_p8x8( h, i8+1 );
+    else if( i_pixel == PIXEL_8x16 )
+        x264_macroblock_encode_p8x8( h, i8+2 );
+
+    /* distortion: plane 0 at size i_pixel, planes 1 and 2 at size i_pixel+3 and half the offset */
+    i_dist  = ssd_plane( h, i_pixel,   0, lx,   ly );
+    i_dist += ssd_plane( h, i_pixel+3, 1, lx/2, ly/2 );
+    i_dist += ssd_plane( h, i_pixel+3, 2, lx/2, ly/2 );
+
+    if( !h->param.b_cabac )
+        return i_dist + x264_partition_size_cavlc( h, i8, i_pixel ) * i_lambda2;
+
+    cabac_tmp = h->cabac;   /* the size is counted on a scratch copy of h->cabac */
+    cabac_tmp.f8_bits_encoded = 0;
+    x264_partition_size_cabac( h, &cabac_tmp, i8, i_pixel );
+    return i_dist + ( ( cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8 );
+}
+
+int x264_rd_cost_i8x8( x264_t *h, int i_lambda2, int i8, int i_mode )
+{
+    /* RD cost of intra 8x8 block i8: luma ssd + lambda2-weighted size; i_mode only enters the size. */
+    x264_cabac_t cabac_tmp;
+    int i_dist, i_rate;
+
+    /* encode at the macroblock qp, then measure the 8x8 error in plane 0 */
+    x264_mb_encode_i8x8( h, i8, h->mb.i_qp );
+    i_dist = ssd_plane( h, PIXEL_8x8, 0, 8*(i8&1), 8*(i8>>1) );
+
+    /* cavlc: the size function's result is multiplied by lambda2 as is */
+    if( !h->param.b_cabac )
+        return i_dist + x264_partition_i8x8_size_cavlc( h, i8, i_mode ) * i_lambda2;
+
+    /* cabac: count on a scratch copy of h->cabac; the weighted f8 count is rounded and shifted down 8 bits */
+    cabac_tmp = h->cabac;
+    cabac_tmp.f8_bits_encoded = 0;
+    x264_partition_i8x8_size_cabac( h, &cabac_tmp, i8, i_mode );
+    i_rate = ( cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8;
+    return i_dist + i_rate;
+}
+
+int x264_rd_cost_i4x4( x264_t *h, int i_lambda2, int i4, int i_mode )
+{
+    /* RD cost of intra 4x4 block i4: luma ssd + lambda2-weighted size; i_mode only enters the size. */
+    x264_cabac_t cabac_tmp;
+    int i_dist, i_rate;
+
+    /* encode at the macroblock qp, then measure the 4x4 error in plane 0 */
+    x264_mb_encode_i4x4( h, i4, h->mb.i_qp );
+    i_dist = ssd_plane( h, PIXEL_4x4, 0, 4*block_idx_x[i4], 4*block_idx_y[i4] );
+
+    /* cavlc: the size function's result is multiplied by lambda2 as is */
+    if( !h->param.b_cabac )
+        return i_dist + x264_partition_i4x4_size_cavlc( h, i4, i_mode ) * i_lambda2;
+
+    /* cabac: count on a scratch copy of h->cabac; the weighted f8 count is rounded and shifted down 8 bits */
+    cabac_tmp = h->cabac;
+    cabac_tmp.f8_bits_encoded = 0;
+    x264_partition_i4x4_size_cabac( h, &cabac_tmp, i4, i_mode );
+    i_rate = ( cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8;
+    return i_dist + i_rate;
+}
 
 /****************************************************************************
  * Trellis RD quantization
diff --git a/x264.c b/x264.c
index 10f40415..eb277697 100644
--- a/x264.c
+++ b/x264.c
@@ -209,7 +209,7 @@ static void Help( x264_param_t *defaults )
              "                                  - esa: exhaustive search (slow)\n"
              "      --merange <integer>     Maximum motion vector search range [%d]\n"
              "  -m, --subme <integer>       Subpixel motion estimation and partition\n"
-             "                                  decision quality: 1=fast, 6=best. [%d]\n"
+             "                                  decision quality: 1=fast, 7=best. [%d]\n"
              "      --b-rdo                 RD based mode decision for B-frames. Requires subme 6.\n"
              "      --mixed-refs            Decide references on a per partition basis\n"
              "      --no-chroma-me          Ignore chroma in motion estimation\n"
-- 
2.40.0