From: Loren Merritt <pengvado@videolan.org>
Date: Fri, 17 Dec 2004 10:57:02 +0000 (+0000)
Subject: implement macroblock types B_SKIP, B_DIRECT, B_8x8
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=199ff7406b76dc1c10b756053398bf8a834bcf5c;p=libx264

implement macroblock types B_SKIP, B_DIRECT, B_8x8


git-svn-id: svn://svn.videolan.org/x264/trunk@68 df754926-b1dd-0310-bc7b-ec298dee348c
---

diff --git a/common/common.c b/common/common.c
index 1d64d2ed..243396fa 100644
--- a/common/common.c
+++ b/common/common.c
@@ -79,7 +79,7 @@ void    x264_param_default( x264_param_t *param )
     param->rc.i_qp_max = 51;
     param->rc.i_qp_step = 4;
     param->rc.f_ip_factor = 1.4;
-    param->rc.f_pb_factor = 1.4;
+    param->rc.f_pb_factor = 1.3;
 
     param->rc.b_stat_write = 0;
     param->rc.psz_stat_out = "x264_2pass.log";
@@ -97,7 +97,8 @@ void    x264_param_default( x264_param_t *param )
 
     /* */
     param->analyse.intra = X264_ANALYSE_I4x4;
-    param->analyse.inter = X264_ANALYSE_I4x4 | X264_ANALYSE_PSUB16x16;
+    param->analyse.inter = X264_ANALYSE_I4x4 | X264_ANALYSE_PSUB16x16 | X264_ANALYSE_BSUB16x16;
+    param->analyse.i_direct_mv_pred = X264_DIRECT_PRED_TEMPORAL;
     param->analyse.i_subpel_refine = 1;
     param->analyse.b_psnr = 1;
 }
diff --git a/common/common.h b/common/common.h
index 1aacd57d..c60fdfdd 100644
--- a/common/common.h
+++ b/common/common.h
@@ -250,6 +250,8 @@ struct x264_t
     /* MB table and cache for current frame/mb */
     struct
     {
+        int     i_mb_count;                 /* number of mbs in a frame */
+
         /* Strides */
         int     i_mb_stride;
 
@@ -270,7 +272,15 @@ struct x264_t
         int16_t (*mv[2])[2];                /* mb mv. set to 0 for intra mb */
         int16_t (*mvd[2])[2];               /* mb mv difference with predict. set to 0 if intra. cabac only */
         int8_t   *ref[2];                   /* mb ref. set to -1 if non used (intra or Lx only) */
-        int16_t (*mvr[2][16])[2];           /* mb mv for each possible ref */
+        int16_t (*mvr[2][16])[2];           /* 16x16 mv for each possible ref */
+        int8_t  *skipbp;                    /* block pattern for SKIP or DIRECT (sub)mbs. B-frames + cabac only */
+
+        /* for B_SKIP and B_DIRECT motion prediction */
+        struct
+        {
+            int16_t (*mv)[2];               /* keep only L0 */
+            int8_t   *ref;
+        } list1ref0;
 
         /* current value */
         int     i_type;
@@ -313,6 +323,12 @@ struct x264_t
             /* 0 if non avaible */
             int16_t mv[2][X264_SCAN8_SIZE][2];
             int16_t mvd[2][X264_SCAN8_SIZE][2];
+
+            /* 1 if SKIP or DIRECT. set only for B-frames + CABAC */
+            int8_t  skip[X264_SCAN8_SIZE];
+
+            int16_t direct_mv[2][X264_SCAN8_SIZE][2];
+            int8_t  direct_ref[2][X264_SCAN8_SIZE];
         } cache;
 
         /* */
diff --git a/common/macroblock.c b/common/macroblock.c
index 77982488..356fc6bf 100644
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -349,6 +349,183 @@ void x264_mb_predict_mv_pskip( x264_t *h, int mv[2] )
     }
 }
 
+/* Temporal direct prediction for B_SKIP/B_DIRECT: fill h->mb.cache ref/mv of both lists from the
+ * co-located refs/mvs saved in h->mb.list1ref0, scaled by POC distance. Always returns 1. */
+static int x264_mb_predict_mv_direct16x16_temporal( x264_t *h )
+{
+    int i_mb_4x4 = 16 * h->mb.i_mb_stride * h->mb.i_mb_y + 4 * h->mb.i_mb_x;
+    int i_mb_8x8 =  4 * h->mb.i_mb_stride * h->mb.i_mb_y + 2 * h->mb.i_mb_x;
+    int i;
+    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, 0 ); /* list1 ref is 0 for the whole mb */
+
+    for( i = 0; i < 4; i++ )
+    {
+        const int x8 = 2*(i%2);
+        const int y8 = 2*(i/2);
+        /* TODO: MapColToList0 */
+        const int i_ref = h->mb.list1ref0.ref[ i_mb_8x8 + x8/2 + y8 * h->mb.i_mb_stride ];
+
+        if( i_ref == -1 ) /* co-located 8x8 has no L0 ref: ref 0 and zero mv in both lists */
+        {
+            x264_macroblock_cache_ref( h, x8, y8, 2, 2, 0, 0 );
+            x264_macroblock_cache_mv(  h, x8, y8, 2, 2, 0, 0, 0 );
+            x264_macroblock_cache_mv(  h, x8, y8, 2, 2, 1, 0, 0 );
+        }
+        else
+        {
+            int tb = x264_clip3( h->fdec->i_poc     - h->fref0[i_ref]->i_poc, -128, 127 );
+            int td = x264_clip3( h->fref1[0]->i_poc - h->fref0[i_ref]->i_poc, -128, 127 );
+            /* guard the division: with td == 0 the scale factor is never used (see below) */
+            int tx = td ? (16384 + (abs(td) >> 1)) / td : 0;
+            int dist_scale_factor = x264_clip3( (tb * tx + 32) >> 6, -1024, 1023 );
+            int x4, y4;
+
+            x264_macroblock_cache_ref( h, x8, y8, 2, 2, 0, i_ref );
+            for( y4 = y8; y4 < y8+2; y4++ )
+                for( x4 = x8; x4 < x8+2; x4++ )
+                {
+                    const int16_t *mv_col = h->mb.list1ref0.mv[ i_mb_4x4 + x4 + y4 * 4 * h->mb.i_mb_stride ];
+                    if( td == 0 /* || pic0 is a long-term ref */ )
+                    {
+                        x264_macroblock_cache_mv( h, x4, y4, 1, 1, 0, mv_col[0], mv_col[1] );
+                        x264_macroblock_cache_mv( h, x4, y4, 1, 1, 1, 0, 0 );
+                    }
+                    else
+                    {
+                        int mv_l0[2];
+                        mv_l0[0] = ( dist_scale_factor * mv_col[0] + 128 ) >> 8;
+                        mv_l0[1] = ( dist_scale_factor * mv_col[1] + 128 ) >> 8;
+                        x264_macroblock_cache_mv( h, x4, y4, 1, 1, 0, mv_l0[0], mv_l0[1] );
+                        x264_macroblock_cache_mv( h, x4, y4, 1, 1, 1, mv_l0[0] - mv_col[0], mv_l0[1] - mv_col[1] );
+                    }
+                }
+        }
+    }
+    return 1;
+}
+
+/* Spatial direct prediction: choose one ref and mv per list from the neighbours' cached refs, then zero the mv of 4x4 blocks whose co-located block has ref 0 and a near-zero mv. Always returns 1. */
+static int x264_mb_predict_mv_direct16x16_spatial( x264_t *h )
+{
+    int ref[2];
+    int mv[2][2];
+    int i_list;
+    int i8, i4;
+    const int s8x8 = 2 * h->mb.i_mb_stride; /* row stride of the per-8x8 ref array */
+    const int s4x4 = 4 * h->mb.i_mb_stride; /* row stride of the per-4x4 mv array */
+    const int8_t *l1ref = &h->mb.list1ref0.ref[ 2*h->mb.i_mb_x + 2*s8x8*h->mb.i_mb_y ]; /* co-located refs of this mb */
+    const int16_t (*l1mv)[2] = (const int16_t (*)[2])
+        &h->mb.list1ref0.mv[ 4*h->mb.i_mb_x + 4*s4x4*h->mb.i_mb_y ]; /* co-located mvs of this mb */
+
+    for( i_list=0; i_list<2; i_list++ )
+    {
+        int i_refa = h->mb.cache.ref[i_list][X264_SCAN8_0 - 1]; /* left neighbour */
+        int i_refb = h->mb.cache.ref[i_list][X264_SCAN8_0 - 8]; /* top neighbour */
+        int i_refc = h->mb.cache.ref[i_list][X264_SCAN8_0 - 8 + 4]; /* top-right neighbour */
+        if( i_refc == -2 ) /* -2: entry not available, fall back to top-left */
+            i_refc = h->mb.cache.ref[i_list][X264_SCAN8_0 - 8 - 1];
+
+        ref[i_list] = i_refa; /* below: keep the smallest non-negative ref of a, b, c */
+        if( ref[i_list] < 0 || ( i_refb < ref[i_list] && i_refb >= 0 ))
+            ref[i_list] = i_refb;
+        if( ref[i_list] < 0 || ( i_refc < ref[i_list] && i_refc >= 0 ))
+            ref[i_list] = i_refc;
+        if( ref[i_list] < 0 ) /* no neighbour uses this list: normalise -2 to -1 */
+            ref[i_list] = -1;
+    }
+
+    if( ref[0] < 0 && ref[1] < 0 ) /* neither list is used by any neighbour: ref 0 and zero mv in both */
+    {
+        ref[0] = 
+        ref[1] = 0;
+        mv[0][0] = 
+        mv[0][1] = 
+        mv[1][0] = 
+        mv[1][1] = 0;
+    }
+    else
+    {
+        for( i_list=0; i_list<2; i_list++ )
+        {
+            if( ref[i_list] >= 0 ) /* list in use: regular 16x16 mv prediction for the chosen ref */
+                x264_mb_predict_mv_16x16( h, i_list, ref[i_list], mv[i_list] );
+            else
+                mv[i_list][0] = mv[i_list][1] = 0;
+        }
+    }
+
+    /* FIXME: clip mv ? */
+    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, ref[0] ); /* same ref and mv for the whole mb in each list */
+    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, ref[1] );
+    x264_macroblock_cache_mv(  h, 0, 0, 4, 4, 0, mv[0][0], mv[0][1] );
+    x264_macroblock_cache_mv(  h, 0, 0, 4, 4, 1, mv[1][0], mv[1][1] );
+
+    /* col_zero_flag */
+    for( i8=0; i8<4; i8++ )
+    {
+        const int x8 = i8%2;
+        const int y8 = i8/2;
+        if( l1ref[ x8 + y8*s8x8 ] == 0 ) /* co-located 8x8 block references ref 0 */
+        {
+            for( i4=0; i4<4; i4++ )
+            {
+                const int x4 = i4%2 + 2*x8;
+                const int y4 = i4/2 + 2*y8;
+                const int16_t *mvcol = l1mv[x4 + y4*s4x4];
+                if( abs( mvcol[0] ) <= 1 && abs( mvcol[1] ) <= 1 ) /* co-located mv within +-1 in both components */
+                {
+                    if( ref[0] == 0 ) /* only lists that use ref 0 get their mv zeroed */
+                        x264_macroblock_cache_mv( h, x4, y4, 1, 1, 0, 0, 0 );
+                    if( ref[1] == 0 )
+                        x264_macroblock_cache_mv( h, x4, y4, 1, 1, 1, 0, 0 );
+                }
+            }
+        }
+    }
+
+    return 1;
+}
+
+/* Run the direct mv prediction selected by the encoder settings and, on success, snapshot the
+ * resulting refs and mvs into h->mb.cache.direct_ref / direct_mv. Returns 0 when direct is disabled. */
+int x264_mb_predict_mv_direct16x16( x264_t *h )
+{
+    int b_ok, i8;
+
+    if( h->param.analyse.i_direct_mv_pred == X264_DIRECT_PRED_NONE )
+        return 0;
+    b_ok = h->sh.b_direct_spatial_mv_pred ? x264_mb_predict_mv_direct16x16_spatial( h )
+                                          : x264_mb_predict_mv_direct16x16_temporal( h );
+    if( !b_ok )
+        return b_ok;
+
+    /* one ref per 8x8 block (read at its first 4x4), plus a copy of the whole mv cache */
+    for( i8 = 0; i8 < 4; i8++ )
+    {
+        h->mb.cache.direct_ref[0][i8] = h->mb.cache.ref[0][x264_scan8[i8*4]];
+        h->mb.cache.direct_ref[1][i8] = h->mb.cache.ref[1][x264_scan8[i8*4]];
+    }
+    memcpy( h->mb.cache.direct_mv, h->mb.cache.mv, sizeof(h->mb.cache.mv) );
+    return b_ok;
+}
+
+/* Restore ref and mvs of 8x8 block idx, both lists, from the direct_ref/direct_mv snapshot into the live cache. */
+void x264_mb_load_mv_direct8x8( x264_t *h, int idx )
+{
+    const int x = 2*(idx%2);
+    const int y = 2*(idx/2);
+    const int i8 = x264_scan8[idx*4];
+    int l;
+    x264_macroblock_cache_ref( h, x, y, 2, 2, 0, h->mb.cache.direct_ref[0][idx] );
+    x264_macroblock_cache_ref( h, x, y, 2, 2, 1, h->mb.cache.direct_ref[1][idx] );
+    for( l = 0; l < 2; l++ ) /* two rows of two mvs each; memcpy replaces the uint64_t punning (aliasing/alignment safe) */
+    {
+        memcpy( h->mb.cache.mv[l][i8],   h->mb.cache.direct_mv[l][i8],   2 * sizeof( h->mb.cache.mv[l][i8] ) );
+        memcpy( h->mb.cache.mv[l][i8+8], h->mb.cache.direct_mv[l][i8+8], 2 * sizeof( h->mb.cache.mv[l][i8] ) );
+    }
+}
+
+/* This just improves encoder performance, it's not part of the spec */
 void x264_mb_predict_mv_ref16x16( x264_t *h, int i_list, int i_ref, int mvc[4][2], int *i_mvc )
 {
     int16_t (*mvr)[2] = h->mb.mvr[i_list][i_ref];
@@ -469,6 +646,49 @@ static inline void x264_mb_mc_01xywh( x264_t *h, int x, int y, int width, int he
     h->pixf.avg[i_mode]( &h->mb.pic.p_fdec[2][2*y*h->mb.pic.i_stride[2]+2*x], h->mb.pic.i_stride[2], tmp, 16 );
 }
 
+/* x264_mb_mc_direct8x8:
+ *      motion-compensate one direct-predicted 8x8 block.
+ *      x, y: origin of the block inside the mb, in 4x4 units.
+ *      The lists to use are read from the refs cached at the block's
+ *      top-left 4x4:
+ *          both refs >= 0   -> bi-prediction
+ *          only L0 ref >= 0 -> L0
+ *          otherwise        -> L1 */
+static void x264_mb_mc_direct8x8( x264_t *h, int x, int y )
+{
+    const int i8 = x264_scan8[0] + x + 8*y;
+    const int b_l0 = h->mb.cache.ref[0][i8] >= 0;
+    const int b_l1 = h->mb.cache.ref[1][i8] >= 0;
+
+    /* FIXME: optimize based on current block size, not global settings? */
+    /* b_direct8x8_inference set: a single 8x8 call.
+     * otherwise: four 4x4 calls in raster order. */
+    const int i_size = h->sps->b_direct8x8_inference ? 2 : 1;
+    int dx, dy;
+
+    for( dy = 0; dy < 2; dy += i_size )
+    {
+        for( dx = 0; dx < 2; dx += i_size )
+        {
+            const int bx = x + dx;
+            const int by = y + dy;
+
+            if( b_l0 && b_l1 )
+            {
+                x264_mb_mc_01xywh( h, bx, by, i_size, i_size );
+            }
+            else if( b_l0 )
+            {
+                x264_mb_mc_0xywh( h, bx, by, i_size, i_size );
+            }
+            else
+            {
+                /* also taken when neither list has a ref */
+                x264_mb_mc_1xywh( h, bx, by, i_size, i_size );
+            }
+        }
+    }
+}
 
 void x264_mb_mc( x264_t *h )
 {
@@ -489,7 +709,7 @@ void x264_mb_mc( x264_t *h )
             x264_mb_mc_0xywh( h, 2, 0, 2, 4 );
         }
     }
-    else if( h->mb.i_type == P_8x8 )
+    else if( h->mb.i_type == P_8x8 || h->mb.i_type == B_8x8 )
     {
         int i;
         for( i = 0; i < 4; i++ )
@@ -515,13 +735,55 @@ void x264_mb_mc( x264_t *h )
                     x264_mb_mc_0xywh( h, x+0, y+1, 1, 1 );
                     x264_mb_mc_0xywh( h, x+1, y+1, 1, 1 );
                     break;
+                case D_L1_8x8:
+                    x264_mb_mc_1xywh( h, x, y, 2, 2 );
+                    break;
+                case D_L1_8x4:
+                    x264_mb_mc_1xywh( h, x, y+0, 2, 1 );
+                    x264_mb_mc_1xywh( h, x, y+1, 2, 1 );
+                    break;
+                case D_L1_4x8:
+                    x264_mb_mc_1xywh( h, x+0, y, 1, 2 );
+                    x264_mb_mc_1xywh( h, x+1, y, 1, 2 );
+                    break;
+                case D_L1_4x4:
+                    x264_mb_mc_1xywh( h, x+0, y+0, 1, 1 );
+                    x264_mb_mc_1xywh( h, x+1, y+0, 1, 1 );
+                    x264_mb_mc_1xywh( h, x+0, y+1, 1, 1 );
+                    x264_mb_mc_1xywh( h, x+1, y+1, 1, 1 );
+                    break;
+                case D_BI_8x8:
+                    x264_mb_mc_01xywh( h, x, y, 2, 2 );
+                    break;
+                case D_BI_8x4:
+                    x264_mb_mc_01xywh( h, x, y+0, 2, 1 );
+                    x264_mb_mc_01xywh( h, x, y+1, 2, 1 );
+                    break;
+                case D_BI_4x8:
+                    x264_mb_mc_01xywh( h, x+0, y, 1, 2 );
+                    x264_mb_mc_01xywh( h, x+1, y, 1, 2 );
+                    break;
+                case D_BI_4x4:
+                    x264_mb_mc_01xywh( h, x+0, y+0, 1, 1 );
+                    x264_mb_mc_01xywh( h, x+1, y+0, 1, 1 );
+                    x264_mb_mc_01xywh( h, x+0, y+1, 1, 1 );
+                    x264_mb_mc_01xywh( h, x+1, y+1, 1, 1 );
+                    break;
+                case D_DIRECT_8x8:
+                    x264_mb_mc_direct8x8( h, x, y );
+                    break;
             }
         }
     }
-    else if( h->mb.i_type == B_8x8 || h->mb.i_type == B_DIRECT )
+    else if( h->mb.i_type == B_SKIP || h->mb.i_type == B_DIRECT )
     {
-        x264_log( h, X264_LOG_ERROR, "mc_luma with unsupported mb\n" );
-        return;
+        int i;
+        for( i = 0; i < 4; i++ )
+        {
+            const int x = 2*(i%2);
+            const int y = 2*(i/2);
+            x264_mb_mc_direct8x8( h, x, y );
+        }
     }
     else    /* B_*x* */
     {
@@ -568,13 +830,14 @@ void x264_mb_mc( x264_t *h )
 void x264_macroblock_cache_init( x264_t *h )
 {
     int i, j;
-    int i_mb_count = h->sps->i_mb_width * h->sps->i_mb_height;
+    int i_mb_count = h->mb.i_mb_count;
 
     h->mb.i_mb_stride = h->sps->i_mb_width;
 
     h->mb.type= x264_malloc( i_mb_count * sizeof( int8_t) );
     h->mb.qp  = x264_malloc( i_mb_count * sizeof( int8_t) );
     h->mb.cbp = x264_malloc( i_mb_count * sizeof( int16_t) );
+    h->mb.skipbp = x264_malloc( i_mb_count * sizeof( int8_t) );
 
     /* 0 -> 3 top(4), 4 -> 6 : left(3) */
     h->mb.intra4x4_pred_mode = x264_malloc( i_mb_count * 7 * sizeof( int8_t ) );
@@ -598,6 +861,14 @@ void x264_macroblock_cache_init( x264_t *h )
         for( j=0; j<16; j++ ) /* FIXME: alloc no more than param.i_frame_reference */
             h->mb.mvr[i][j] = x264_malloc( 2 * i_mb_count * sizeof( int16_t ) );
 
+    h->mb.list1ref0.ref = NULL;
+    h->mb.list1ref0.mv = NULL;
+    if( h->param.i_bframe )
+    {
+        h->mb.list1ref0.ref = x264_malloc( 4 * i_mb_count * sizeof( int8_t ) );
+        h->mb.list1ref0.mv = x264_malloc( 2*16 * i_mb_count * sizeof( int16_t ) );
+    }
+
     /* init with not avaiable (for top right idx=7,15) */
     memset( h->mb.cache.ref[0], -2, X264_SCAN8_SIZE * sizeof( int8_t ) );
     memset( h->mb.cache.ref[1], -2, X264_SCAN8_SIZE * sizeof( int8_t ) );
@@ -614,12 +885,18 @@ void x264_macroblock_cache_end( x264_t *h )
         x264_free( h->mb.mvd[0] );
         x264_free( h->mb.mvd[1] );
     }
+    if( h->param.i_bframe )
+    {
+        x264_free( h->mb.list1ref0.ref );
+        x264_free( h->mb.list1ref0.mv );
+    }
     x264_free( h->mb.mv[0] );
     x264_free( h->mb.mv[1] );
     x264_free( h->mb.ref[0] );
     x264_free( h->mb.ref[1] );
     x264_free( h->mb.intra4x4_pred_mode );
     x264_free( h->mb.non_zero_count );
+    x264_free( h->mb.skipbp );
     x264_free( h->mb.cbp );
     x264_free( h->mb.qp );
     x264_free( h->mb.type );
@@ -923,6 +1200,29 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
                 }
             }
         }
+
+        /* load skip */
+        if( h->param.b_cabac )
+        {
+            if( h->sh.i_type == SLICE_TYPE_B )
+            {
+                memset( h->mb.cache.skip, 0, X264_SCAN8_SIZE * sizeof( int8_t ) );
+                if( i_left_xy >= 0 )
+                {
+                    h->mb.cache.skip[x264_scan8[0] - 1] = h->mb.skipbp[i_left_xy] & 0x2;
+                    h->mb.cache.skip[x264_scan8[8] - 1] = h->mb.skipbp[i_left_xy] & 0x8;
+                }
+                if( i_top_xy >= 0 )
+                {
+                    h->mb.cache.skip[x264_scan8[0] - 8] = h->mb.skipbp[i_top_xy] & 0x4;
+                    h->mb.cache.skip[x264_scan8[4] - 8] = h->mb.skipbp[i_top_xy] & 0x8;
+                }
+            }
+            else if( h->mb.i_mb_xy == 0 && h->sh.i_type == SLICE_TYPE_P )
+            {
+                memset( h->mb.cache.skip, 0, X264_SCAN8_SIZE * sizeof( int8_t ) );
+            }
+        }
     }
 }
 
@@ -1036,7 +1336,7 @@ void x264_macroblock_cache_save( x264_t *h )
         else
             h->mb.chroma_pred_mode[i_mb_xy] = I_PRED_CHROMA_DC;
 
-        if( !IS_INTRA( i_mb_type ) && !IS_SKIP( i_mb_type ) )
+        if( !IS_INTRA( i_mb_type ) && !IS_SKIP( i_mb_type ) && !IS_DIRECT( i_mb_type ) )
         {
             int i_list;
             for( i_list  = 0; i_list < 2; i_list++ )
@@ -1070,6 +1370,27 @@ void x264_macroblock_cache_save( x264_t *h )
                 }
             }
         }
+        if( h->sh.i_type == SLICE_TYPE_B )
+        {
+            if( i_mb_type == B_SKIP || i_mb_type == B_DIRECT )
+                h->mb.skipbp[i_mb_xy] = 0xf;
+            else if( i_mb_type == B_8x8 )
+            {
+                int skipbp = 0;
+                for( i = 0; i < 4; i++ )
+                    skipbp |= ( h->mb.i_sub_partition[i] == D_DIRECT_8x8 ) << i;
+                h->mb.skipbp[i_mb_xy] = skipbp;
+            }
+            else
+                h->mb.skipbp[i_mb_xy] = 0;
+        }
     }
 }
 
+void x264_macroblock_direct_ref_save( x264_t *h )
+{
+    /* Snapshot the frame's list0 refs and mvs into list1ref0. Manipulation of ref numbers is unnecessary unless we allow
+     * ref list reordering, multiple B-frame delay, or B-frames as refs. NOTE(review): list1ref0 is only allocated when param.i_bframe is set - confirm callers guard on it. */
+    memcpy( h->mb.list1ref0.ref, h->mb.ref[0], 4 * h->mb.i_mb_count * sizeof( int8_t ) ); /* 4 refs per mb */
+    memcpy( h->mb.list1ref0.mv, h->mb.mv[0], 2*16 * h->mb.i_mb_count * sizeof( int16_t ) ); /* 16 mvs of 2 components per mb */
+}
diff --git a/common/macroblock.h b/common/macroblock.h
index 1cc6f6aa..02689868 100644
--- a/common/macroblock.h
+++ b/common/macroblock.h
@@ -37,6 +37,7 @@ enum macroblock_position_e
 /* XXX mb_type isn't the one written in the bitstream -> only internal usage */
 #define IS_INTRA(type) ( (type) == I_4x4 || (type) == I_16x16 )
 #define IS_SKIP(type)  ( (type) == P_SKIP || (type) == B_SKIP )
+#define IS_DIRECT(type)  ( (type) == B_DIRECT ) /* B_SKIP is matched by IS_SKIP, not by this macro */
 enum mb_class_e
 {
     I_4x4           = 0,
@@ -118,6 +119,21 @@ enum mb_partition_e
     D_16x16         = 16,
 };
 
+static const int x264_mb_partition_listX_table[2][17] = /* [i_list][partition type]: nonzero when that partition type uses list i_list */
+{{ /* list 0 */
+    1, 1, 1, 1, /* D_L0_* */
+    0, 0, 0, 0, /* D_L1_* */
+    1, 1, 1, 1, /* D_BI_* */
+    0,          /* D_DIRECT_8x8 */
+    0, 0, 0, 0  /* 8x8 .. 16x16 */
+},
+{ /* list 1 */
+    0, 0, 0, 0, /* D_L0_* */
+    1, 1, 1, 1, /* D_L1_* */
+    1, 1, 1, 1, /* D_BI_* */
+    0,          /* D_DIRECT_8x8 */
+    0, 0, 0, 0  /* 8x8 .. 16x16 */
+}};
 static const int x264_mb_partition_count_table[17] =
 {
     /* sub L0 */
@@ -137,6 +153,8 @@ void x264_macroblock_cache_load( x264_t *h, int, int );
 void x264_macroblock_cache_save( x264_t *h );
 void x264_macroblock_cache_end( x264_t *h );
 
+void x264_macroblock_direct_ref_save( x264_t *h );
+
 void x264_mb_dequant_4x4_dc( int16_t dct[4][4], int i_qscale );
 void x264_mb_dequant_2x2_dc( int16_t dct[2][2], int i_qscale );
 void x264_mb_dequant_4x4( int16_t dct[4][4], int i_qscale );
@@ -150,14 +168,23 @@ void x264_mb_predict_mv_16x16( x264_t *h, int i_list, int i_ref, int mvp[2] );
  *      h->mb. need only valid values from other blocks */
 void x264_mb_predict_mv_pskip( x264_t *h, int mv[2] );
 /* x264_mb_predict_mv:
- *      set mvp with predicted mv for all blocks except P_SKIP
+ *      set mvp with predicted mv for all blocks except SKIP and DIRECT
  *      h->mb. need valid ref/partition/sub of current block to be valid
  *      and valid mv/ref from other blocks . */
 void x264_mb_predict_mv( x264_t *h, int i_list, int idx, int i_width, int mvp[2] );
+/* x264_mb_predict_mv_direct16x16:
+ *      set h->mb.cache.mv and h->mb.cache.ref for B_SKIP or B_DIRECT
+ *      h->mb. need only valid values from other blocks
+ *      return 1 on success, 0 on failure */
+int x264_mb_predict_mv_direct16x16( x264_t *h );
+/* x264_mb_load_mv_direct8x8:
+ *      set h->mb.cache.mv and h->mb.cache.ref for B_DIRECT
+ *      must be called only after x264_mb_predict_mv_direct16x16 */
+void x264_mb_load_mv_direct8x8( x264_t *h, int idx );
 /* x264_mb_predict_mv_ref16x16:
  *      set mvc with D_16x16 prediction.
  *      uses all neighbors, even those that didn't end up using this ref.
- *      need only valid values from other blocks */
+ *      h->mb. need only valid values from other blocks */
 void x264_mb_predict_mv_ref16x16( x264_t *h, int i_list, int i_ref, int mvc[4][2], int *i_mvc );
 
 
@@ -204,6 +231,17 @@ static inline void x264_macroblock_cache_mvd( x264_t *h, int x, int y, int width
         }
     }
 }
+/* Fill the width x height rectangle (4x4-block units) at (x,y) of h->mb.cache.skip with b_skip. */
+static inline void x264_macroblock_cache_skip( x264_t *h, int x, int y, int width, int height, int b_skip )
+{
+    int row, col;
+    for( row = y; row < y + height; row++ )
+    {
+        int8_t *p_row = &h->mb.cache.skip[X264_SCAN8_0 + 8*row]; /* cache rows are 8 entries apart */
+        for( col = x; col < x + width; col++ )
+            p_row[col] = b_skip;
+    }
+}
 
 #endif
 
diff --git a/encoder/analyse.c b/encoder/analyse.c
index 7893513c..04fd6ffd 100644
--- a/encoder/analyse.c
+++ b/encoder/analyse.c
@@ -91,6 +91,11 @@ typedef struct
     x264_mb_analysis_list_t l1;
 
     int i_cost16x16bi; /* used the same ref and mv as l0 and l1 (at least for now) */
+    int i_cost16x16direct;
+    int i_cost8x8bi;
+    int i_cost8x8direct[4];
+
+    int b_direct_available;
 
 } x264_mb_analysis_t;
 
@@ -163,12 +168,15 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
                 a->l1.i_cost4x4[i] = -1;
                 a->l1.i_cost8x4[i] = -1;
                 a->l1.i_cost4x8[i] = -1;
+                a->i_cost8x8direct[i] = -1;
             }
 
             a->l1.i_cost16x8   = -1;
             a->l1.i_cost8x16   = -1;
 
             a->i_cost16x16bi   = -1;
+            a->i_cost16x16direct = -1;
+            a->i_cost8x8bi     = -1;
         }
     }
 }
@@ -719,6 +727,27 @@ static void x264_mb_analyse_inter_p4x8( x264_t *h, x264_mb_analysis_t *a, int i8
     a->l0.i_cost4x8[i8x8] = a->l0.me4x8[i8x8][0].cost + a->l0.me4x8[i8x8][1].cost;
 }
 
+/* Cost of the direct prediction: 8x8 SATD of fenc against fdec, stored per 8x8 block and summed for the mb. */
+static void x264_mb_analyse_inter_direct( x264_t *h, x264_mb_analysis_t *a )
+{
+    /* Assumes that fdec still contains the results of
+     * x264_mb_predict_mv_direct16x16 and x264_mb_mc */
+    const int i_stride = h->mb.pic.i_stride[0];
+    uint8_t *p_src = h->mb.pic.p_fenc[0];
+    uint8_t *p_pred = h->mb.pic.p_fdec[0];
+    int i_total = 0;
+    int i8;
+
+    for( i8 = 0; i8 < 4; i8++ )
+    {
+        /* 8x8 blocks in raster order */
+        const int off = 8 * (i8 & 1) + 8 * i_stride * (i8 >> 1);
+        const int i_satd = h->pixf.satd[PIXEL_8x8]( &p_src[off], i_stride, &p_pred[off], i_stride );
+        a->i_cost8x8direct[i8] = i_satd;
+        i_total += i_satd;
+    }
+    a->i_cost16x16direct = i_total;
+}
 
 static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
 {
@@ -796,6 +825,121 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
                                        bs_size_se( a->l1.me16x16.mv[1] - a->l1.me16x16.mvp[1] ) );
 }
 
+/* B_8x8 analysis: for each 8x8 block pick the cheapest of L0, L1, BI and (if available) DIRECT, sum the costs into a->i_cost8x8bi and leave the choice in h->mb.i_sub_partition[] and the mb cache. */
+static void x264_mb_analyse_inter_b8x8( x264_t *h, x264_mb_analysis_t *a )
+{
+    uint8_t pix[2][8*8]; /* per-list 8x8 predictions, used for the BI cost */
+    uint8_t *p_fref[2] = { h->mb.pic.p_fref[0][a->l0.i_ref][0],
+                           h->mb.pic.p_fref[1][a->l1.i_ref][0] };
+    uint8_t *p_fenc = h->mb.pic.p_fenc[0];
+    int mvc[2][5][2], i_mvc[2]; /* per-list mv candidates handed to the search */
+    int i, j;
+
+    /* XXX Needed for x264_mb_predict_mv */
+    h->mb.i_partition = D_8x8;
+
+    a->i_cost8x8bi = 0;
+
+    i_mvc[0] = i_mvc[1] = 1; /* single candidate per list: the 16x16 mv found earlier */
+    mvc[0][0][0] = a->l0.me16x16.mv[0];
+    mvc[0][0][1] = a->l0.me16x16.mv[1];
+    mvc[1][0][0] = a->l1.me16x16.mv[0];
+    mvc[1][0][1] = a->l1.me16x16.mv[1];
+    /* one pass per 8x8 block, raster order; the cache is updated at the end of each pass */
+    for( i = 0; i < 4; i++ )
+    {
+        const int x8 = i%2;
+        const int y8 = i/2;
+        uint8_t *p_fenc_i = &p_fenc[8*(y8*h->mb.pic.i_stride[0]+x8)];
+        int i_part_cost;
+        int i_part_cost_bi = 0;
+
+        for( j = 0; j < 2; j++ ) /* j = list index */
+        {
+            x264_mb_analysis_list_t *l = j ? &a->l1 : &a->l0;
+            x264_me_t *m = &l->me8x8[i];
+
+            m->i_pixel = PIXEL_8x8;
+            m->lm      = a->i_lambda;
+
+            m->p_fenc = p_fenc_i;
+            m->p_fref = &p_fref[j][8*(y8*h->mb.pic.i_stride[0]+x8)];
+            m->i_stride = h->mb.pic.i_stride[0];
+            m->i_mv_range = a->i_mv_range;
+
+            x264_mb_predict_mv( h, j, 4*i, 2, m->mvp );
+            x264_me_search( h, m, mvc[j], i_mvc[j] );
+
+            x264_macroblock_cache_mv( h, 2*x8, 2*y8, 2, 2, j, m->mv[0], m->mv[1] );
+            l->i_cost8x8 += m->cost; /* NOTE(review): adds to whatever i_cost8x8 held on entry; its initial value is set outside this view */
+
+            /* BI mode */
+            h->mc[MC_LUMA]( m->p_fref, m->i_stride, pix[j], 8,
+                            m->mv[0], m->mv[1], 8, 8 );
+            /* FIXME: add ref cost */
+            i_part_cost_bi += a->i_lambda * ( bs_size_se( m->mv[0] - m->mvp[0] ) +
+                                              bs_size_se( m->mv[1] - m->mvp[1] ) );
+        }
+
+        h->pixf.avg[PIXEL_8x8]( pix[0], 8, pix[1], 8 ); /* presumably leaves the L0/L1 average in pix[0], which is scored next - confirm */
+        i_part_cost_bi += h->pixf.satd[PIXEL_8x8]( p_fenc_i, h->mb.pic.i_stride[0], pix[0], 8 );
+
+        i_part_cost = a->l0.me8x8[i].cost; /* start from L0, then keep whichever mode is strictly cheaper */
+        h->mb.i_sub_partition[i] = D_L0_8x8;
+        if( a->l1.me8x8[i].cost < i_part_cost )
+        {
+            i_part_cost = a->l1.me8x8[i].cost;
+            h->mb.i_sub_partition[i] = D_L1_8x8;
+        }
+        if( i_part_cost_bi < i_part_cost )
+        {
+            i_part_cost = i_part_cost_bi;
+            h->mb.i_sub_partition[i] = D_BI_8x8;
+        }
+        if( a->i_cost8x8direct[i] < i_part_cost && a->i_cost8x8direct[i] >= 0) /* a negative direct cost means it was not computed */
+        {
+            i_part_cost = a->i_cost8x8direct[i];
+            h->mb.i_sub_partition[i] = D_DIRECT_8x8;
+        }
+        a->i_cost8x8bi += i_part_cost;
+
+        /* XXX Needed for x264_mb_predict_mv */
+        if( h->mb.i_sub_partition[i] == D_DIRECT_8x8 )
+        {
+            x264_mb_load_mv_direct8x8( h, i ); /* restore the direct ref/mv for this block */
+            x264_macroblock_cache_mvd(  h, 2*x8, 2*y8, 2, 2, 0, 0, 0 );
+            x264_macroblock_cache_mvd(  h, 2*x8, 2*y8, 2, 2, 1, 0, 0 );
+            x264_macroblock_cache_skip( h, 2*x8, 2*y8, 2, 2, 1 );
+        }
+        else
+        {
+            if( h->mb.i_sub_partition[i] == D_L1_8x8 ) /* list 0 unused: ref -1, zero mv and mvd */
+            {
+                x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, -1 );
+                x264_macroblock_cache_mv(  h, 2*x8, 2*y8, 2, 2, 0, 0, 0 );
+                x264_macroblock_cache_mvd( h, 2*x8, 2*y8, 2, 2, 0, 0, 0 );
+            }
+            else
+            {
+                x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, a->l0.i_ref );
+                x264_macroblock_cache_mv(  h, 2*x8, 2*y8, 2, 2, 0, a->l0.me8x8[i].mv[0], a->l0.me8x8[i].mv[1] );
+            }
+
+            if( h->mb.i_sub_partition[i] == D_L0_8x8 ) /* list 1 unused: ref -1, zero mv and mvd */
+            {
+                x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 1, -1 );
+                x264_macroblock_cache_mv(  h, 2*x8, 2*y8, 2, 2, 1, 0, 0 );
+                x264_macroblock_cache_mvd( h, 2*x8, 2*y8, 2, 2, 1, 0, 0 );
+            }
+            else
+            {
+                x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 1, a->l1.i_ref );
+                x264_macroblock_cache_mv(  h, 2*x8, 2*y8, 2, 2, 1, a->l1.me8x8[i].mv[0], a->l1.me8x8[i].mv[1] );
+            }
+        }
+    }
+}
+
 /*****************************************************************************
  * x264_macroblock_analyse:
  *****************************************************************************/
@@ -1011,39 +1155,90 @@ void x264_macroblock_analyse( x264_t *h )
     }
     else if( h->sh.i_type == SLICE_TYPE_B )
     {
+        const unsigned int i_neighbour = h->mb.i_neighbour;
+        const unsigned int flags = h->param.analyse.inter;
+        int b_skip = 0;
         int i_cost;
 
-        /* best inter mode */
-        x264_mb_analyse_inter_b16x16( h, &analysis );
-        h->mb.i_type = B_L0_L0;
-        h->mb.i_partition = D_16x16;
-        i_cost = analysis.l0.me16x16.cost;
-
-        if( analysis.l1.me16x16.cost < i_cost )
+        analysis.b_direct_available = x264_mb_predict_mv_direct16x16( h );
+        if( analysis.b_direct_available )
         {
-            h->mb.i_type = B_L1_L1;
-            i_cost = analysis.l1.me16x16.cost;
-        }
-        if( analysis.i_cost16x16bi < i_cost )
-        {
-            h->mb.i_type = B_BI_BI;
-            i_cost = analysis.i_cost16x16bi;
-        }
+            h->mb.i_type = B_SKIP;
+            x264_mb_mc( h );
 
-        /* best intra mode */
-        x264_mb_analyse_intra( h, &analysis );
-        if( analysis.i_sad_i16x16 >= 0 && analysis.i_sad_i16x16 < i_cost )
-        {
-            h->mb.i_type = I_16x16;
-            i_cost = analysis.i_sad_i16x16;
+            /* Conditioning the probe on neighboring block types
+             * doesn't seem to help speed or quality. */
+            b_skip = x264_macroblock_probe_bskip( h );
         }
-        if( analysis.i_sad_i4x4 >=0 && analysis.i_sad_i4x4 < i_cost )
+
+        if( !b_skip )
         {
-            h->mb.i_type = I_4x4;
-            i_cost = analysis.i_sad_i4x4;
+            /* best inter mode */
+            /* direct must be first */
+            if( analysis.b_direct_available )
+                x264_mb_analyse_inter_direct( h, &analysis );
+
+            x264_mb_analyse_inter_b16x16( h, &analysis );
+
+            /* 8x8 must be last */
+            if( flags & X264_ANALYSE_BSUB16x16 )
+                x264_mb_analyse_inter_b8x8( h, &analysis );
+
+            h->mb.i_type = B_L0_L0;
+            h->mb.i_partition = D_16x16;
+            i_cost = analysis.l0.me16x16.cost;
+            if( analysis.l1.me16x16.cost < i_cost )
+            {
+                h->mb.i_type = B_L1_L1;
+                i_cost = analysis.l1.me16x16.cost;
+            }
+            if( analysis.i_cost16x16bi < i_cost )
+            {
+                h->mb.i_type = B_BI_BI;
+                i_cost = analysis.i_cost16x16bi;
+            }
+            if( analysis.i_cost16x16direct < i_cost && analysis.i_cost16x16direct >= 0 )
+            {
+                h->mb.i_type = B_DIRECT;
+                i_cost = analysis.i_cost16x16direct;
+            }
+            if( analysis.i_cost8x8bi < i_cost && analysis.i_cost8x8bi >= 0 )
+            {
+                h->mb.i_type = B_8x8;
+                h->mb.i_partition = D_8x8;
+                i_cost = analysis.i_cost8x8bi;
+            }
+
+            /* refine qpel */
+            if( h->mb.i_partition == D_16x16 )
+            {
+                if( h->mb.i_type == B_L0_L0 )
+                {
+                    x264_me_refine_qpel( h, &analysis.l0.me16x16 );
+                    i_cost = analysis.l0.me16x16.cost;
+                }
+                else if( h->mb.i_type == B_L1_L1 )
+                {
+                    x264_me_refine_qpel( h, &analysis.l1.me16x16 );
+                    i_cost = analysis.l1.me16x16.cost;
+                }
+            }
+            /* TODO: refine bidir, 8x8 */
+
+            /* best intra mode */
+            x264_mb_analyse_intra( h, &analysis );
+            if( analysis.i_sad_i16x16 >= 0 && analysis.i_sad_i16x16 < i_cost )
+            {
+                h->mb.i_type = I_16x16;
+                i_cost = analysis.i_sad_i16x16;
+            }
+            if( analysis.i_sad_i4x4 >=0 && analysis.i_sad_i4x4 < i_cost )
+            {
+                h->mb.i_type = I_4x4;
+                i_cost = analysis.i_sad_i4x4;
+            }
         }
     }
-#undef BEST_TYPE
 
     /*-------------------- Update MB from the analysis ----------------------*/
     h->mb.type[h->mb.i_mb_xy] = h->mb.i_type;
@@ -1134,6 +1329,15 @@ void x264_macroblock_analyse( x264_t *h )
             break;
         }
 
+        case B_SKIP:
+        case B_DIRECT:
+            /* probably unnecessary for B_SKIP */
+            x264_mb_load_mv_direct8x8( h, 0 );
+            x264_mb_load_mv_direct8x8( h, 1 );
+            x264_mb_load_mv_direct8x8( h, 2 );
+            x264_mb_load_mv_direct8x8( h, 3 );
+            break;
+
         case B_L0_L0:
             switch( h->mb.i_partition )
             {
@@ -1183,6 +1387,9 @@ void x264_macroblock_analyse( x264_t *h )
                     break;
             }
             break;
+        case B_8x8:
+            /* nothing to do: caches were updated during analysis */
+            break;
 
         default:
             fprintf( stderr, "internal error (invalid MB type)\n" );
diff --git a/encoder/cabac.c b/encoder/cabac.c
index 0dc3228c..f30af96c 100644
--- a/encoder/cabac.c
+++ b/encoder/cabac.c
@@ -612,9 +612,9 @@ static inline void x264_cabac_mb_ref( x264_t *h, int i_list, int idx )
     int i_ref  = h->mb.cache.ref[i_list][i8];
     int ctx  = 0;
 
-    if( i_refa > 0 )
+    if( i_refa > 0 && !h->mb.cache.skip[i8 - 1])
         ctx++;
-    if( i_refb > 0 )
+    if( i_refb > 0 && !h->mb.cache.skip[i8 - 8])
         ctx += 2;
 
     while( i_ref > 0 )
@@ -706,6 +706,47 @@ static inline void  x264_cabac_mb_mvd( x264_t *h, int i_list, int idx, int width
     x264_macroblock_cache_mvd( h, block_idx_x[idx], block_idx_y[idx], width, height, i_list, mdx, mdy );
 }
 
+/* Code the list i_list mvds of every sub-partition of the four 8x8 blocks. */
+static inline void x264_cabac_mb8x8_mvd( x264_t *h, int i_list )
+{
+    int i8;
+    for( i8 = 0; i8 < 4; i8++ )
+    {
+        const int i_sub = h->mb.i_sub_partition[i8];
+        /* table entry 0 for this list: no mvd is written for the block */
+        if( !x264_mb_partition_listX_table[i_list][i_sub] )
+            continue;
+        switch( i_sub )
+        {
+            case D_L0_8x8:
+            case D_L1_8x8:
+            case D_BI_8x8:
+                x264_cabac_mb_mvd( h, i_list, 4*i8, 2, 2 );
+                break;
+            case D_L0_8x4:
+            case D_L1_8x4:
+            case D_BI_8x4:
+                x264_cabac_mb_mvd( h, i_list, 4*i8+0, 2, 1 );
+                x264_cabac_mb_mvd( h, i_list, 4*i8+2, 2, 1 );
+                break;
+            case D_L0_4x8:
+            case D_L1_4x8:
+            case D_BI_4x8:
+                x264_cabac_mb_mvd( h, i_list, 4*i8+0, 1, 2 );
+                x264_cabac_mb_mvd( h, i_list, 4*i8+1, 1, 2 );
+                break;
+            case D_L0_4x4:
+            case D_L1_4x4:
+            case D_BI_4x4:
+                x264_cabac_mb_mvd( h, i_list, 4*i8+0, 1, 1 );
+                x264_cabac_mb_mvd( h, i_list, 4*i8+1, 1, 1 );
+                x264_cabac_mb_mvd( h, i_list, 4*i8+2, 1, 1 );
+                x264_cabac_mb_mvd( h, i_list, 4*i8+3, 1, 1 );
+                break;
+        }
+    }
+}
+
 static int x264_cabac_mb_cbf_ctxidxinc( x264_t *h, int i_cat, int i_idx )
 {
     /* TODO: clean up/optimize */
@@ -964,6 +1005,7 @@ void x264_macroblock_write_cabac( x264_t *h, bs_t *s )
     const int i_mb_pos_start = bs_pos( s );
     int       i_mb_pos_tex;
 
+    int i_list;
     int i;
 
     /* Write the MB type */
@@ -1060,40 +1102,36 @@ void x264_macroblock_write_cabac( x264_t *h, bs_t *s )
             x264_cabac_mb_ref( h, 0, 12 );
         }
 
-        for( i = 0; i < 4; i++ )
+        x264_cabac_mb8x8_mvd( h, 0 );
+    }
+    else if( i_mb_type == B_8x8 )
+    {
+        /* sub mb type */
+        x264_cabac_mb_sub_b_partition( h, h->mb.i_sub_partition[0] );
+        x264_cabac_mb_sub_b_partition( h, h->mb.i_sub_partition[1] );
+        x264_cabac_mb_sub_b_partition( h, h->mb.i_sub_partition[2] );
+        x264_cabac_mb_sub_b_partition( h, h->mb.i_sub_partition[3] );
+
+        /* ref */
+        for( i_list = 0; i_list < 2; i_list++ )
         {
-            switch( h->mb.i_sub_partition[i] )
+            if( ( i_list ? h->sh.i_num_ref_idx_l1_active : h->sh.i_num_ref_idx_l0_active ) == 1 )
+                continue;
+            for( i = 0; i < 4; i++ )
             {
-                case D_L0_8x8:
-                    x264_cabac_mb_mvd( h, 0, 4*i, 2, 2 );
-                    break;
-                case D_L0_8x4:
-                    x264_cabac_mb_mvd( h, 0, 4*i+0, 2, 1 );
-                    x264_cabac_mb_mvd( h, 0, 4*i+2, 2, 1 );
-                    break;
-                case D_L0_4x8:
-                    x264_cabac_mb_mvd( h, 0, 4*i+0, 1, 2 );
-                    x264_cabac_mb_mvd( h, 0, 4*i+1, 1, 2 );
-                    break;
-                case D_L0_4x4:
-                    x264_cabac_mb_mvd( h, 0, 4*i+0, 1, 1 );
-                    x264_cabac_mb_mvd( h, 0, 4*i+1, 1, 1 );
-                    x264_cabac_mb_mvd( h, 0, 4*i+2, 1, 1 );
-                    x264_cabac_mb_mvd( h, 0, 4*i+3, 1, 1 );
-                    break;
+                /* ref_idx is written only where the sub-partition's entry for
+                 * this list is set, so index the table by i_list, not by 0 */
+                if( x264_mb_partition_listX_table[i_list][ h->mb.i_sub_partition[i] ] )
+                    x264_cabac_mb_ref( h, i_list, 4*i );
             }
         }
-    }
-    else if( i_mb_type == B_8x8 )
-    {
-        /* TODO */
-        fprintf( stderr, "Arggg B_8x8\n" );
-        return;
+
+        x264_cabac_mb8x8_mvd( h, 0 );
+        x264_cabac_mb8x8_mvd( h, 1 );
     }
     else if( i_mb_type != B_DIRECT )
     {
         /* All B mode */
-        int i_list;
         int b_list[2][2];
 
         /* init ref list utilisations */
diff --git a/encoder/cavlc.c b/encoder/cavlc.c
index 5f19be00..3df4422a 100644
--- a/encoder/cavlc.c
+++ b/encoder/cavlc.c
@@ -41,6 +41,10 @@ static const uint8_t inter_cbp_to_golomb[48]=
   1, 32, 33, 36, 34, 37, 44, 40, 35, 45, 38, 41, 39, 42, 43, 19,
   6, 24, 25, 20, 26, 21, 46, 28, 27, 47, 22, 29, 23, 30, 31, 12
 };
+static const uint8_t sub_mb_type_p_to_golomb[4]=   /* indexed by h->mb.i_sub_partition[i]; result is passed to bs_write_ue */
+{ 3, 1, 2, 0 };
+static const uint8_t sub_mb_type_b_to_golomb[13]=  /* same lookup, used when writing the sub mb types of a B_8x8 MB */
+{ 10,  4,  5,  1, 11,  6,  7,  2, 12,  8,  9,  3,  0 };   /* NOTE(review): order follows the D_* enum values, defined elsewhere */
 
 static const uint8_t block_idx_x[16] =
 {
@@ -257,6 +261,72 @@ static void block_residual_write_cavlc( x264_t *h, bs_t *s, int i_idx, int *l, i
     }
 }
 
+/* Write the list i_list mv deltas (mv - predictor) of all four 8x8 blocks. */
+static void x264_sub_mb_mv_write_cavlc( x264_t *h, bs_t *s, int i_list )
+{
+    int i8;
+    for( i8 = 0; i8 < 4; i8++ )
+    {
+        const int i_sub = h->mb.i_sub_partition[i8];
+        int mvp[2];
+
+        /* table entry 0 for this list: nothing is written for the block */
+        if( !x264_mb_partition_listX_table[i_list][i_sub] )
+            continue;
+        switch( i_sub )
+        {
+            case D_L0_8x8:
+            case D_L1_8x8:
+            case D_BI_8x8:
+                x264_mb_predict_mv( h, i_list, 4*i8, 2, mvp );
+                bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[4*i8]][0] - mvp[0] );
+                bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[4*i8]][1] - mvp[1] );
+                break;
+            case D_L0_8x4:
+            case D_L1_8x4:
+            case D_BI_8x4:
+                x264_mb_predict_mv( h, i_list, 4*i8+0, 2, mvp );
+                bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[4*i8]][0] - mvp[0] );
+                bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[4*i8]][1] - mvp[1] );
+
+                x264_mb_predict_mv( h, i_list, 4*i8+2, 2, mvp );
+                bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[4*i8+2]][0] - mvp[0] );
+                bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[4*i8+2]][1] - mvp[1] );
+                break;
+            case D_L0_4x8:
+            case D_L1_4x8:
+            case D_BI_4x8:
+                x264_mb_predict_mv( h, i_list, 4*i8+0, 1, mvp );
+                bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[4*i8]][0] - mvp[0] );
+                bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[4*i8]][1] - mvp[1] );
+
+                x264_mb_predict_mv( h, i_list, 4*i8+1, 1, mvp );
+                bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[4*i8+1]][0] - mvp[0] );
+                bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[4*i8+1]][1] - mvp[1] );
+                break;
+            case D_L0_4x4:
+            case D_L1_4x4:
+            case D_BI_4x4:
+                x264_mb_predict_mv( h, i_list, 4*i8+0, 1, mvp );
+                bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[4*i8]][0] - mvp[0] );
+                bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[4*i8]][1] - mvp[1] );
+
+                x264_mb_predict_mv( h, i_list, 4*i8+1, 1, mvp );
+                bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[4*i8+1]][0] - mvp[0] );
+                bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[4*i8+1]][1] - mvp[1] );
+
+                x264_mb_predict_mv( h, i_list, 4*i8+2, 1, mvp );
+                bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[4*i8+2]][0] - mvp[0] );
+                bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[4*i8+2]][1] - mvp[1] );
+
+                x264_mb_predict_mv( h, i_list, 4*i8+3, 1, mvp );
+                bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[4*i8+3]][0] - mvp[0] );
+                bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[4*i8+3]][1] - mvp[1] );
+                break;
+        }
+    }
+}
+
 /*****************************************************************************
  * x264_macroblock_write:
  *****************************************************************************/
@@ -421,21 +491,7 @@ void x264_macroblock_write_cavlc( x264_t *h, bs_t *s )
         /* sub mb type */
         for( i = 0; i < 4; i++ )
         {
-            switch( h->mb.i_sub_partition[i] )
-            {
-                case D_L0_8x8:
-                    bs_write_ue( s, 0 );
-                    break;
-                case D_L0_8x4:
-                    bs_write_ue( s, 1 );
-                    break;
-                case D_L0_4x8:
-                    bs_write_ue( s, 2 );
-                    break;
-                case D_L0_4x4:
-                    bs_write_ue( s, 3 );
-                    break;
-            }
+            bs_write_ue( s, sub_mb_type_p_to_golomb[ h->mb.i_sub_partition[i] ] );
         }
         /* ref0 */
         if( h->sh.i_num_ref_idx_l0_active > 1 && b_sub_ref0 )
@@ -445,59 +501,36 @@ void x264_macroblock_write_cavlc( x264_t *h, bs_t *s )
             bs_write_te( s, h->sh.i_num_ref_idx_l0_active - 1, h->mb.cache.ref[0][x264_scan8[8]] );
             bs_write_te( s, h->sh.i_num_ref_idx_l0_active - 1, h->mb.cache.ref[0][x264_scan8[12]] );
         }
+
+        x264_sub_mb_mv_write_cavlc( h, s, 0 );
+    }
+    else if( i_mb_type == B_8x8 )
+    {
+        bs_write_ue( s, 22 );
+
+        /* sub mb type */
+        for( i = 0; i < 4; i++ )
+        {
+            bs_write_ue( s, sub_mb_type_b_to_golomb[ h->mb.i_sub_partition[i] ] );
+        }
+        /* ref */
         for( i = 0; i < 4; i++ )
         {
-            int mvp[2];
-
-            switch( h->mb.i_sub_partition[i] )
+            if( x264_mb_partition_listX_table[0][ h->mb.i_sub_partition[i] ] )
             {
-                case D_L0_8x8:
-                    x264_mb_predict_mv( h, 0, 4*i, 2, mvp );
-                    bs_write_se( s, h->mb.cache.mv[0][x264_scan8[4*i]][0] - mvp[0] );
-                    bs_write_se( s, h->mb.cache.mv[0][x264_scan8[4*i]][1] - mvp[1] );
-                    break;
-                case D_L0_8x4:
-                    x264_mb_predict_mv( h, 0, 4*i+0, 2, mvp );
-                    bs_write_se( s, h->mb.cache.mv[0][x264_scan8[4*i]][0] - mvp[0] );
-                    bs_write_se( s, h->mb.cache.mv[0][x264_scan8[4*i]][1] - mvp[1] );
-
-                    x264_mb_predict_mv( h, 0, 4*i+2, 2, mvp );
-                    bs_write_se( s, h->mb.cache.mv[0][x264_scan8[4*i+2]][0] - mvp[0] );
-                    bs_write_se( s, h->mb.cache.mv[0][x264_scan8[4*i+2]][1] - mvp[1] );
-                    break;
-                case D_L0_4x8:
-                    x264_mb_predict_mv( h, 0, 4*i+0, 1, mvp );
-                    bs_write_se( s, h->mb.cache.mv[0][x264_scan8[4*i]][0] - mvp[0] );
-                    bs_write_se( s, h->mb.cache.mv[0][x264_scan8[4*i]][1] - mvp[1] );
-
-                    x264_mb_predict_mv( h, 0, 4*i+1, 1, mvp );
-                    bs_write_se( s, h->mb.cache.mv[0][x264_scan8[4*i+1]][0] - mvp[0] );
-                    bs_write_se( s, h->mb.cache.mv[0][x264_scan8[4*i+1]][1] - mvp[1] );
-                    break;
-                case D_L0_4x4:
-                    x264_mb_predict_mv( h, 0, 4*i+0, 1, mvp );
-                    bs_write_se( s, h->mb.cache.mv[0][x264_scan8[4*i]][0] - mvp[0] );
-                    bs_write_se( s, h->mb.cache.mv[0][x264_scan8[4*i]][1] - mvp[1] );
-
-                    x264_mb_predict_mv( h, 0, 4*i+1, 1, mvp );
-                    bs_write_se( s, h->mb.cache.mv[0][x264_scan8[4*i+1]][0] - mvp[0] );
-                    bs_write_se( s, h->mb.cache.mv[0][x264_scan8[4*i+1]][1] - mvp[1] );
-
-                    x264_mb_predict_mv( h, 0, 4*i+2, 1, mvp );
-                    bs_write_se( s, h->mb.cache.mv[0][x264_scan8[4*i+2]][0] - mvp[0] );
-                    bs_write_se( s, h->mb.cache.mv[0][x264_scan8[4*i+2]][1] - mvp[1] );
-
-                    x264_mb_predict_mv( h, 0, 4*i+3, 1, mvp );
-                    bs_write_se( s, h->mb.cache.mv[0][x264_scan8[4*i+3]][0] - mvp[0] );
-                    bs_write_se( s, h->mb.cache.mv[0][x264_scan8[4*i+3]][1] - mvp[1] );
-                    break;
+                bs_write_te( s, h->sh.i_num_ref_idx_l0_active - 1, h->mb.cache.ref[0][x264_scan8[i*4]] );
             }
         }
-    }
-    else if( i_mb_type == B_8x8 )
-    {
-        fprintf( stderr, "invalid/unhandled mb_type (B_8x8)\n" );
-        return;
+        for( i = 0; i < 4; i++ )
+        {
+            if( x264_mb_partition_listX_table[1][ h->mb.i_sub_partition[i] ] )
+            {
+                bs_write_te( s, h->sh.i_num_ref_idx_l1_active - 1, h->mb.cache.ref[1][x264_scan8[i*4]] );
+            }
+        }
+        /* mvd */
+        x264_sub_mb_mv_write_cavlc( h, s, 0 );
+        x264_sub_mb_mv_write_cavlc( h, s, 1 );
     }
     else if( i_mb_type != B_DIRECT )
     {
diff --git a/encoder/encoder.c b/encoder/encoder.c
index 891ef73a..773c9c80 100644
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -140,7 +140,7 @@ static void x264_slice_header_init( x264_slice_header_t *sh, x264_param_t *param
 
     sh->i_redundant_pic_cnt = 0;
 
-    sh->b_direct_spatial_mv_pred = 1;
+    sh->b_direct_spatial_mv_pred = ( param->analyse.i_direct_mv_pred == X264_DIRECT_PRED_SPATIAL );
 
     sh->b_num_ref_idx_override = 0;
     sh->i_num_ref_idx_l0_active = 1;
@@ -407,6 +407,8 @@ x264_t *x264_encoder_open   ( x264_param_t *param )
 
     h->pps = &h->pps_array[0];
     x264_pps_init( h->pps, 0, &h->param, h->sps);
+    
+    h->mb.i_mb_count = h->sps->i_mb_width * h->sps->i_mb_height;
 
     /* Init frames. */
     for( i = 0; i < X264_BFRAME_MAX + 1; i++ )
@@ -630,6 +632,12 @@ static inline void x264_reference_update( x264_t *h )
 {
     int i;
 
+    /* save mvs for B-frame prediction */
+    if( h->param.i_bframe )
+    {
+        x264_macroblock_direct_ref_save( h );
+    }
+
     /* apply deblocking filter to the current decoded picture */
     if( h->param.b_deblocking_filter )
     {
@@ -1166,7 +1174,7 @@ do_encode:
             h->i_frame_num--;
 
             /* Do IDR if needed and if we can (won't work with B frames) */
-            if( h->frames.next[0] == NULL &&
+            if( h->frames.current[0] == NULL &&
                 h->frames.i_last_idr + 1 >= h->param.i_idrframe )
             {
                 /* Reset */
diff --git a/encoder/macroblock.c b/encoder/macroblock.c
index 97334068..514b83a8 100644
--- a/encoder/macroblock.c
+++ b/encoder/macroblock.c
@@ -491,6 +491,21 @@ static void x264_mb_encode_8x8( x264_t *h, int b_inter, int i_qscale )
     }
 }
 
+/* Mark the current MB as having no coded coefficients (shared by skip types). */
+static void x264_macroblock_encode_skip( x264_t *h )
+{
+    int i_blk;
+
+    /* clear the luma/chroma cbp, both the working copy and the stored one */
+    h->mb.i_cbp_luma   = 0;
+    h->mb.i_cbp_chroma = 0;
+    h->mb.cbp[h->mb.i_mb_xy] = 0;
+
+    /* zero the 16+8 non-zero counts addressed through x264_scan8 */
+    for( i_blk = 0; i_blk < 16+8; i_blk++ )
+        h->mb.cache.non_zero_count[x264_scan8[i_blk]] = 0;
+}
+
 /*****************************************************************************
  * x264_macroblock_encode_pskip:
  *  Encode an already marked skip block
@@ -499,7 +514,6 @@ void x264_macroblock_encode_pskip( x264_t *h )
 {
     const int mvx = h->mb.cache.mv[0][x264_scan8[0]][0];
     const int mvy = h->mb.cache.mv[0][x264_scan8[0]][1];
-    int i;
 
     /* Motion compensation XXX probably unneeded */
     h->mc[MC_LUMA]( h->mb.pic.p_fref[0][0][0], h->mb.pic.i_stride[0],
@@ -515,16 +529,7 @@ void x264_macroblock_encode_pskip( x264_t *h )
                       h->mb.pic.p_fdec[2],       h->mb.pic.i_stride[2],
                       mvx, mvy, 8, 8 );
 
-    h->mb.i_cbp_luma = 0x00;
-    h->mb.i_cbp_chroma = 0x00;
-
-    for( i = 0; i < 16+8; i++ )
-    {
-        h->mb.cache.non_zero_count[x264_scan8[i]] = 0;
-    }
-
-    /* store cbp */
-    h->mb.cbp[h->mb.i_mb_xy] = 0;
+    x264_macroblock_encode_skip( h );
 }
 
 /*****************************************************************************
@@ -542,6 +547,13 @@ void x264_macroblock_encode( x264_t *h )
         x264_macroblock_encode_pskip( h );
         return;
     }
+    if( h->mb.i_type == B_SKIP )
+    {
+        /* XXX motion compensation is probably unneeded */
+        x264_mb_mc( h );
+        x264_macroblock_encode_skip( h );
+        return;
+    }
 
     /* quantification scale */
     i_qscale = h->mb.qp[h->mb.i_mb_xy];
@@ -750,14 +762,22 @@ void x264_macroblock_encode( x264_t *h )
             }
         }
     }
+
+    /* Check for B_SKIP */
+    if( h->mb.i_type == B_DIRECT &&
+        h->mb.i_cbp_luma == 0x00 && h->mb.i_cbp_chroma== 0x00 )
+    {
+        h->mb.type[h->mb.i_mb_xy] = h->mb.i_type = B_SKIP;
+        h->mb.qp[h->mb.i_mb_xy] = h->mb.i_last_qp;  /* Needed */
+    }
 }
 
 /*****************************************************************************
- * x264_macroblock_probe_pskip:
- *  Check if the current MB could be encoded as a P_SKIP (it supposes you use
+ * x264_macroblock_probe_skip:
+ *  Check if the current MB could be encoded as a [PB]_SKIP (it supposes you use
  *  the previous QP
  *****************************************************************************/
-int x264_macroblock_probe_pskip( x264_t *h )
+int x264_macroblock_probe_skip( x264_t *h, int b_bidir )
 {
     DECLARE_ALIGNED( int16_t, dct4x4[16][4][4], 16 );
     DECLARE_ALIGNED( int16_t, dct2x2[2][2], 16 );
@@ -771,30 +791,33 @@ int x264_macroblock_probe_pskip( x264_t *h )
     int i8x8, i4x4;
     int i_decimate_mb;
 
-    /* quantification scale */
+    /* quantization scale */
     i_qp = h->mb.qp[h->mb.i_mb_xy];
 
-    /* Get the MV */
-    x264_mb_predict_mv_pskip( h, mvp );
+    if( !b_bidir )
+    {
+        /* Get the MV */
+        x264_mb_predict_mv_pskip( h, mvp );
 
-    /* Special case, need to clip the vector */
-    n = 16 * h->mb.i_mb_x + mvp[0];
-    if( n < -24 )
-        mvp[0] = -24 - 16*h->mb.i_mb_x;
-    else if( n > 16 * h->sps->i_mb_width + 24 )
-        mvp[0] = 16 * ( h->sps->i_mb_width - h->mb.i_mb_x ) + 24;
+        /* Special case, need to clip the vector */
+        n = 16 * h->mb.i_mb_x + mvp[0];
+        if( n < -24 )
+            mvp[0] = -24 - 16*h->mb.i_mb_x;
+        else if( n > 16 * h->sps->i_mb_width + 24 )
+            mvp[0] = 16 * ( h->sps->i_mb_width - h->mb.i_mb_x ) + 24;
 
-    n = 16 * h->mb.i_mb_y + mvp[1];
-    if( n < -24 )
-        mvp[1] = -24 - 16*h->mb.i_mb_y;
-    else if( n > 16 * h->sps->i_mb_height + 8 )
-        mvp[1] = 16 * ( h->sps->i_mb_height - h->mb.i_mb_y ) + 8;
+        n = 16 * h->mb.i_mb_y + mvp[1];
+        if( n < -24 )
+            mvp[1] = -24 - 16*h->mb.i_mb_y;
+        else if( n > 16 * h->sps->i_mb_height + 8 )
+            mvp[1] = 16 * ( h->sps->i_mb_height - h->mb.i_mb_y ) + 8;
 
 
-    /* Motion compensation */
-    h->mc[MC_LUMA]( h->mb.pic.p_fref[0][0][0], h->mb.pic.i_stride[0],
-                    h->mb.pic.p_fdec[0],       h->mb.pic.i_stride[0],
-                    mvp[0], mvp[1], 16, 16 );
+        /* Motion compensation */
+        h->mc[MC_LUMA]( h->mb.pic.p_fref[0][0][0], h->mb.pic.i_stride[0],
+                        h->mb.pic.p_fdec[0],       h->mb.pic.i_stride[0],
+                        mvp[0], mvp[1], 16, 16 );
+    }
 
     /* get luma diff */
     h->dctf.sub16x16_dct( dct4x4, h->mb.pic.p_fenc[0], h->mb.pic.i_stride[0],
@@ -829,9 +852,12 @@ int x264_macroblock_probe_pskip( x264_t *h )
         uint8_t  *p_src = h->mb.pic.p_fenc[1+ch];
         uint8_t  *p_dst = h->mb.pic.p_fdec[1+ch];
 
-        h->mc[MC_CHROMA]( h->mb.pic.p_fref[0][0][1+ch], i_stride,
-                          h->mb.pic.p_fdec[1+ch],       i_stride,
-                          mvp[0], mvp[1], 8, 8 );
+        if( !b_bidir )
+        {
+            h->mc[MC_CHROMA]( h->mb.pic.p_fref[0][0][1+ch], i_stride,
+                              h->mb.pic.p_fdec[1+ch],       i_stride,
+                              mvp[0], mvp[1], 8, 8 );
+        }
 
         h->dctf.sub8x8_dct( dct4x4, p_src, i_stride, p_dst, i_stride );
 
diff --git a/encoder/macroblock.h b/encoder/macroblock.h
index 0b297cc9..4310c2e5 100644
--- a/encoder/macroblock.h
+++ b/encoder/macroblock.h
@@ -26,7 +26,12 @@
 
 #include "../common/macroblock.h"
 
-int x264_macroblock_probe_pskip( x264_t *h );
+int x264_macroblock_probe_skip( x264_t *h, int b_bidir );
+
+static inline int x264_macroblock_probe_pskip( x264_t *h )   /* skip probe with b_bidir = 0 */
+    { return x264_macroblock_probe_skip( h, 0 ); }   /* probe predicts the pskip mv and runs MC itself */
+static inline int x264_macroblock_probe_bskip( x264_t *h )   /* skip probe with b_bidir = 1 */
+    { return x264_macroblock_probe_skip( h, 1 ); }   /* probe does no mv prediction/MC of its own */
 
 void x264_macroblock_encode      ( x264_t *h );
 void x264_macroblock_write_cabac ( x264_t *h, bs_t *s );
diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c
index bb944ddb..c33525aa 100644
--- a/encoder/ratecontrol.c
+++ b/encoder/ratecontrol.c
@@ -171,7 +171,7 @@ int x264_ratecontrol_new( x264_t *h )
 
     rc->gop_size = h->param.i_iframe;
     rc->bitrate = h->param.rc.i_bitrate * 1000;
-    rc->nmb = ((h->param.i_width + 15) / 16) * ((h->param.i_height + 15) / 16);
+    rc->nmb = h->mb.i_mb_count;
 
     rc->qp = h->param.rc.i_qp_constant;
     rc->qpa = rc->qp;
diff --git a/encoder/set.c b/encoder/set.c
index 2f43f4ce..0ad5a21b 100644
--- a/encoder/set.c
+++ b/encoder/set.c
@@ -84,7 +84,8 @@ void x264_sps_init( x264_sps_t *sps, int i_id, x264_param_t *param )
     sps->b_frame_mbs_only = 1;
     sps->b_mb_adaptive_frame_field = 0;
     sps->b_direct8x8_inference = 0;
-    if( sps->b_frame_mbs_only == 0 )
+    if( sps->b_frame_mbs_only == 0 ||
+        !(param->analyse.inter & X264_ANALYSE_PSUB8x8) )
     {
         sps->b_direct8x8_inference = 1;
     }
diff --git a/x264.h b/x264.h
index b276d66f..8c662921 100644
--- a/x264.h
+++ b/x264.h
@@ -26,7 +26,7 @@
 
 #include <stdarg.h>
 
-#define X264_BUILD 0x000c
+#define X264_BUILD 0x000d
 
 /* x264_t:
  *      opaque handler for decoder and encoder */
@@ -50,6 +50,10 @@ typedef struct x264_t x264_t;
 #define X264_ANALYSE_I4x4       0x0001  /* Analyse i4x4 */
 #define X264_ANALYSE_PSUB16x16  0x0010  /* Analyse p16x8, p8x16 and p8x8 */
 #define X264_ANALYSE_PSUB8x8    0x0020  /* Analyse p8x4, p4x8, p4x4 */
+#define X264_ANALYSE_BSUB16x16  0x0100  /* Analyse b16x8, b8x16 and b8x8 */
+#define X264_DIRECT_PRED_NONE        0  /* values for analyse.i_direct_mv_pred */
+#define X264_DIRECT_PRED_TEMPORAL    1  /* default chosen by x264_param_default */
+#define X264_DIRECT_PRED_SPATIAL     2  /* makes the slice header set b_direct_spatial_mv_pred */
 
 /* Colorspace type
  */
@@ -127,6 +131,8 @@ typedef struct
         unsigned int intra;     /* intra flags */
         unsigned int inter;     /* inter flags */
 
+        int          i_direct_mv_pred; /* spatial vs temporal mv prediction */
+
         int          i_subpel_refine; /* subpixel motion estimation quality */
 
         int          b_psnr;    /* Do we compute PSNR stats (save a few % of cpu) */