From a0012bf38d366b1b97e571fe27c665139f3c631c Mon Sep 17 00:00:00 2001
From: Loren Merritt <pengvado@videolan.org>
Date: Sat, 8 Oct 2005 06:49:29 +0000
Subject: [PATCH] new option: --mixed-refs Allows each 8x8 or 16x8 partition to
 independently select a reference frame, as opposed to only one ref per
 macroblock. patch mostly by Alex Wright (alexw0885 at hotmail dot com).

git-svn-id: svn://svn.videolan.org/x264/trunk@318 df754926-b1dd-0310-bc7b-ec298dee348c
---
 encoder/analyse.c | 264 ++++++++++++++++++++++++++++++++++------------
 encoder/me.c      |   7 +-
 encoder/me.h      |   2 +
 x264.c            |   6 ++
 x264.h            |  13 ++-
 5 files changed, 217 insertions(+), 75 deletions(-)

diff --git a/encoder/analyse.c b/encoder/analyse.c
index 09b98569..30fec4f3 100644
--- a/encoder/analyse.c
+++ b/encoder/analyse.c
@@ -44,6 +44,8 @@ typedef struct
 
     /* 8x8 */
     int       i_cost8x8;
+    int       mvc[16][5][2]; /* [ref][0] is 16x16 mv,
+                                [ref][1..4] are 8x8 mv from partition [0..3] */
     x264_me_t me8x8[4];
 
     /* Sub 4x4 */
@@ -663,6 +665,7 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_cost_
     (m)->p_fenc[0] = &(src)[0][(xoff)+(yoff)*(m)->i_stride[0]]; \
     (m)->p_fenc[1] = &(src)[1][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]]; \
     (m)->p_fenc[2] = &(src)[2][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]];
+
 #define LOAD_HPELS(m, src, xoff, yoff) \
     (m)->p_fref[0] = &(src)[0][(xoff)+(yoff)*(m)->i_stride[0]]; \
     (m)->p_fref[1] = &(src)[1][(xoff)+(yoff)*(m)->i_stride[0]]; \
@@ -671,6 +674,9 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_cost_
     (m)->p_fref[4] = &(src)[4][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]]; \
     (m)->p_fref[5] = &(src)[5][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]];
 
+#define REF_COST(list, ref) \
+    (a->i_lambda * bs_size_te( h->sh.i_num_ref_idx_l##list##_active - 1, ref ))
+
 static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
 {
     x264_me_t m;
@@ -687,8 +693,10 @@ static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
     a->l0.me16x16.cost = INT_MAX;
     for( i_ref = 0; i_ref < h->i_ref0; i_ref++ )
     {
-        const int i_ref_cost = a->i_lambda * bs_size_te( h->sh.i_num_ref_idx_l0_active - 1, i_ref );
+        const int i_ref_cost = REF_COST( 0, i_ref );
         i_fullpel_thresh -= i_ref_cost;
+        m.i_ref_cost = i_ref_cost;
+        m.i_ref = i_ref;
 
         /* search with ref */
         LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, 0 );
@@ -700,18 +708,16 @@ static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
         i_fullpel_thresh += i_ref_cost;
 
         if( m.cost < a->l0.me16x16.cost )
-        {
-            a->l0.i_ref = i_ref;
             a->l0.me16x16 = m;
-        }
 
         /* save mv for predicting neighbors */
+        a->l0.mvc[i_ref][0][0] =
         h->mb.mvr[0][i_ref][h->mb.i_mb_xy][0] = m.mv[0];
+        a->l0.mvc[i_ref][0][1] =
         h->mb.mvr[0][i_ref][h->mb.i_mb_xy][1] = m.mv[1];
     }
 
-    /* Set global ref, needed for all others modes */
-    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref );
+    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
 
     if( a->b_mbrd )
     {
@@ -721,26 +727,105 @@ static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
         x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv[0], a->l0.me16x16.mv[1] );
         a->l0.me16x16.cost = x264_rd_cost_mb( h, a->i_lambda2 );
     }
-    else
+}
+
+static void x264_mb_analyse_inter_p8x8_mixed_ref( x264_t *h, x264_mb_analysis_t *a )
+{
+    x264_me_t m;
+    int i_ref;
+    uint8_t  **p_fenc = h->mb.pic.p_fenc;
+    int i_fullpel_thresh = INT_MAX;
+    int *p_fullpel_thresh = /*h->i_ref0>1 ? &i_fullpel_thresh : */NULL;
+    int i;
+    int i_maxref = h->i_ref0-1;
+
+    h->mb.i_partition = D_8x8;
+
+    /* early termination: if 16x16 chose ref 0, then evalute no refs older
+     * than those used by the neighbors */
+    if( i_maxref > 0 && a->l0.me16x16.i_ref == 0 &&
+        h->mb.i_mb_type_top && h->mb.i_mb_type_left )
     {
-        /* subtract ref cost, so we don't have to add it for the other P types */
-        a->l0.me16x16.cost -= a->i_lambda * bs_size_te( h->sh.i_num_ref_idx_l0_active - 1, a->l0.i_ref );
+        i_maxref = 0;
+        i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 - 8 - 1 ] );
+        i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 - 8 + 0 ] );
+        i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 - 8 + 2 ] );
+        i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 - 8 + 4 ] );
+        i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 + 0 - 1 ] );
+        i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 + 2*8 - 1 ] );
+    }
+
+    for( i_ref = 0; i_ref <= i_maxref; i_ref++ )
+    {
+         a->l0.mvc[i_ref][0][0] = h->mb.mvr[0][i_ref][h->mb.i_mb_xy][0];
+         a->l0.mvc[i_ref][0][1] = h->mb.mvr[0][i_ref][h->mb.i_mb_xy][1];
+    }
+
+    for( i = 0; i < 4; i++ )
+    {
+        x264_me_t *l0m = &a->l0.me8x8[i];
+        const int x8 = i%2;
+        const int y8 = i/2;
+
+        m.i_pixel = PIXEL_8x8;
+        m.p_cost_mv = a->p_cost_mv;
+
+        LOAD_FENC( &m, p_fenc, 8*x8, 8*y8 );
+        l0m->cost = INT_MAX;
+        for( i_ref = 0; i_ref <= i_maxref; i_ref++ )
+        {
+             const int i_ref_cost = REF_COST( 0, i_ref );
+             i_fullpel_thresh -= i_ref_cost;
+             m.i_ref_cost = i_ref_cost;
+             m.i_ref = i_ref;
+
+             LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 8*x8, 8*y8 );
+             x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, i_ref );
+             x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
+             x264_me_search_ref( h, &m, a->l0.mvc[i_ref], i+1, p_fullpel_thresh );
+
+             m.cost += i_ref_cost;
+             i_fullpel_thresh += i_ref_cost;
+             *(uint64_t*)a->l0.mvc[i_ref][i+1] = *(uint64_t*)m.mv;
+
+             if( m.cost < l0m->cost )
+                 *l0m = m;
+        }
+        x264_macroblock_cache_mv( h, 2*x8, 2*y8, 2, 2, 0, l0m->mv[0], l0m->mv[1] );
+        x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, l0m->i_ref );
+
+        /* mb type cost */
+        l0m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
+    }
+
+    a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
+                      a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
+    if( a->b_mbrd )
+    {
+        if( a->i_best_satd > a->l0.i_cost8x8 )
+            a->i_best_satd = a->l0.i_cost8x8;
+        h->mb.i_type = P_8x8;
+        h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
+        h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
+        a->l0.i_cost8x8 = x264_rd_cost_mb( h, a->i_lambda2 );
     }
 }
 
 static void x264_mb_analyse_inter_p8x8( x264_t *h, x264_mb_analysis_t *a )
 {
-    uint8_t  **p_fref = h->mb.pic.p_fref[0][a->l0.i_ref];
+    const int i_ref = a->l0.me16x16.i_ref;
+    const int i_ref_cost = REF_COST( 0, i_ref );
+    uint8_t  **p_fref = h->mb.pic.p_fref[0][i_ref];
     uint8_t  **p_fenc = h->mb.pic.p_fenc;
-    int mvc[5][2], i_mvc;
+    int i_mvc;
+    int (*mvc)[2] = a->l0.mvc[i_ref];
     int i;
 
     /* XXX Needed for x264_mb_predict_mv */
     h->mb.i_partition = D_8x8;
 
     i_mvc = 1;
-    mvc[0][0] = a->l0.me16x16.mv[0];
-    mvc[0][1] = a->l0.me16x16.mv[1];
+    *(uint64_t*)mvc[0] = *(uint64_t*)a->l0.me16x16.mv;
 
     for( i = 0; i < 4; i++ )
     {
@@ -750,25 +835,29 @@ static void x264_mb_analyse_inter_p8x8( x264_t *h, x264_mb_analysis_t *a )
 
         m->i_pixel = PIXEL_8x8;
         m->p_cost_mv = a->p_cost_mv;
+        m->i_ref_cost = i_ref_cost;
+        m->i_ref = i_ref;
 
         LOAD_FENC( m, p_fenc, 8*x8, 8*y8 );
         LOAD_HPELS( m, p_fref, 8*x8, 8*y8 );
-
         x264_mb_predict_mv( h, 0, 4*i, 2, m->mvp );
         x264_me_search( h, m, mvc, i_mvc );
 
         x264_macroblock_cache_mv( h, 2*x8, 2*y8, 2, 2, 0, m->mv[0], m->mv[1] );
 
-        mvc[i_mvc][0] = m->mv[0];
-        mvc[i_mvc][1] = m->mv[1];
+        *(uint64_t*)mvc[i_mvc] = *(uint64_t*)m->mv;
         i_mvc++;
 
         /* mb type cost */
+        m->cost += i_ref_cost;
         m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
     }
 
+    /* theoretically this should include 4*ref_cost,
+     * but 3 seems a better approximation of cabac. */
     a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
-                      a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
+                      a->l0.me8x8[2].cost + a->l0.me8x8[3].cost -
+                      REF_COST( 0, a->l0.me16x16.i_ref );
     if( a->b_mbrd )
     {
         if( a->i_best_satd > a->l0.i_cost8x8 )
@@ -782,33 +871,49 @@ static void x264_mb_analyse_inter_p8x8( x264_t *h, x264_mb_analysis_t *a )
 
 static void x264_mb_analyse_inter_p16x8( x264_t *h, x264_mb_analysis_t *a )
 {
-    uint8_t  **p_fref = h->mb.pic.p_fref[0][a->l0.i_ref];
+    x264_me_t m;
     uint8_t  **p_fenc = h->mb.pic.p_fenc;
-    int mvc[2][2];
-    int i;
+    int mvc[3][2];
+    int i, j;
 
     /* XXX Needed for x264_mb_predict_mv */
     h->mb.i_partition = D_16x8;
 
     for( i = 0; i < 2; i++ )
     {
-        x264_me_t *m = &a->l0.me16x8[i];
+        x264_me_t *l0m = &a->l0.me16x8[i];
+        const int ref8[2] = { a->l0.me8x8[2*i].i_ref, a->l0.me8x8[2*i+1].i_ref };
+        const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
 
-        m->i_pixel = PIXEL_16x8;
-        m->p_cost_mv = a->p_cost_mv;
+        m.i_pixel = PIXEL_16x8;
+        m.p_cost_mv = a->p_cost_mv;
 
-        LOAD_FENC( m, p_fenc, 0, 8*i );
-        LOAD_HPELS( m, p_fref, 0, 8*i );
+        LOAD_FENC( &m, p_fenc, 0, 8*i );
+        l0m->cost = INT_MAX;
+        for( j = 0; j < i_ref8s; j++ )
+        {
+             const int i_ref = ref8[j];
+             const int i_ref_cost = REF_COST( 0, i_ref );
+             m.i_ref_cost = i_ref_cost;
+             m.i_ref = i_ref;
+
+             /* if we skipped the 16x16 predictor, we wouldn't have to copy anything... */
+             *(uint64_t*)mvc[0] = *(uint64_t*)a->l0.mvc[i_ref][0];
+             *(uint64_t*)mvc[1] = *(uint64_t*)a->l0.mvc[i_ref][2*i+1];
+             *(uint64_t*)mvc[2] = *(uint64_t*)a->l0.mvc[i_ref][2*i+2];
 
-        mvc[0][0] = a->l0.me8x8[2*i].mv[0];
-        mvc[0][1] = a->l0.me8x8[2*i].mv[1];
-        mvc[1][0] = a->l0.me8x8[2*i+1].mv[0];
-        mvc[1][1] = a->l0.me8x8[2*i+1].mv[1];
+             LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, 8*i );
+             x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, i_ref );
+             x264_mb_predict_mv( h, 0, 8*i, 4, m.mvp );
+             x264_me_search( h, &m, mvc, 3 );
 
-        x264_mb_predict_mv( h, 0, 8*i, 4, m->mvp );
-        x264_me_search( h, m, mvc, 2 );
+             m.cost += i_ref_cost;
 
-        x264_macroblock_cache_mv( h, 0, 2*i, 4, 2, 0, m->mv[0], m->mv[1] );
+             if( m.cost < l0m->cost )
+                 *l0m = m;
+        }
+        x264_macroblock_cache_mv( h, 0, 2*i, 4, 2, 0, l0m->mv[0], l0m->mv[1] );
+        x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, l0m->i_ref );
     }
 
     a->l0.i_cost16x8 = a->l0.me16x8[0].cost + a->l0.me16x8[1].cost;
@@ -823,33 +928,48 @@ static void x264_mb_analyse_inter_p16x8( x264_t *h, x264_mb_analysis_t *a )
 
 static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a )
 {
-    uint8_t  **p_fref = h->mb.pic.p_fref[0][a->l0.i_ref];
+    x264_me_t m;
     uint8_t  **p_fenc = h->mb.pic.p_fenc;
-    int mvc[2][2];
-    int i;
+    int mvc[3][2];
+    int i, j;
 
     /* XXX Needed for x264_mb_predict_mv */
     h->mb.i_partition = D_8x16;
 
     for( i = 0; i < 2; i++ )
     {
-        x264_me_t *m = &a->l0.me8x16[i];
+        x264_me_t *l0m = &a->l0.me8x16[i];
+        const int ref8[2] = { a->l0.me8x8[i].i_ref, a->l0.me8x8[i+2].i_ref };
+        const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
 
-        m->i_pixel = PIXEL_8x16;
-        m->p_cost_mv = a->p_cost_mv;
+        m.i_pixel = PIXEL_8x16;
+        m.p_cost_mv = a->p_cost_mv;
+
+        LOAD_FENC( &m, p_fenc, 8*i, 0 );
+        l0m->cost = INT_MAX;
+        for( j = 0; j < i_ref8s; j++ )
+        {
+             const int i_ref = ref8[j];
+             const int i_ref_cost = REF_COST( 0, i_ref );
+             m.i_ref_cost = i_ref_cost;
+             m.i_ref = i_ref;
 
-        LOAD_FENC( m, p_fenc, 8*i, 0 );
-        LOAD_HPELS( m, p_fref, 8*i, 0 );
+             *(uint64_t*)mvc[0] = *(uint64_t*)a->l0.mvc[i_ref][0];
+             *(uint64_t*)mvc[1] = *(uint64_t*)a->l0.mvc[i_ref][i+1];
+             *(uint64_t*)mvc[2] = *(uint64_t*)a->l0.mvc[i_ref][i+3];
 
-        mvc[0][0] = a->l0.me8x8[i].mv[0];
-        mvc[0][1] = a->l0.me8x8[i].mv[1];
-        mvc[1][0] = a->l0.me8x8[i+2].mv[0];
-        mvc[1][1] = a->l0.me8x8[i+2].mv[1];
+             LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 8*i, 0 );
+             x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, i_ref );
+             x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
+             x264_me_search( h, &m, mvc, 3 );
 
-        x264_mb_predict_mv( h, 0, 4*i, 2, m->mvp );
-        x264_me_search( h, m, mvc, 2 );
+             m.cost += i_ref_cost;
 
-        x264_macroblock_cache_mv( h, 2*i, 0, 2, 4, 0, m->mv[0], m->mv[1] );
+             if( m.cost < l0m->cost )
+                 *l0m = m;
+        }
+        x264_macroblock_cache_mv( h, 2*i, 0, 2, 4, 0, l0m->mv[0], l0m->mv[1] );
+        x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, l0m->i_ref );
     }
 
     a->l0.i_cost8x16 = a->l0.me8x16[0].cost + a->l0.me8x16[1].cost;
@@ -896,7 +1016,7 @@ static int x264_mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a,
 
 static void x264_mb_analyse_inter_p4x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
 {
-    uint8_t  **p_fref = h->mb.pic.p_fref[0][a->l0.i_ref];
+    uint8_t  **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
     uint8_t  **p_fenc = h->mb.pic.p_fenc;
 
     int i4x4;
@@ -924,19 +1044,19 @@ static void x264_mb_analyse_inter_p4x4( x264_t *h, x264_mb_analysis_t *a, int i8
 
         x264_macroblock_cache_mv( h, x4, y4, 1, 1, 0, m->mv[0], m->mv[1] );
     }
-
     a->l0.i_cost4x4[i8x8] = a->l0.me4x4[i8x8][0].cost +
-                         a->l0.me4x4[i8x8][1].cost +
-                         a->l0.me4x4[i8x8][2].cost +
-                         a->l0.me4x4[i8x8][3].cost +
-                         a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x4];
+                            a->l0.me4x4[i8x8][1].cost +
+                            a->l0.me4x4[i8x8][2].cost +
+                            a->l0.me4x4[i8x8][3].cost +
+                            REF_COST( 0, a->l0.me8x8[i8x8].i_ref ) +
+                            a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x4];
     if( h->mb.b_chroma_me )
         a->l0.i_cost4x4[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x4 );
 }
 
 static void x264_mb_analyse_inter_p8x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
 {
-    uint8_t  **p_fref = h->mb.pic.p_fref[0][a->l0.i_ref];
+    uint8_t  **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
     uint8_t  **p_fenc = h->mb.pic.p_fenc;
 
     int i8x4;
@@ -964,8 +1084,8 @@ static void x264_mb_analyse_inter_p8x4( x264_t *h, x264_mb_analysis_t *a, int i8
 
         x264_macroblock_cache_mv( h, x4, y4, 2, 1, 0, m->mv[0], m->mv[1] );
     }
-
     a->l0.i_cost8x4[i8x8] = a->l0.me8x4[i8x8][0].cost + a->l0.me8x4[i8x8][1].cost +
+                            REF_COST( 0, a->l0.me8x8[i8x8].i_ref ) +
                             a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x4];
     if( h->mb.b_chroma_me )
         a->l0.i_cost8x4[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_8x4 );
@@ -973,7 +1093,7 @@ static void x264_mb_analyse_inter_p8x4( x264_t *h, x264_mb_analysis_t *a, int i8
 
 static void x264_mb_analyse_inter_p4x8( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
 {
-    uint8_t  **p_fref = h->mb.pic.p_fref[0][a->l0.i_ref];
+    uint8_t  **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
     uint8_t  **p_fenc = h->mb.pic.p_fenc;
 
     int i4x8;
@@ -1001,8 +1121,8 @@ static void x264_mb_analyse_inter_p4x8( x264_t *h, x264_mb_analysis_t *a, int i8
 
         x264_macroblock_cache_mv( h, x4, y4, 1, 2, 0, m->mv[0], m->mv[1] );
     }
-
     a->l0.i_cost4x8[i8x8] = a->l0.me4x8[i8x8][0].cost + a->l0.me4x8[i8x8][1].cost +
+                            REF_COST( 0, a->l0.me8x8[i8x8].i_ref ) +
                             a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x8];
     if( h->mb.b_chroma_me )
         a->l0.i_cost4x8[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x8 );
@@ -1073,7 +1193,7 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
         x264_me_search_ref( h, &m, mvc, i_mvc, p_fullpel_thresh );
 
         /* add ref cost */
-        m.cost += a->i_lambda * bs_size_te( h->sh.i_num_ref_idx_l0_active - 1, i_ref );
+        m.cost += REF_COST( 0, i_ref );
 
         if( m.cost < a->l0.me16x16.cost )
         {
@@ -1086,7 +1206,7 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
         h->mb.mvr[0][i_ref][h->mb.i_mb_xy][1] = m.mv[1];
     }
     /* subtract ref cost, so we don't have to add it for the other MB types */
-    a->l0.me16x16.cost -= a->i_lambda * bs_size_te( h->sh.i_num_ref_idx_l0_active - 1, a->l0.i_ref );
+    a->l0.me16x16.cost -= REF_COST( 0, a->l0.i_ref );
 
     /* ME for list 1 */
     i_fullpel_thresh = INT_MAX;
@@ -1101,7 +1221,7 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
         x264_me_search_ref( h, &m, mvc, i_mvc, p_fullpel_thresh );
 
         /* add ref cost */
-        m.cost += a->i_lambda * bs_size_te( h->sh.i_num_ref_idx_l1_active - 1, i_ref );
+        m.cost += REF_COST( 1, i_ref );
 
         if( m.cost < a->l1.me16x16.cost )
         {
@@ -1114,7 +1234,7 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
         h->mb.mvr[1][i_ref][h->mb.i_mb_xy][1] = m.mv[1];
     }
     /* subtract ref cost, so we don't have to add it for the other MB types */
-    a->l1.me16x16.cost -= a->i_lambda * bs_size_te( h->sh.i_num_ref_idx_l1_active - 1, a->l1.i_ref );
+    a->l1.me16x16.cost -= REF_COST( 1, a->l1.i_ref );
 
     /* Set global ref, needed for other modes? */
     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref );
@@ -1157,8 +1277,8 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
         h->mc.avg[PIXEL_16x16]( pix1, 16, src2, stride2 );
 
     a->i_cost16x16bi = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], h->mb.pic.i_stride[0], pix1, 16 )
-                     + a->i_lambda * ( bs_size_te( h->sh.i_num_ref_idx_l0_active - 1, a->l0.i_ref )
-                                     + bs_size_te( h->sh.i_num_ref_idx_l1_active - 1, a->l1.i_ref ) )
+                     + REF_COST( 0, a->l0.i_ref )
+                     + REF_COST( 1, a->l1.i_ref )
                      + a->l0.me16x16.cost_mv
                      + a->l1.me16x16.cost_mv;
 
@@ -1579,7 +1699,12 @@ void x264_macroblock_analyse( x264_t *h )
 
             x264_mb_analyse_inter_p16x16( h, &analysis );
             if( flags & X264_ANALYSE_PSUB16x16 )
-                x264_mb_analyse_inter_p8x8( h, &analysis );
+            {
+                if( h->param.analyse.b_mixed_references )
+                    x264_mb_analyse_inter_p8x8_mixed_ref( h, &analysis );
+                else
+                    x264_mb_analyse_inter_p8x8( h, &analysis );
+            }
 
             /* Select best inter mode */
             i_type = P_L0;
@@ -1993,19 +2118,23 @@ static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a  )
             break;
 
         case P_L0:
-            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref );
             switch( h->mb.i_partition )
             {
                 case D_16x16:
+                    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
                     x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv[0], a->l0.me16x16.mv[1] );
                     break;
 
                 case D_16x8:
+                    x264_macroblock_cache_ref( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].i_ref );
+                    x264_macroblock_cache_ref( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].i_ref );
                     x264_macroblock_cache_mv ( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].mv[0], a->l0.me16x8[0].mv[1] );
                     x264_macroblock_cache_mv ( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].mv[0], a->l0.me16x8[1].mv[1] );
                     break;
 
                 case D_8x16:
+                    x264_macroblock_cache_ref( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].i_ref );
+                    x264_macroblock_cache_ref( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].i_ref );
                     x264_macroblock_cache_mv ( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].mv[0], a->l0.me8x16[0].mv[1] );
                     x264_macroblock_cache_mv ( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].mv[0], a->l0.me8x16[1].mv[1] );
                     break;
@@ -2017,7 +2146,10 @@ static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a  )
             break;
 
         case P_8x8:
-            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref );
+            x264_macroblock_cache_ref( h, 0, 0, 2, 2, 0, a->l0.me8x8[0].i_ref );
+            x264_macroblock_cache_ref( h, 2, 0, 2, 2, 0, a->l0.me8x8[1].i_ref );
+            x264_macroblock_cache_ref( h, 0, 2, 2, 2, 0, a->l0.me8x8[2].i_ref );
+            x264_macroblock_cache_ref( h, 2, 2, 2, 2, 0, a->l0.me8x8[3].i_ref );
             for( i = 0; i < 4; i++ )
                 x264_mb_cache_mv_p8x8( h, a, i );
             break;
diff --git a/encoder/me.c b/encoder/me.c
index 5ff786d4..b2717547 100644
--- a/encoder/me.c
+++ b/encoder/me.c
@@ -287,8 +287,11 @@ void x264_me_refine_qpel( x264_t *h, x264_me_t *m )
 {
     int hpel = subpel_iterations[h->mb.i_subpel_refine][0];
     int qpel = subpel_iterations[h->mb.i_subpel_refine][1];
-//  if( hpel || qpel )
-	refine_subpel( h, m, hpel, qpel );
+
+    if( m->i_pixel <= PIXEL_8x8 && h->sh.i_type == SLICE_TYPE_P )
+        m->cost -= m->i_ref_cost;
+	
+    refine_subpel( h, m, hpel, qpel );
 }
 
 #define COST_MV( mx, my ) \
diff --git a/encoder/me.h b/encoder/me.h
index 80e88077..20b18668 100644
--- a/encoder/me.h
+++ b/encoder/me.h
@@ -31,6 +31,8 @@ typedef struct
     /* input */
     int      i_pixel;   /* PIXEL_WxH */
     int16_t *p_cost_mv; /* lambda * nbits for each possible mv */
+    int      i_ref_cost;
+    int      i_ref;
 
     uint8_t *p_fref[6];
     uint8_t *p_fenc[3];
diff --git a/x264.c b/x264.c
index fd5f21a2..67f77494 100644
--- a/x264.c
+++ b/x264.c
@@ -253,6 +253,7 @@ static void Help( x264_param_t *defaults )
              "      --merange <integer>     Maximum motion vector search range [%d]\n"
              "  -m, --subme <integer>       Subpixel motion estimation and partition\n"
              "                                  decision quality: 1=fast, 6=best. [%d]\n"
+             "      --mixed-refs            Decide references on a per partition basis\n"
              "      --no-chroma-me          Ignore chroma in motion estimation\n"
              "  -8, --8x8dct                Adaptive spatial transform size\n"
              "\n"
@@ -468,6 +469,7 @@ static int  Parse( int argc, char **argv,
 #define OPT_TRANSFER 311
 #define OPT_COLOURMATRIX 312
 #define OPT_CHROMALOC 313
+#define OPT_MIXED_REFS 314
 
         static struct option long_options[] =
         {
@@ -500,6 +502,7 @@ static int  Parse( int argc, char **argv,
             { "me",      required_argument, NULL, OPT_ME },
             { "merange", required_argument, NULL, OPT_MERANGE },
             { "subme",   required_argument, NULL, 'm' },
+            { "mixed-refs", no_argument,    NULL, OPT_MIXED_REFS },
             { "no-chroma-me", no_argument,  NULL, OPT_NO_CHROMA_ME },
             { "8x8dct",  no_argument,       NULL, '8' },
             { "level",   required_argument, NULL, OPT_LEVEL },
@@ -716,6 +719,9 @@ static int  Parse( int argc, char **argv,
             case 'm':
                 param->analyse.i_subpel_refine = atoi(optarg);
                 break;
+            case OPT_MIXED_REFS:
+                param->analyse.b_mixed_references = 1;
+                break;
             case OPT_NO_CHROMA_ME:
                 param->analyse.b_chroma_me = 0;
                 break;
diff --git a/x264.h b/x264.h
index e5a1728c..a0158007 100644
--- a/x264.h
+++ b/x264.h
@@ -35,7 +35,7 @@
 
 #include <stdarg.h>
 
-#define X264_BUILD 35
+#define X264_BUILD 36
 
 /* x264_t:
  *      opaque handler for decoder and encoder */
@@ -187,17 +187,16 @@ typedef struct
         unsigned int inter;     /* inter partitions */
 
         int          b_transform_8x8;
-
+        int          b_weighted_bipred; /* implicit weighting for B-frames */
         int          i_direct_mv_pred; /* spatial vs temporal mv prediction */
+        int          i_chroma_qp_offset;
+
         int          i_me_method; /* motion estimation algorithm to use (X264_ME_*) */
         int          i_me_range; /* integer pixel motion estimation search range (from predicted mv) */
+        int          i_mv_range; /* maximum length of a mv (in pixels) */
         int          i_subpel_refine; /* subpixel motion estimation quality */
         int          b_chroma_me; /* chroma ME for subpel and mode decision in P-frames */
-        int          i_mv_range; /* maximum length of a mv (in pixels) */
-
-        int          b_weighted_bipred; /* implicit weighting for B-frames */
-
-        int          i_chroma_qp_offset;
+        int          b_mixed_references; /* allow each mb partition in P-frames to have it's own reference number */
 
         int          b_psnr;    /* Do we compute PSNR stats (save a few % of cpu) */
     } analyse;
-- 
2.40.0