granicus.if.org Git - libx264/commitdiff
Before evaluating the RD score of any mode, check satd and abort if it's much worse than some other mode.
author Loren Merritt <pengvado@videolan.org>
Mon, 24 Apr 2006 19:34:06 +0000 (19:34 +0000)
committer Loren Merritt <pengvado@videolan.org>
Mon, 24 Apr 2006 19:34:06 +0000 (19:34 +0000)
Also apply more early termination to intra search.
speed at -m1:+1%, -m4:+3%, -m6:+8%, -m7:+20%

git-svn-id: svn://svn.videolan.org/x264/trunk@511 df754926-b1dd-0310-bc7b-ec298dee348c

encoder/analyse.c
encoder/me.c
encoder/me.h

index 8b90a2a8ec9a873256acff7674146865a464b22b..819c51ff41ba2d4111808451404b83d76f39a944 100644 (file)
@@ -38,6 +38,7 @@ typedef struct
 {
     /* 16x16 */
     int i_ref;
+    int       i_rd16x16;
     x264_me_t me16x16;
 
     /* 8x8 */
@@ -81,21 +82,22 @@ typedef struct
     /* I: Intra part */
     /* Take some shortcuts in intra search if intra is deemed unlikely */
     int b_fast_intra;
-    int i_best_satd;
     int b_try_pskip;
 
     /* Luma part */
-    int i_sad_i16x16;
+    int i_satd_i16x16;
+    int i_satd_i16x16_dir[7];
     int i_predict16x16;
 
-    int i_sad_i8x8;
-    int i_predict8x8[2][2];
+    int i_satd_i8x8;
+    int i_satd_i8x8_dir[12][4];
+    int i_predict8x8[4];
 
-    int i_sad_i4x4;
-    int i_predict4x4[4][4];
+    int i_satd_i4x4;
+    int i_predict4x4[16];
 
     /* Chroma part */
-    int i_sad_i8x8chroma;
+    int i_satd_i8x8chroma;
     int i_predict8x8chroma;
 
     /* II: Inter part P/B frame */
@@ -108,6 +110,11 @@ typedef struct
     int i_cost8x8direct[4];
     int i_cost16x8bi;
     int i_cost8x16bi;
+    int i_rd16x16bi;
+    int i_rd16x16direct;
+    int i_rd16x8bi;
+    int i_rd8x16bi;
+    int i_rd8x8bi;
 
     int i_mb_partition16x8[2]; /* mb_partition_e */
     int i_mb_partition8x16[2];
@@ -200,13 +207,12 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
     h->mb.b_noise_reduction = 0;
 
     /* I: Intra part */
-    a->i_sad_i16x16 =
-    a->i_sad_i8x8   =
-    a->i_sad_i4x4   =
-    a->i_sad_i8x8chroma = COST_MAX;
+    a->i_satd_i16x16 =
+    a->i_satd_i8x8   =
+    a->i_satd_i4x4   =
+    a->i_satd_i8x8chroma = COST_MAX;
 
     a->b_fast_intra = 0;
-    a->i_best_satd = COST_MAX;
 
     /* II: Inter part P/B frame */
     if( h->sh.i_type != SLICE_TYPE_I )
@@ -234,6 +240,7 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
 #undef CLIP_FMV
 
         a->l0.me16x16.cost =
+        a->l0.i_rd16x16    =
         a->l0.i_cost8x8    = COST_MAX;
 
         for( i = 0; i < 4; i++ )
@@ -248,6 +255,7 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
         if( h->sh.i_type == SLICE_TYPE_B )
         {
             a->l1.me16x16.cost =
+            a->l1.i_rd16x16    =
             a->l1.i_cost8x8    = COST_MAX;
 
             for( i = 0; i < 4; i++ )
@@ -260,7 +268,11 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
 
             a->l1.i_cost16x8   =
             a->l1.i_cost8x16   =
-
+            a->i_rd16x16bi     =
+            a->i_rd16x16direct =
+            a->i_rd8x8bi       =
+            a->i_rd16x8bi      =
+            a->i_rd8x16bi      =
             a->i_cost16x16bi   =
             a->i_cost16x16direct =
             a->i_cost8x8bi     =
@@ -271,8 +283,7 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
         /* Fast intra decision */
         if( h->mb.i_mb_xy - h->sh.i_first_mb > 4 )
         {
-            if( a->b_mbrd
-               || IS_INTRA( h->mb.i_mb_type_left )
+            if(   IS_INTRA( h->mb.i_mb_type_left )
                || IS_INTRA( h->mb.i_mb_type_top )
                || IS_INTRA( h->mb.i_mb_type_topleft )
                || IS_INTRA( h->mb.i_mb_type_topright )
@@ -415,7 +426,7 @@ static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a )
 
     uint8_t *p_dstc[2], *p_srcc[2];
 
-    if( a->i_sad_i8x8chroma < COST_MAX )
+    if( a->i_satd_i8x8chroma < COST_MAX )
         return;
 
     /* 8x8 prediction selection for chroma */
@@ -425,10 +436,10 @@ static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a )
     p_srcc[1] = h->mb.pic.p_fenc[2];
 
     predict_8x8chroma_mode_available( h->mb.i_neighbour, predict_mode, &i_max );
-    a->i_sad_i8x8chroma = COST_MAX;
+    a->i_satd_i8x8chroma = COST_MAX;
     for( i = 0; i < i_max; i++ )
     {
-        int i_sad;
+        int i_satd;
         int i_mode;
 
         i_mode = predict_mode[i];
@@ -438,39 +449,27 @@ static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a )
         h->predict_8x8c[i_mode]( p_dstc[1] );
 
         /* we calculate the cost */
-        i_sad = h->pixf.mbcmp[PIXEL_8x8]( p_dstc[0], FDEC_STRIDE,
+        i_satd = h->pixf.mbcmp[PIXEL_8x8]( p_dstc[0], FDEC_STRIDE,
                                           p_srcc[0], FENC_STRIDE ) +
                 h->pixf.mbcmp[PIXEL_8x8]( p_dstc[1], FDEC_STRIDE,
                                           p_srcc[1], FENC_STRIDE ) +
                 a->i_lambda * bs_size_ue( x264_mb_pred_mode8x8c_fix[i_mode] );
 
-        /* if i_score is lower it is better */
-        if( a->i_sad_i8x8chroma > i_sad )
-        {
-            a->i_predict8x8chroma = i_mode;
-            a->i_sad_i8x8chroma   = i_sad;
-        }
+        COPY2_IF_LT( a->i_satd_i8x8chroma, i_satd, a->i_predict8x8chroma, i_mode );
     }
 
     h->mb.i_chroma_pred_mode = a->i_predict8x8chroma;
 }
 
-static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_cost_inter )
+static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter )
 {
     const unsigned int flags = h->sh.i_type == SLICE_TYPE_I ? h->param.analyse.intra : h->param.analyse.inter;
     uint8_t  *p_src = h->mb.pic.p_fenc[0];
     uint8_t  *p_dst = h->mb.pic.p_fdec[0];
-    int      f8_satd_rd_ratio = 0;
 
     int i, idx;
     int i_max;
     int predict_mode[9];
-    int i_satd_thresh;
-
-    if( h->sh.i_type == SLICE_TYPE_B )
-        i_satd_thresh = a->i_best_satd * 9/8;
-    else
-        i_satd_thresh = a->i_best_satd * 5/4 + a->i_lambda * 10;
 
     /*---------------- Try all mode and calculate their score ---------------*/
 
@@ -478,186 +477,167 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_cost_
     predict_16x16_mode_available( h->mb.i_neighbour, predict_mode, &i_max );
     for( i = 0; i < i_max; i++ )
     {
-        int i_sad;
-        int i_mode;
-
-        i_mode = predict_mode[i];
+        int i_satd;
+        int i_mode = predict_mode[i];
         h->predict_16x16[i_mode]( p_dst );
 
-        i_sad = h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE ) +
+        i_satd = h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE ) +
                 a->i_lambda * bs_size_ue( x264_mb_pred_mode16x16_fix[i_mode] );
-        if( a->i_sad_i16x16 > i_sad )
-        {
-            a->i_predict16x16 = i_mode;
-            a->i_sad_i16x16   = i_sad;
-        }
+        COPY2_IF_LT( a->i_satd_i16x16, i_satd, a->i_predict16x16, i_mode );
+        a->i_satd_i16x16_dir[i_mode] = i_satd;
     }
 
-    if( a->b_mbrd )
-    {
-        f8_satd_rd_ratio = ((unsigned)i_cost_inter << 8) / a->i_best_satd + 1;
-        x264_mb_analyse_intra_chroma( h, a );
-        if( h->mb.b_chroma_me )
-            a->i_sad_i16x16 += a->i_sad_i8x8chroma;
-        if( a->i_sad_i16x16 < i_satd_thresh )
-        {
-            h->mb.i_type = I_16x16;
-            h->mb.i_intra16x16_pred_mode = a->i_predict16x16;
-            a->i_sad_i16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
-        }
-        else
-            a->i_sad_i16x16 = a->i_sad_i16x16 * f8_satd_rd_ratio >> 8;
-    }
-    else
-    {
-        if( h->sh.i_type == SLICE_TYPE_B )
-            /* cavlc mb type prefix */
-            a->i_sad_i16x16 += a->i_lambda * i_mb_b_cost_table[I_16x16];
-        if( a->b_fast_intra && a->i_sad_i16x16 > 2*i_cost_inter )
-            return;
-    }
+    if( h->sh.i_type == SLICE_TYPE_B )
+        /* cavlc mb type prefix */
+        a->i_satd_i16x16 += a->i_lambda * i_mb_b_cost_table[I_16x16];
+    if( a->b_fast_intra && a->i_satd_i16x16 > 2*i_satd_inter )
+        return;
 
-    /* 4x4 prediction selection */
-    if( flags & X264_ANALYSE_I4x4 )
+    /* 8x8 prediction selection */
+    if( flags & X264_ANALYSE_I8x8 )
     {
-        a->i_sad_i4x4 = 0;
-        for( idx = 0; idx < 16; idx++ )
-        {
-            uint8_t *p_src_by;
-            uint8_t *p_dst_by;
-            int     i_best;
-            int x, y;
-            int i_pred_mode;
-
-            i_pred_mode= x264_mb_predict_intra4x4_mode( h, idx );
-            x = block_idx_x[idx];
-            y = block_idx_y[idx];
+        int i_satd_thresh = a->b_mbrd ? COST_MAX : X264_MIN( i_satd_inter, a->i_satd_i16x16 );
+        int i_cost = 0;
 
-            p_src_by = p_src + 4 * x + 4 * y * FENC_STRIDE;
-            p_dst_by = p_dst + 4 * x + 4 * y * FDEC_STRIDE;
-
-            i_best = COST_MAX;
-            predict_4x4_mode_available( h->mb.i_neighbour4[idx], predict_mode, &i_max );
+        // FIXME some bias like in i4x4?
+        if( h->sh.i_type == SLICE_TYPE_B )
+            i_cost += a->i_lambda * i_mb_b_cost_table[I_8x8];
 
-            if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
-                /* emulate missing topright samples */
-                *(uint32_t*) &p_dst_by[4 - FDEC_STRIDE] = p_dst_by[3 - FDEC_STRIDE] * 0x01010101U;
+        for( idx = 0;; idx++ )
+        {
+            int x = idx&1;
+            int y = idx>>1;
+            uint8_t *p_src_by = p_src + 8*x + 8*y*FENC_STRIDE;
+            uint8_t *p_dst_by = p_dst + 8*x + 8*y*FDEC_STRIDE;
+            int i_best = COST_MAX;
+            int i_pred_mode = x264_mb_predict_intra4x4_mode( h, 4*idx );
 
+            predict_4x4_mode_available( h->mb.i_neighbour8[idx], predict_mode, &i_max );
             for( i = 0; i < i_max; i++ )
             {
-                int i_sad;
-                int i_mode;
+                int i_satd;
+                int i_mode = predict_mode[i];
 
-                i_mode = predict_mode[i];
-                h->predict_4x4[i_mode]( p_dst_by );
+                h->predict_8x8[i_mode]( p_dst_by, h->mb.i_neighbour8[idx] );
 
-                i_sad = h->pixf.mbcmp[PIXEL_4x4]( p_dst_by, FDEC_STRIDE,
+                /* could use sa8d, but it doesn't seem worth the speed cost (without mmx at least) */
+                i_satd = h->pixf.mbcmp[PIXEL_8x8]( p_dst_by, FDEC_STRIDE,
                                                   p_src_by, FENC_STRIDE )
                       + a->i_lambda * (i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) ? 1 : 4);
 
-                if( i_best > i_sad )
-                {
-                    a->i_predict4x4[x][y] = i_mode;
-                    i_best = i_sad;
-                }
+                COPY2_IF_LT( i_best, i_satd, a->i_predict8x8[idx], i_mode );
+                a->i_satd_i8x8_dir[i_mode][idx] = i_satd;
             }
-            a->i_sad_i4x4 += i_best;
+            i_cost += i_best;
+
+            if( idx == 3 || i_cost > i_satd_thresh )
+                break;
 
             /* we need to encode this block now (for next ones) */
-            h->predict_4x4[a->i_predict4x4[x][y]]( p_dst_by );
-            x264_mb_encode_i4x4( h, idx, a->i_qp );
+            h->predict_8x8[a->i_predict8x8[idx]]( p_dst_by, h->mb.i_neighbour8[idx] );
+            x264_mb_encode_i8x8( h, idx, a->i_qp );
 
-            h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[x][y];
+            x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
         }
 
-        a->i_sad_i4x4 += a->i_lambda * 24;    /* from JVT (SATD0) */
-        if( a->b_mbrd )
-        {
-            if( h->mb.b_chroma_me )
-                a->i_sad_i4x4 += a->i_sad_i8x8chroma;
-            if( a->i_sad_i4x4 < i_satd_thresh )
-            {
-                h->mb.i_type = I_4x4;
-                a->i_sad_i4x4 = x264_rd_cost_mb( h, a->i_lambda2 );
-            }
-            else
-                a->i_sad_i4x4 = a->i_sad_i4x4 * f8_satd_rd_ratio >> 8;
-        }
+        if( idx == 3 )
+            a->i_satd_i8x8 = i_cost;
         else
         {
-            if( h->sh.i_type == SLICE_TYPE_B )
-                a->i_sad_i4x4 += a->i_lambda * i_mb_b_cost_table[I_4x4];
+            a->i_satd_i8x8 = COST_MAX;
+            i_cost = i_cost * 4/(idx+1);
         }
+        if( X264_MIN(i_cost, a->i_satd_i16x16) > i_satd_inter*(5+a->b_mbrd)/4 )
+            return;
     }
 
-    /* 8x8 prediction selection */
-    if( flags & X264_ANALYSE_I8x8 )
+    /* 4x4 prediction selection */
+    if( flags & X264_ANALYSE_I4x4 )
     {
-        a->i_sad_i8x8 = 0;
-        for( idx = 0; idx < 4; idx++ )
+        int i_cost;
+        int i_satd_thresh = X264_MIN3( i_satd_inter, a->i_satd_i16x16, a->i_satd_i8x8 );
+        if( a->b_mbrd )
+            i_satd_thresh = i_satd_thresh * (10-a->b_fast_intra)/8;
+
+        i_cost = a->i_lambda * 24;    /* from JVT (SATD0) */
+        if( h->sh.i_type == SLICE_TYPE_B )
+            i_cost += a->i_lambda * i_mb_b_cost_table[I_4x4];
+
+        for( idx = 0;; idx++ )
         {
-            uint8_t *p_src_by;
-            uint8_t *p_dst_by;
-            int     i_best;
-            int x, y;
-            int i_pred_mode;
+            int x = block_idx_x[idx];
+            int y = block_idx_y[idx];
+            uint8_t *p_src_by = p_src + 4*x + 4*y*FENC_STRIDE;
+            uint8_t *p_dst_by = p_dst + 4*x + 4*y*FDEC_STRIDE;
+            int i_best = COST_MAX;
+            int i_pred_mode = x264_mb_predict_intra4x4_mode( h, idx );
 
-            i_pred_mode= x264_mb_predict_intra4x4_mode( h, 4*idx );
-            x = idx&1;
-            y = idx>>1;
+            predict_4x4_mode_available( h->mb.i_neighbour4[idx], predict_mode, &i_max );
 
-            p_src_by = p_src + 8 * x + 8 * y * FENC_STRIDE;
-            p_dst_by = p_dst + 8 * x + 8 * y * FDEC_STRIDE;
+            if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
+                /* emulate missing topright samples */
+                *(uint32_t*) &p_dst_by[4 - FDEC_STRIDE] = p_dst_by[3 - FDEC_STRIDE] * 0x01010101U;
 
-            i_best = COST_MAX;
-            predict_4x4_mode_available( h->mb.i_neighbour8[idx], predict_mode, &i_max );
             for( i = 0; i < i_max; i++ )
             {
-                int i_sad;
+                int i_satd;
                 int i_mode;
 
                 i_mode = predict_mode[i];
-                h->predict_8x8[i_mode]( p_dst_by, h->mb.i_neighbour8[idx] );
+                h->predict_4x4[i_mode]( p_dst_by );
 
-                /* could use sa8d, but it doesn't seem worth the speed cost (without mmx at least) */
-                i_sad = h->pixf.mbcmp[PIXEL_8x8]( p_dst_by, FDEC_STRIDE,
+                i_satd = h->pixf.mbcmp[PIXEL_4x4]( p_dst_by, FDEC_STRIDE,
                                                   p_src_by, FENC_STRIDE )
                       + a->i_lambda * (i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) ? 1 : 4);
 
-                if( i_best > i_sad )
-                {
-                    a->i_predict8x8[x][y] = i_mode;
-                    i_best = i_sad;
-                }
+                COPY2_IF_LT( i_best, i_satd, a->i_predict4x4[idx], i_mode );
             }
-            a->i_sad_i8x8 += i_best;
+            i_cost += i_best;
 
-            /* we need to encode this block now (for next ones) */
-            h->predict_8x8[a->i_predict8x8[x][y]]( p_dst_by, h->mb.i_neighbour8[idx] );
-            x264_mb_encode_i8x8( h, idx, a->i_qp );
+            if( i_cost > i_satd_thresh || idx == 15 )
+                break;
 
-            x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[x][y] );
-        }
+            /* we need to encode this block now (for next ones) */
+            h->predict_4x4[a->i_predict4x4[idx]]( p_dst_by );
+            x264_mb_encode_i4x4( h, idx, a->i_qp );
 
-        if( a->b_mbrd )
-        {
-            if( h->mb.b_chroma_me )
-                a->i_sad_i8x8 += a->i_sad_i8x8chroma;
-            if( a->i_sad_i8x8 < i_satd_thresh )
-            {
-                h->mb.i_type = I_8x8;
-                a->i_sad_i8x8 = x264_rd_cost_mb( h, a->i_lambda2 );
-            }
-            else
-                a->i_sad_i8x8 = a->i_sad_i8x8 * f8_satd_rd_ratio >> 8;
+            h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx];
         }
+        if( idx == 15 )
+            a->i_satd_i4x4 = i_cost;
         else
-        {
-            // FIXME some bias like in i4x4?
-            if( h->sh.i_type == SLICE_TYPE_B )
-                a->i_sad_i8x8 += a->i_lambda * i_mb_b_cost_table[I_8x8];
-        }
+            a->i_satd_i4x4 = COST_MAX;
+    }
+}
+
+static void x264_intra_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_thresh )
+{
+    if( a->i_satd_i16x16 <= i_satd_thresh )
+    {
+        h->mb.i_type = I_16x16;
+        x264_analyse_update_cache( h, a );
+        a->i_satd_i16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
+    }
+    else
+        a->i_satd_i16x16 = COST_MAX;
+
+    if( a->i_satd_i4x4 <= i_satd_thresh && a->i_satd_i4x4 < COST_MAX )
+    {
+        h->mb.i_type = I_4x4;
+        x264_analyse_update_cache( h, a );
+        a->i_satd_i4x4 = x264_rd_cost_mb( h, a->i_lambda2 );
+    }
+    else
+        a->i_satd_i4x4 = COST_MAX;
+
+    if( a->i_satd_i8x8 <= i_satd_thresh && a->i_satd_i8x8 < COST_MAX )
+    {
+        h->mb.i_type = I_8x8;
+        x264_analyse_update_cache( h, a );
+        a->i_satd_i8x8 = x264_rd_cost_mb( h, a->i_lambda2 );
     }
+    else
+        a->i_satd_i8x8 = COST_MAX;
 }
 
 static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
@@ -666,26 +646,24 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
     uint8_t  *p_dst = h->mb.pic.p_fdec[0];
 
     int i, idx, x, y;
-    int i_max, i_sad, i_best, i_mode;
+    int i_max, i_satd, i_best, i_mode;
     int i_pred_mode;
     int predict_mode[9];
 
     if( h->mb.i_type == I_16x16 )
     {
         int old_pred_mode = a->i_predict16x16;
-        i_best = a->i_sad_i16x16;
+        int i_thresh = a->i_satd_i16x16_dir[old_pred_mode] * 9/8;
+        i_best = a->i_satd_i16x16;
         predict_16x16_mode_available( h->mb.i_neighbour, predict_mode, &i_max );
         for( i = 0; i < i_max; i++ )
         {
-            if( predict_mode[i] == old_pred_mode )
+            int i_mode = predict_mode[i];
+            if( i_mode == old_pred_mode || a->i_satd_i16x16_dir[i_mode] > i_thresh )
                 continue;
-            h->mb.i_intra16x16_pred_mode = predict_mode[i];
-            i_sad = x264_rd_cost_mb( h, a->i_lambda2 );
-            if( i_best > i_sad )
-            {
-                a->i_predict16x16 = predict_mode[i];
-                i_best = i_sad;
-            }
+            h->mb.i_intra16x16_pred_mode = i_mode;
+            i_satd = x264_rd_cost_mb( h, a->i_lambda2 );
+            COPY2_IF_LT( i_best, i_satd, a->i_predict16x16, i_mode );
         }
     }
     else if( h->mb.i_type == I_4x4 )
@@ -698,7 +676,7 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
             uint8_t *p_dst_by;
             i_best = COST_MAX;
 
-            i_pred_mode= x264_mb_predict_intra4x4_mode( h, idx );
+            i_pred_mode = x264_mb_predict_intra4x4_mode( h, idx );
             x = block_idx_x[idx];
             y = block_idx_y[idx];
 
@@ -714,13 +692,12 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
             {
                 i_mode = predict_mode[i];
                 h->predict_4x4[i_mode]( p_dst_by );
+                i_satd = x264_rd_cost_i4x4( h, a->i_lambda2, idx, i_mode );
 
-                i_sad = x264_rd_cost_i4x4( h, a->i_lambda2, idx, i_mode );
-
-                if( i_best > i_sad )
+                if( i_best > i_satd )
                 {
-                    a->i_predict4x4[x][y] = i_mode;
-                    i_best = i_sad;
+                    a->i_predict4x4[idx] = i_mode;
+                    i_best = i_satd;
                     pels[0] = *(uint32_t*)(p_dst_by+0*FDEC_STRIDE);
                     pels[1] = *(uint32_t*)(p_dst_by+1*FDEC_STRIDE);
                     pels[2] = *(uint32_t*)(p_dst_by+2*FDEC_STRIDE);
@@ -735,7 +712,7 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
             *(uint32_t*)(p_dst_by+3*FDEC_STRIDE) = pels[3];
             h->mb.cache.non_zero_count[x264_scan8[idx]] = i_nnz;
 
-            h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[x][y];
+            h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx];
         }
     }
     else if( h->mb.i_type == I_8x8 )
@@ -748,9 +725,10 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
             uint8_t *p_src_by;
             uint8_t *p_dst_by;
             int j;
-            i_best = COST_MAX;
+            int i_thresh = a->i_satd_i8x8_dir[a->i_predict8x8[idx]][idx] * 11/8;
 
-            i_pred_mode= x264_mb_predict_intra4x4_mode( h, 4*idx );
+            i_best = COST_MAX;
+            i_pred_mode = x264_mb_predict_intra4x4_mode( h, 4*idx );
             x = idx&1;
             y = idx>>1;
 
@@ -760,14 +738,15 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
             for( i = 0; i < i_max; i++ )
             {
                 i_mode = predict_mode[i];
+                if( a->i_satd_i8x8_dir[i_mode][idx] > i_thresh )
+                    continue;
                 h->predict_8x8[i_mode]( p_dst_by, h->mb.i_neighbour8[idx] );
+                i_satd = x264_rd_cost_i8x8( h, a->i_lambda2, idx, i_mode );
 
-                i_sad = x264_rd_cost_i8x8( h, a->i_lambda2, idx, i_mode );
-
-                if( i_best > i_sad )
+                if( i_best > i_satd )
                 {
-                    a->i_predict8x8[x][y] = i_mode;
-                    i_best = i_sad;
+                    a->i_predict8x8[idx] = i_mode;
+                    i_best = i_satd;
 
                     pels_h = *(uint64_t*)(p_dst_by+7*FDEC_STRIDE);
                     if( !(idx&1) )
@@ -785,7 +764,7 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
             for( j=0; j<3; j++ )
                 h->mb.cache.non_zero_count[x264_scan8[4*idx+j+1]] = i_nnz[j];
 
-            x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[x][y] );
+            x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
         }
     }
 }
@@ -867,12 +846,16 @@ static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
 
     h->mb.i_type = P_L0;
-    if( a->b_mbrd )
+    if( a->b_mbrd && a->l0.i_ref == 0 )
     {
-        a->i_best_satd = a->l0.me16x16.cost;
-        h->mb.i_partition = D_16x16;
-        x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv[0], a->l0.me16x16.mv[1] );
-        a->l0.me16x16.cost = x264_rd_cost_mb( h, a->i_lambda2 );
+        int mvskip[2];
+        x264_mb_predict_mv_pskip( h, mvskip );
+        if( a->l0.me16x16.mv[0] == mvskip[0] && a->l0.me16x16.mv[1] == mvskip[1] )
+        {
+            h->mb.i_partition = D_16x16;
+            x264_macroblock_cache_mv( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv[0], a->l0.me16x16.mv[1] );
+            a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
+        }
     }
 }
 
@@ -947,15 +930,8 @@ static void x264_mb_analyse_inter_p8x8_mixed_ref( x264_t *h, x264_mb_analysis_t
 
     a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
                       a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
-    if( a->b_mbrd )
-    {
-        if( a->i_best_satd > a->l0.i_cost8x8 )
-            a->i_best_satd = a->l0.i_cost8x8;
-        h->mb.i_type = P_8x8;
-        h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
-        h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
-        a->l0.i_cost8x8 = x264_rd_cost_mb( h, a->i_lambda2 );
-    }
+    h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
+    h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
 }
 
 static void x264_mb_analyse_inter_p8x8( x264_t *h, x264_mb_analysis_t *a )
@@ -1005,15 +981,8 @@ static void x264_mb_analyse_inter_p8x8( x264_t *h, x264_mb_analysis_t *a )
     a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
                       a->l0.me8x8[2].cost + a->l0.me8x8[3].cost -
                       REF_COST( 0, a->l0.me16x16.i_ref );
-    if( a->b_mbrd )
-    {
-        if( a->i_best_satd > a->l0.i_cost8x8 )
-            a->i_best_satd = a->l0.i_cost8x8;
-        h->mb.i_type = P_8x8;
-        h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
-        h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
-        a->l0.i_cost8x8 = x264_rd_cost_mb( h, a->i_lambda2 );
-    }
+    h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
+    h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
 }
 
 static void x264_mb_analyse_inter_p16x8( x264_t *h, x264_mb_analysis_t *a )
@@ -1064,13 +1033,6 @@ static void x264_mb_analyse_inter_p16x8( x264_t *h, x264_mb_analysis_t *a )
     }
 
     a->l0.i_cost16x8 = a->l0.me16x8[0].cost + a->l0.me16x8[1].cost;
-    if( a->b_mbrd )
-    {
-        if( a->i_best_satd > a->l0.i_cost16x8 )
-            a->i_best_satd = a->l0.i_cost16x8;
-        h->mb.i_type = P_L0;
-        a->l0.i_cost16x8 = x264_rd_cost_mb( h, a->i_lambda2 );
-    }
 }
 
 static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a )
@@ -1120,13 +1082,6 @@ static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a )
     }
 
     a->l0.i_cost8x16 = a->l0.me8x16[0].cost + a->l0.me8x16[1].cost;
-    if( a->b_mbrd )
-    {
-        if( a->i_best_satd > a->l0.i_cost8x16 )
-            a->i_best_satd = a->l0.i_cost8x16;
-        h->mb.i_type = P_L0;
-        a->l0.i_cost8x16 = x264_rd_cost_mb( h, a->i_lambda2 );
-    }
 }
 
 static int x264_mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a, uint8_t **p_fref, int i8x8, int pixel )
@@ -1286,7 +1241,7 @@ static void x264_mb_analyse_inter_direct( x264_t *h, x264_mb_analysis_t *a )
     uint8_t **p_fdec = h->mb.pic.p_fdec;
     int i;
 
-    a->i_cost16x16direct = 0;
+    a->i_cost16x16direct = a->i_lambda * i_mb_b_cost_table[B_DIRECT];
     for( i = 0; i < 4; i++ )
     {
         const int x = (i&1)*8;
@@ -1298,16 +1253,6 @@ static void x264_mb_analyse_inter_direct( x264_t *h, x264_mb_analysis_t *a )
         /* mb type cost */
         a->i_cost8x8direct[i] += a->i_lambda * i_sub_mb_b_cost_table[D_DIRECT_8x8];
     }
-    a->i_cost16x16direct += a->i_lambda * i_mb_b_cost_table[B_DIRECT];
-
-    if( a->b_mbrd )
-    {
-        if( a->i_cost16x16direct < a->i_best_satd )
-            a->i_best_satd = a->i_cost16x16direct;
-
-        h->mb.i_type = B_DIRECT;
-        a->i_cost16x16direct = x264_rd_cost_mb( h, a->i_lambda2 );
-    }
 }
 
 #define WEIGHTED_AVG( size, pix1, stride1, src2, stride2 ) \
@@ -1438,50 +1383,6 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
     a->i_cost16x16bi   += a->i_lambda * i_mb_b_cost_table[B_BI_BI];
     a->l0.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L0_L0];
     a->l1.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L1_L1];
-
-    if( a->b_mbrd )
-    {
-        int i_satd_thresh;
-
-        if( a->l0.me16x16.cost < a->i_best_satd )
-            a->i_best_satd = a->l0.me16x16.cost;
-        if( a->l1.me16x16.cost < a->i_best_satd )
-            a->i_best_satd = a->l1.me16x16.cost;
-        if( a->i_cost16x16bi < a->i_best_satd )
-            a->i_best_satd = a->i_cost16x16bi;
-
-        i_satd_thresh = a->i_best_satd * 3/2;
-
-        h->mb.i_partition = D_16x16;
-        /* L0 */
-        if( a->l0.me16x16.cost < i_satd_thresh )
-        {
-            h->mb.i_type = B_L0_L0;
-            x264_macroblock_cache_mv( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv[0], a->l0.me16x16.mv[1] );
-            a->l0.me16x16.cost = x264_rd_cost_mb( h, a->i_lambda2 );
-        }
-        else
-            a->l0.me16x16.cost = COST_MAX;
-
-        /* L1 */
-        if( a->l1.me16x16.cost < i_satd_thresh )
-        {
-            h->mb.i_type = B_L1_L1;
-            x264_macroblock_cache_mv( h, 0, 0, 4, 4, 1, a->l1.me16x16.mv[0], a->l1.me16x16.mv[1] );
-            a->l1.me16x16.cost = x264_rd_cost_mb( h, a->i_lambda2 );
-        }
-        else
-            a->l1.me16x16.cost = COST_MAX;
-
-        /* BI */
-        if( a->i_cost16x16bi < i_satd_thresh )
-        {
-            h->mb.i_type = B_BI_BI;
-            a->i_cost16x16bi = x264_rd_cost_mb( h, a->i_lambda2 );
-        }
-        else
-            a->i_cost16x16bi = COST_MAX;
-    }
 }
 
 static inline void x264_mb_cache_mv_p8x8( x264_t *h, x264_mb_analysis_t *a, int i )
@@ -1620,21 +1521,9 @@ static void x264_mb_analyse_inter_b8x8( x264_t *h, x264_mb_analysis_t *a )
 
         i_part_cost = a->l0.me8x8[i].cost;
         h->mb.i_sub_partition[i] = D_L0_8x8;
-        if( a->l1.me8x8[i].cost < i_part_cost )
-        {
-            i_part_cost = a->l1.me8x8[i].cost;
-            h->mb.i_sub_partition[i] = D_L1_8x8;
-        }
-        if( i_part_cost_bi < i_part_cost )
-        {
-            i_part_cost = i_part_cost_bi;
-            h->mb.i_sub_partition[i] = D_BI_8x8;
-        }
-        if( a->i_cost8x8direct[i] < i_part_cost )
-        {
-            i_part_cost = a->i_cost8x8direct[i];
-            h->mb.i_sub_partition[i] = D_DIRECT_8x8;
-        }
+        COPY2_IF_LT( i_part_cost, a->l1.me8x8[i].cost, h->mb.i_sub_partition[i], D_L1_8x8 );
+        COPY2_IF_LT( i_part_cost, i_part_cost_bi, h->mb.i_sub_partition[i], D_BI_8x8 );
+        COPY2_IF_LT( i_part_cost, a->i_cost8x8direct[i], h->mb.i_sub_partition[i], D_DIRECT_8x8 );
         a->i_cost8x8bi += i_part_cost;
 
         /* XXX Needed for x264_mb_predict_mv */
@@ -1643,21 +1532,6 @@ static void x264_mb_analyse_inter_b8x8( x264_t *h, x264_mb_analysis_t *a )
 
     /* mb type cost */
     a->i_cost8x8bi += a->i_lambda * i_mb_b_cost_table[B_8x8];
-
-    if( a->b_mbrd )
-    {
-        if( a->i_cost8x8bi < a->i_best_satd )
-            a->i_best_satd = a->i_cost8x8bi;
-
-        if( a->i_cost8x8bi < a->i_best_satd * 3/2 )
-        {
-            h->mb.i_type = B_8x8;
-            h->mb.i_partition = D_8x8;
-            a->i_cost8x8bi = x264_rd_cost_mb( h, a->i_lambda2 );
-        }
-        else
-            a->i_cost8x8bi = COST_MAX;
-    }
 }
 
 static void x264_mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a )
@@ -1729,22 +1603,8 @@ static void x264_mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a )
         + (a->i_mb_partition16x8[0]>>2) * 3
         + (a->i_mb_partition16x8[1]>>2);
     a->i_cost16x8bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type16x8];
-
-    if( a->b_mbrd )
-    {
-        if( a->i_cost16x8bi < a->i_best_satd )
-            a->i_best_satd = a->i_cost16x8bi;
-
-        if( a->i_cost16x8bi < a->i_best_satd * 3/2 )
-        {
-            h->mb.i_type = a->i_mb_type16x8;
-            h->mb.i_partition = D_16x8;
-            a->i_cost16x8bi = x264_rd_cost_mb( h, a->i_lambda2 );
-        }
-        else
-            a->i_cost16x8bi = COST_MAX;
-    }
 }
+
 static void x264_mb_analyse_inter_b8x16( x264_t *h, x264_mb_analysis_t *a )
 {
     uint8_t **p_fref[2] =
@@ -1813,20 +1673,138 @@ static void x264_mb_analyse_inter_b8x16( x264_t *h, x264_mb_analysis_t *a )
         + (a->i_mb_partition8x16[0]>>2) * 3
         + (a->i_mb_partition8x16[1]>>2);
     a->i_cost8x16bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type8x16];
+}
+
+static void x264_mb_analyse_p_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd ) /* Replace the SATD costs of the P candidates in a->l0 (16x16, 16x8, 8x16, 8x8) with x264_rd_cost_mb() scores, or COST_MAX for candidates whose SATD cost is too far above i_satd. The caller in x264_macroblock_analyse passes min(inter SATD, intra SATD). */
+{
+    int thresh = i_satd * 5/4; /* integer arithmetic: 16x8, 8x16 and 8x8 are RD-scored only if their SATD cost is <= 1.25 * i_satd */
 
-    if( a->b_mbrd )
+    h->mb.i_type = P_L0;
+    if( a->l0.i_rd16x16 == COST_MAX && a->l0.me16x16.cost <= i_satd * 3/2 ) /* 16x16 uses a looser 1.5x threshold and is skipped when an RD score is already stored */
     {
-        if( a->i_cost8x16bi < a->i_best_satd )
-            a->i_best_satd = a->i_cost8x16bi;
+        h->mb.i_partition = D_16x16;
+        x264_analyse_update_cache( h, a );
+        a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
+    }
+    a->l0.me16x16.cost = a->l0.i_rd16x16; /* the SATD cost is overwritten: from here on me16x16.cost holds the RD score (COST_MAX if 16x16 was not scored) */
 
-        if( a->i_cost8x16bi < a->i_best_satd * 3/2 )
+    if( a->l0.i_cost16x8 <= thresh )
+    {
+        h->mb.i_partition = D_16x8;
+        x264_analyse_update_cache( h, a );
+        a->l0.i_cost16x8 = x264_rd_cost_mb( h, a->i_lambda2 ); /* SATD cost replaced in place by the RD score */
+    }
+    else
+        a->l0.i_cost16x8 = COST_MAX; /* rejected on SATD alone: can no longer win the comparison in the caller */
+
+    if( a->l0.i_cost8x16 <= thresh )
+    {
+        h->mb.i_partition = D_8x16;
+        x264_analyse_update_cache( h, a );
+        a->l0.i_cost8x16 = x264_rd_cost_mb( h, a->i_lambda2 );
+    }
+    else
+        a->l0.i_cost8x16 = COST_MAX;
+
+    if( a->l0.i_cost8x8 <= thresh )
+    {
+        h->mb.i_type = P_8x8; /* NOTE(review): h->mb.i_partition is left at whatever the previous branches set; presumably the P_8x8 type alone is enough for update_cache/rd_cost - confirm */
+        x264_analyse_update_cache( h, a );
+        a->l0.i_cost8x8 = x264_rd_cost_mb( h, a->i_lambda2 ); /* RD score with the sub-partitions chosen by the SATD search */
+
+        if( h->param.analyse.inter & X264_ANALYSE_PSUB8x8 )
         {
-            h->mb.i_type = a->i_mb_type8x16;
-            h->mb.i_partition = D_8x16;
-            a->i_cost8x16bi = x264_rd_cost_mb( h, a->i_lambda2 );
+            /* FIXME: RD per subpartition */
+            int part_bak[4]; /* sub-partition choices saved so they can be restored below */
+            int i, i_cost;
+            int b_sub8x8 = 0; /* becomes nonzero if any 8x8 block uses something other than D_L0_8x8 */
+            for( i=0; i<4; i++ )
+            {
+                part_bak[i] = h->mb.i_sub_partition[i];
+                b_sub8x8 |= (part_bak[i] != D_L0_8x8);
+            }
+            if( b_sub8x8 )
+            {
+                h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
+                h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8; /* try the all-8x8 variant as an alternative */
+                i_cost = x264_rd_cost_mb( h, a->i_lambda2 ); /* re-scored without another x264_analyse_update_cache() call */
+                if( a->l0.i_cost8x8 < i_cost ) /* strict compare: on a tie the all-8x8 variant is kept */
+                {
+                    for( i=0; i<4; i++ )
+                        h->mb.i_sub_partition[i] = part_bak[i];
+                }
+                else
+                   a->l0.i_cost8x8 = i_cost;
+            }
         }
-        else
-            a->i_cost8x16bi = COST_MAX;
+    }
+    else
+        a->l0.i_cost8x8 = COST_MAX;
+}
+
+static void x264_mb_analyse_b_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter ) /* Fill the i_rd* fields of *a with x264_rd_cost_mb() scores for the B candidates whose SATD cost is within the threshold. Unlike x264_mb_analyse_p_rd, the SATD costs are left untouched and skipped candidates keep their previous i_rd* value. */
+{
+    int thresh = i_satd_inter * 17/16; /* integer arithmetic: candidates must be within 1/16 (6.25%) of i_satd_inter */
+
+    if( a->b_direct_available && a->i_rd16x16direct == COST_MAX ) /* direct is scored whenever available, with no SATD threshold; the COST_MAX test avoids rescoring on a second call */
+    {
+        h->mb.i_type = B_DIRECT;
+        x264_analyse_update_cache( h, a );
+        a->i_rd16x16direct = x264_rd_cost_mb( h, a->i_lambda2 );
+    }
+
+    //FIXME not all the update_cache calls are needed
+    h->mb.i_partition = D_16x16; /* shared by the L0, L1 and BI 16x16 candidates below */
+    /* L0 */
+    if( a->l0.me16x16.cost <= thresh && a->l0.i_rd16x16 == COST_MAX ) /* same pattern for every candidate: SATD within thresh and not scored yet */
+    {
+        h->mb.i_type = B_L0_L0;
+        x264_analyse_update_cache( h, a );
+        a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
+    }
+
+    /* L1 */
+    if( a->l1.me16x16.cost <= thresh && a->l1.i_rd16x16 == COST_MAX )
+    {
+        h->mb.i_type = B_L1_L1;
+        x264_analyse_update_cache( h, a );
+        a->l1.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
+    }
+
+    /* BI */
+    if( a->i_cost16x16bi <= thresh && a->i_rd16x16bi == COST_MAX )
+    {
+        h->mb.i_type = B_BI_BI;
+        x264_analyse_update_cache( h, a );
+        a->i_rd16x16bi = x264_rd_cost_mb( h, a->i_lambda2 );
+    }
+
+    /* 8x8 */
+    if( a->i_cost8x8bi <= thresh && a->i_rd8x8bi == COST_MAX )
+    {
+        h->mb.i_type = B_8x8;
+        h->mb.i_partition = D_8x8;
+        x264_analyse_update_cache( h, a );
+        a->i_rd8x8bi = x264_rd_cost_mb( h, a->i_lambda2 );
+        x264_macroblock_cache_skip( h, 0, 0, 4, 4, 0 ); /* NOTE(review): presumably clears skip flags over the whole macroblock that the B_8x8 cache update may have set for direct sub-blocks - confirm */
+    }
+
+    /* 16x8 */
+    if( a->i_cost16x8bi <= thresh && a->i_rd16x8bi == COST_MAX )
+    {
+        h->mb.i_type = a->i_mb_type16x8; /* mb type selected earlier by the 16x8 SATD analysis */
+        h->mb.i_partition = D_16x8;
+        x264_analyse_update_cache( h, a );
+        a->i_rd16x8bi = x264_rd_cost_mb( h, a->i_lambda2 );
+    }
+
+    /* 8x16 */
+    if( a->i_cost8x16bi <= thresh && a->i_rd8x16bi == COST_MAX )
+    {
+        h->mb.i_type = a->i_mb_type8x16; /* mb type selected earlier by the 8x16 SATD analysis */
+        h->mb.i_partition = D_8x16;
+        x264_analyse_update_cache( h, a );
+        a->i_rd8x16bi = x264_rd_cost_mb( h, a->i_lambda2 );
     }
 }
 
@@ -1880,28 +1858,28 @@ static inline void x264_mb_analyse_transform( x264_t *h )
     }
 }
 
-static inline void x264_mb_analyse_transform_rd( x264_t *h, x264_mb_analysis_t *a, int *i_cost )
+static inline void x264_mb_analyse_transform_rd( x264_t *h, x264_mb_analysis_t *a, int *i_satd, int *i_rd )
 {
     h->mb.cache.b_transform_8x8_allowed =
         h->param.analyse.b_transform_8x8 && x264_mb_transform_8x8_allowed( h );
 
     if( h->mb.cache.b_transform_8x8_allowed )
     {
-        int i_cost8;
+        int i_rd8;
         x264_analyse_update_cache( h, a );
         h->mb.b_transform_8x8 = !h->mb.b_transform_8x8;
         /* FIXME only luma is needed, but the score for comparison already includes chroma */
-        i_cost8 = x264_rd_cost_mb( h, a->i_lambda2 );
+        i_rd8 = x264_rd_cost_mb( h, a->i_lambda2 );
 
-        if( *i_cost >= i_cost8 )
+        if( *i_rd >= i_rd8 )
         {
-            if( *i_cost > 0 )
-                a->i_best_satd = (int64_t)a->i_best_satd * i_cost8 / *i_cost;
-            /* prevent a rare division by zero in x264_mb_analyse_intra */
-            if( a->i_best_satd == 0 )
-                a->i_best_satd = 1;
+            if( *i_rd > 0 )
+                *i_satd = (int64_t)(*i_satd) * i_rd8 / *i_rd;
+            /* prevent a rare division by zero in estimated intra cost */
+            if( *i_satd == 0 )
+                *i_satd = 1;
 
-            *i_cost = i_cost8;
+            *i_rd = i_rd8;
         }
         else
             h->mb.b_transform_8x8 = !h->mb.b_transform_8x8;
@@ -1925,15 +1903,17 @@ void x264_macroblock_analyse( x264_t *h )
     if( h->sh.i_type == SLICE_TYPE_I )
     {
         x264_mb_analyse_intra( h, &analysis, COST_MAX );
+        if( analysis.b_mbrd )
+            x264_intra_rd( h, &analysis, COST_MAX );
 
-        i_cost = analysis.i_sad_i16x16;
+        i_cost = analysis.i_satd_i16x16;
         h->mb.i_type = I_16x16;
-        if( analysis.i_sad_i4x4 < i_cost )
+        if( analysis.i_satd_i4x4 < i_cost )
         {
-            i_cost = analysis.i_sad_i4x4;
+            i_cost = analysis.i_satd_i4x4;
             h->mb.i_type = I_4x4;
         }
-        if( analysis.i_sad_i8x8 < i_cost )
+        if( analysis.i_satd_i8x8 < i_cost )
             h->mb.i_type = I_8x8;
 
         if( h->mb.i_subpel_refine >= 7 )
@@ -1968,6 +1948,7 @@ void x264_macroblock_analyse( x264_t *h )
             int i_type;
             int i_partition;
             int i_thresh16x8;
+            int i_satd_inter, i_satd_intra;
 
             x264_mb_analyse_load_costs( h, &analysis );
 
@@ -1994,16 +1975,11 @@ void x264_macroblock_analyse( x264_t *h )
             {
                 i_type = P_8x8;
                 i_partition = D_8x8;
-                h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
-                h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
-
                 i_cost = analysis.l0.i_cost8x8;
 
                 /* Do sub 8x8 */
                 if( flags & X264_ANALYSE_PSUB8x8 )
                 {
-                    int i_cost_bak = i_cost;
-                    int b_sub8x8 = 0;
                     for( i = 0; i < 4; i++ )
                     {
                         x264_mb_analyse_inter_p4x4( h, &analysis, i );
@@ -2013,60 +1989,31 @@ void x264_macroblock_analyse( x264_t *h )
                             h->mb.i_sub_partition[i] = D_L0_4x4;
 
                             x264_mb_analyse_inter_p8x4( h, &analysis, i );
-                            if( analysis.l0.i_cost8x4[i] < i_cost8x8 )
-                            {
-                                h->mb.i_sub_partition[i] = D_L0_8x4;
-                                i_cost8x8 = analysis.l0.i_cost8x4[i];
-                            }
+                            COPY2_IF_LT( i_cost8x8, analysis.l0.i_cost8x4[i],
+                                         h->mb.i_sub_partition[i], D_L0_8x4 );
 
                             x264_mb_analyse_inter_p4x8( h, &analysis, i );
-                            if( analysis.l0.i_cost4x8[i] < i_cost8x8 )
-                            {
-                                h->mb.i_sub_partition[i] = D_L0_4x8;
-                                i_cost8x8 = analysis.l0.i_cost4x8[i];
-                            }
+                            COPY2_IF_LT( i_cost8x8, analysis.l0.i_cost4x8[i],
+                                         h->mb.i_sub_partition[i], D_L0_4x8 );
 
                             i_cost += i_cost8x8 - analysis.l0.me8x8[i].cost;
-                            b_sub8x8 = 1;
                         }
                         x264_mb_cache_mv_p8x8( h, &analysis, i );
                     }
-                    /* TODO: RD per subpartition */
-                    if( b_sub8x8 && analysis.b_mbrd )
-                    {
-                        i_cost = x264_rd_cost_mb( h, analysis.i_lambda2 );
-                        if( i_cost > i_cost_bak )
-                        {
-                            i_cost = i_cost_bak;
-                            h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
-                            h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
-                        }
-                    }
+                    analysis.l0.i_cost8x8 = i_cost;
                 }
             }
 
             /* Now do 16x8/8x16 */
             i_thresh16x8 = analysis.l0.me8x8[1].cost_mv + analysis.l0.me8x8[2].cost_mv;
-            if( analysis.b_mbrd )
-                i_thresh16x8 = i_thresh16x8 * analysis.i_lambda2 / analysis.i_lambda;
             if( ( flags & X264_ANALYSE_PSUB16x16 ) &&
                 analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost + i_thresh16x8 )
             {
                 x264_mb_analyse_inter_p16x8( h, &analysis );
-                if( analysis.l0.i_cost16x8 < i_cost )
-                {
-                    i_type = P_L0;
-                    i_partition = D_16x8;
-                    i_cost = analysis.l0.i_cost16x8;
-                }
+                COPY3_IF_LT( i_cost, analysis.l0.i_cost16x8, i_type, P_L0, i_partition, D_16x8 );
 
                 x264_mb_analyse_inter_p8x16( h, &analysis );
-                if( analysis.l0.i_cost8x16 < i_cost )
-                {
-                    i_type = P_L0;
-                    i_partition = D_8x16;
-                    i_cost = analysis.l0.i_cost8x16;
-                }
+                COPY3_IF_LT( i_cost, analysis.l0.i_cost8x16, i_type, P_L0, i_partition, D_8x16 );
             }
 
             h->mb.i_partition = i_partition;
@@ -2075,8 +2022,7 @@ void x264_macroblock_analyse( x264_t *h )
             //FIXME mb_type costs?
             if( analysis.b_mbrd )
             {
-                h->mb.i_type = i_type;
-                x264_mb_analyse_transform_rd( h, &analysis, &i_cost );
+                /* refine later */
             }
             else if( i_partition == D_16x16 )
             {
@@ -2137,37 +2083,46 @@ void x264_macroblock_analyse( x264_t *h )
                 }
             }
 
-            x264_mb_analyse_intra( h, &analysis, i_cost );
-            if( h->mb.b_chroma_me && !analysis.b_mbrd &&
-                ( analysis.i_sad_i16x16 < i_cost
-               || analysis.i_sad_i8x8 < i_cost
-               || analysis.i_sad_i4x4 < i_cost ))
+            if( h->mb.b_chroma_me )
             {
                 x264_mb_analyse_intra_chroma( h, &analysis );
-                analysis.i_sad_i16x16 += analysis.i_sad_i8x8chroma;
-                analysis.i_sad_i8x8 += analysis.i_sad_i8x8chroma;
-                analysis.i_sad_i4x4 += analysis.i_sad_i8x8chroma;
+                x264_mb_analyse_intra( h, &analysis, i_cost - analysis.i_satd_i8x8chroma );
+                analysis.i_satd_i16x16 += analysis.i_satd_i8x8chroma;
+                analysis.i_satd_i8x8 += analysis.i_satd_i8x8chroma;
+                analysis.i_satd_i4x4 += analysis.i_satd_i8x8chroma;
             }
+            else
+                x264_mb_analyse_intra( h, &analysis, i_cost );
 
-            i_intra_type = I_16x16;
-            i_intra_cost = analysis.i_sad_i16x16;
+            i_satd_inter = i_cost;
+            i_satd_intra = X264_MIN3( analysis.i_satd_i16x16,
+                                      analysis.i_satd_i8x8,
+                                      analysis.i_satd_i4x4 );
 
-            if( analysis.i_sad_i8x8 < i_intra_cost )
-            {
-                i_intra_type = I_8x8;
-                i_intra_cost = analysis.i_sad_i8x8;
-            }
-            if( analysis.i_sad_i4x4 < i_intra_cost )
+            if( analysis.b_mbrd )
             {
-                i_intra_type = I_4x4;
-                i_intra_cost = analysis.i_sad_i4x4;
+                x264_mb_analyse_p_rd( h, &analysis, X264_MIN(i_satd_inter, i_satd_intra) );
+                i_type = P_L0;
+                i_partition = D_16x16;
+                i_cost = analysis.l0.me16x16.cost;
+                COPY2_IF_LT( i_cost, analysis.l0.i_cost16x8, i_partition, D_16x8 );
+                COPY2_IF_LT( i_cost, analysis.l0.i_cost8x16, i_partition, D_8x16 );
+                COPY3_IF_LT( i_cost, analysis.l0.i_cost8x8, i_partition, D_8x8, i_type, P_8x8 );
+                h->mb.i_type = i_type;
+                h->mb.i_partition = i_partition;
+                if( i_cost < COST_MAX )
+                    x264_mb_analyse_transform_rd( h, &analysis, &i_satd_inter, &i_cost );
+                x264_intra_rd( h, &analysis, i_satd_inter * 5/4 );
             }
 
-            if( i_intra_cost < i_cost )
-            {
-                i_type = i_intra_type;
-                i_cost = i_intra_cost;
-            }
+            i_intra_type = I_16x16;
+            i_intra_cost = analysis.i_satd_i16x16;
+            COPY2_IF_LT( i_intra_cost, analysis.i_satd_i8x8, i_intra_type, I_8x8 );
+            COPY2_IF_LT( i_intra_cost, analysis.i_satd_i4x4, i_intra_type, I_4x4 );
+            COPY2_IF_LT( i_cost, i_intra_cost, i_type, i_intra_type );
+
+            if( i_intra_cost == COST_MAX )
+                i_intra_cost = i_cost * i_satd_intra / i_satd_inter + 1;
 
             h->mb.i_type = i_type;
             h->stat.frame.i_intra_cost += i_intra_cost;
@@ -2285,27 +2240,22 @@ void x264_macroblock_analyse( x264_t *h )
             i_type = B_L0_L0;
             i_partition = D_16x16;
             i_cost = analysis.l0.me16x16.cost;
-            if( analysis.l1.me16x16.cost < i_cost )
-            {
-                i_type = B_L1_L1;
-                i_cost = analysis.l1.me16x16.cost;
-            }
-            if( analysis.i_cost16x16bi < i_cost )
-            {
-                i_type = B_BI_BI;
-                i_cost = analysis.i_cost16x16bi;
-            }
-            if( analysis.i_cost16x16direct < i_cost )
-            {
-                i_type = B_DIRECT;
-                i_cost = analysis.i_cost16x16direct;
-            }
+            COPY2_IF_LT( i_cost, analysis.l1.me16x16.cost, i_type, B_L1_L1 );
+            COPY2_IF_LT( i_cost, analysis.i_cost16x16bi, i_type, B_BI_BI );
+            COPY2_IF_LT( i_cost, analysis.i_cost16x16direct, i_type, B_DIRECT );
 
-            if( i_bskip_cost <= i_cost )
+            if( analysis.b_mbrd && analysis.i_cost16x16direct <= i_cost * 33/32 )
             {
-                h->mb.i_type = B_SKIP;
-                x264_analyse_update_cache( h, &analysis );
-                return;
+                x264_mb_analyse_b_rd( h, &analysis, i_cost );
+                if( i_bskip_cost < analysis.i_rd16x16direct &&
+                    i_bskip_cost < analysis.i_rd16x16bi &&
+                    i_bskip_cost < analysis.l0.i_rd16x16 &&
+                    i_bskip_cost < analysis.l1.i_rd16x16 )
+                {
+                    h->mb.i_type = B_SKIP;
+                    x264_analyse_update_cache( h, &analysis );
+                    return;
+                }
             }
 
             if( flags & X264_ANALYSE_BSUB16x16 )
@@ -2321,33 +2271,24 @@ void x264_macroblock_analyse( x264_t *h )
                         h->mb.i_sub_partition[2] == h->mb.i_sub_partition[3] )
                     {
                         x264_mb_analyse_inter_b16x8( h, &analysis );
-                        if( analysis.i_cost16x8bi < i_cost )
-                        {
-                            i_partition = D_16x8;
-                            i_cost = analysis.i_cost16x8bi;
-                            i_type = analysis.i_mb_type16x8;
-                        }
+                        COPY3_IF_LT( i_cost, analysis.i_cost16x8bi,
+                                     i_type, analysis.i_mb_type16x8,
+                                     i_partition, D_16x8 );
                     }
                     if( h->mb.i_sub_partition[0] == h->mb.i_sub_partition[2] ||
                         h->mb.i_sub_partition[1] == h->mb.i_sub_partition[3] )
                     {
                         x264_mb_analyse_inter_b8x16( h, &analysis );
-                        if( analysis.i_cost8x16bi < i_cost )
-                        {
-                            i_partition = D_8x16;
-                            i_cost = analysis.i_cost8x16bi;
-                            i_type = analysis.i_mb_type8x16;
-                        }
+                        COPY3_IF_LT( i_cost, analysis.i_cost8x16bi,
+                                     i_type, analysis.i_mb_type8x16,
+                                     i_partition, D_8x16 );
                     }
                 }
             }
 
-            h->mb.i_partition = i_partition;
-
             if( analysis.b_mbrd )
             {
-                h->mb.i_type = i_type;
-                x264_mb_analyse_transform_rd( h, &analysis, &i_cost );
+                /* refine later */
             }
             /* refine qpel */
             else if( i_partition == D_16x16 )
@@ -2428,26 +2369,35 @@ void x264_macroblock_analyse( x264_t *h )
                 }
             }
 
-            /* best intra mode */
             x264_mb_analyse_intra( h, &analysis, i_cost );
 
-            if( analysis.i_sad_i16x16 < i_cost )
-            {
-                i_type = I_16x16;
-                i_cost = analysis.i_sad_i16x16;
-            }
-            if( analysis.i_sad_i8x8 < i_cost )
-            {
-                i_type = I_8x8;
-                i_cost = analysis.i_sad_i8x8;
-            }
-            if( analysis.i_sad_i4x4 < i_cost )
+            if( analysis.b_mbrd )
             {
-                i_type = I_4x4;
-                i_cost = analysis.i_sad_i4x4;
+                int i_satd_inter = i_cost;
+                x264_mb_analyse_b_rd( h, &analysis, i_satd_inter );
+                i_type = B_SKIP;
+                i_cost = i_bskip_cost;
+                i_partition = D_16x16;
+                COPY2_IF_LT( i_cost, analysis.l0.i_rd16x16, i_type, B_L0_L0 );
+                COPY2_IF_LT( i_cost, analysis.l1.i_rd16x16, i_type, B_L1_L1 );
+                COPY2_IF_LT( i_cost, analysis.i_rd16x16bi, i_type, B_BI_BI );
+                COPY2_IF_LT( i_cost, analysis.i_rd16x16direct, i_type, B_DIRECT );
+                COPY3_IF_LT( i_cost, analysis.i_rd16x8bi, i_type, analysis.i_mb_type16x8, i_partition, D_16x8 );
+                COPY3_IF_LT( i_cost, analysis.i_rd8x16bi, i_type, analysis.i_mb_type8x16, i_partition, D_8x16 );
+                COPY3_IF_LT( i_cost, analysis.i_rd8x8bi, i_type, B_8x8, i_partition, D_8x8 );
+
+                h->mb.i_type = i_type;
+                h->mb.i_partition = i_partition;
+                x264_mb_analyse_transform_rd( h, &analysis, &i_satd_inter, &i_cost );
+                x264_intra_rd( h, &analysis, i_satd_inter * 17/16 );
             }
 
+            COPY2_IF_LT( i_cost, analysis.i_satd_i16x16, i_type, I_16x16 );
+            COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, i_type, I_8x8 );
+            COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, i_type, I_4x4 );
+
             h->mb.i_type = i_type;
+            h->mb.i_partition = i_partition;
 
             if( h->param.analyse.b_bidir_me )
                 refine_bidir( h, &analysis );
@@ -2472,17 +2422,13 @@ static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a  )
     {
         case I_4x4:
             for( i = 0; i < 16; i++ )
-            {
-                h->mb.cache.intra4x4_pred_mode[x264_scan8[i]] =
-                    a->i_predict4x4[block_idx_x[i]][block_idx_y[i]];
-            }
+                h->mb.cache.intra4x4_pred_mode[x264_scan8[i]] = a->i_predict4x4[i];
 
             x264_mb_analyse_intra_chroma( h, a );
             break;
         case I_8x8:
             for( i = 0; i < 4; i++ )
-                x264_macroblock_cache_intra8x8_pred( h, 2*(i&1), 2*(i>>1),
-                    a->i_predict8x8[i&1][i>>1] );
+                x264_macroblock_cache_intra8x8_pred( h, 2*(i&1), 2*(i>>1), a->i_predict8x8[i] );
 
             x264_mb_analyse_intra_chroma( h, a );
             break;
index 817d32b641e9415119ed8f1c0f78b38200c68f48..e0e8e24d7620d49130a273980b3b5ebbe3b676b8 100644 (file)
@@ -44,21 +44,6 @@ static const int subpel_iterations[][4] =
 
 static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_iters, int *p_halfpel_thresh, int b_refine_qpel );
 
-#define COPY2_IF_LT(x,y,a,b)\
-if((y)<(x))\
-{\
-    (x)=(y);\
-    (a)=(b);\
-}
-
-#define COPY3_IF_LT(x,y,a,b,c,d)\
-if((y)<(x))\
-{\
-    (x)=(y);\
-    (a)=(b);\
-    (c)=(d);\
-}
-
 #define BITS_MVD( mx, my )\
     (p_cost_mvx[(mx)<<2] + p_cost_mvy[(my)<<2])
 
@@ -794,88 +779,119 @@ int x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_weight
     return bcost;
 }
 
-#define COST_MV_RD( mx, my, dir ) \
+#undef COST_MV_SATD /* discard any earlier COST_MV_SATD definition in this file (none is visible in this hunk) before redefining it */
+#define COST_MV_SATD( mx, my, dst ) /* dst = mbcmp score of the reference block at mv (mx,my) plus the mv cost; also lowers bsatd to dst when dst is smaller. Uses h, m, pix, bw, bh, i_pixel, p_cost_mvx, p_cost_mvy and bsatd from the enclosing scope. */ \
 { \
+    int stride = 16; /* passed by address to get_ref, then used as the stride of the returned block */ \
+    uint8_t *src = h->mc.get_ref( m->p_fref, m->i_stride[0], pix, &stride, mx, my, bw*4, bh*4 ); \
+    dst = h->pixf.mbcmp[i_pixel]( m->p_fenc[0], FENC_STRIDE, src, stride ) \
+        + p_cost_mvx[mx] + p_cost_mvy[my]; /* mx and my are used unparenthesized and more than once: pass simple expressions only */ \
+    COPY1_IF_LT( bsatd, dst ); /* track the best SATD seen so far; a no-op when dst is bsatd itself */ \
+}
+
+#define COST_MV_RD( mx, my, satd, dir ) /* RD-score candidate mv (mx,my) with x264_rd_cost_part() and keep it in bcost/bmx/bmy if cheaper, but only when it passes the SATD, direction and visited checks below. dir is 0..3 for diamond moves, -1 otherwise. */ \
+{ \
+    if( satd <= bsatd * SATD_THRESH /* early out: skip the RD evaluation when the candidate's SATD is more than 17/16 of the best SATD */ \
+        && (dir^1) != odir /* skip the candidate whose direction is odir with the low bit flipped (directions are paired 0/1 and 2/3) */ \
+        && (dir<0 || !p_visited[(mx)+(my)*16]) ) /* dir < 0 bypasses the visited map */ \
     { \
         int cost; \
         cache_mv[0] = cache_mv2[0] = mx; /* write the candidate mv into the mv cache before scoring */ \
         cache_mv[1] = cache_mv2[1] = my; \
         cost = x264_rd_cost_part( h, i_lambda2, i8, m->i_pixel ); \
-        if( cost < bcost ) \
-        {                  \
-            bcost = cost;  \
-            bmx = mx;      \
-            bmy = my;      \
-        } \
+        COPY3_IF_LT( bcost, cost, bmx, mx, bmy, my ); /* strict less-than: ties keep the earlier candidate */ \
         if(dir>=0) p_visited[(mx)+(my)*16] = 1; /* mark as visited only for directional (dir >= 0) candidates */ \
     } \
 }
 
+#define SATD_THRESH 17/16 /* deliberately unparenthesized: "bsatd * SATD_THRESH" expands to "bsatd * 17/16", an integer scale by 17/16; writing (17/16) would evaluate to 1 */
+
 void x264_me_refine_qpel_rd( x264_t *h, x264_me_t *m, int i_lambda2, int i8 )
 {
     // don't have to fill the whole mv cache rectangle
     static const int pixel_mv_offs[] = { 0, 4, 4*8, 0 };
     int16_t *cache_mv = h->mb.cache.mv[0][x264_scan8[i8*4]];
     int16_t *cache_mv2 = cache_mv + pixel_mv_offs[m->i_pixel];
+    const int16_t *p_cost_mvx, *p_cost_mvy;
     const int bw = x264_pixel_size[m->i_pixel].w>>2;
     const int bh = x264_pixel_size[m->i_pixel].h>>2;
+    const int i_pixel = m->i_pixel;
 
+    DECLARE_ALIGNED( uint8_t, pix[16*16], 16 );
     int bcost = m->i_pixel == PIXEL_16x16 ? m->cost : COST_MAX;
-    int bmx = m->mv[0]; 
+    int bmx = m->mv[0];
     int bmy = m->mv[1];
-    int omx, omy, i;
+    int pmx, pmy, omx, omy, i;
     int odir = -1, bdir;
+    unsigned bsatd, satds[4];
 
     int visited[16*13] = {0}; // only need 13x13, but 16 is more convenient
     int *p_visited = &visited[6+6*16];
 
-    if( m->i_pixel != PIXEL_16x16 )
-    {
-        COST_MV_RD( bmx, bmy, -1 );
+    if( m->i_pixel != PIXEL_16x16 && i8 != 0 )
         x264_mb_predict_mv( h, 0, i8*4, bw, m->mvp );
-    }
+    pmx = m->mvp[0];
+    pmy = m->mvp[1];
+    p_cost_mvx = m->p_cost_mv - pmx;
+    p_cost_mvy = m->p_cost_mv - pmy;
+    COST_MV_SATD( bmx, bmy, bsatd );
+    if( m->i_pixel != PIXEL_16x16 )
+        COST_MV_RD( bmx, bmy, 0, -1 );
 
     /* check the predicted mv */
-    if( bmx != m->mvp[0] || bmy != m->mvp[1] )
-        COST_MV_RD( m->mvp[0], m->mvp[1], -1 );
+    if( (bmx != pmx || bmy != pmy)
+        && pmx >= h->mb.mv_min_spel[0] && pmx <= h->mb.mv_max_spel[0]
+        && pmy >= h->mb.mv_min_spel[1] && pmy <= h->mb.mv_max_spel[1] )
+    {
+        int satd;
+        COST_MV_SATD( pmx, pmy, satd );
+        COST_MV_RD( pmx, pmy, satd, -1 );
+    }
 
     /* mark mv and mvp as visited */
     p_visited[0] = 1;
     p_visited -= bmx + bmy*16;
     {
-        int mx = bmx ^ m->mv[0] ^ m->mvp[0];
-        int my = bmy ^ m->mv[1] ^ m->mvp[1];
+        int mx = bmx ^ m->mv[0] ^ pmx;
+        int my = bmy ^ m->mv[1] ^ pmy;
         if( abs(mx-bmx) < 7 && abs(my-bmy) < 7 )
             p_visited[mx + my*16] = 1;
     }
 
-    /* hpel */  
+    /* hpel diamond */
     bdir = -1;
     for( i = 0; i < 2; i++ )
     {
          omx = bmx;
          omy = bmy;
          odir = bdir;
-         COST_MV_RD( omx, omy - 2, 0 );
-         COST_MV_RD( omx, omy + 2, 1 );
-         COST_MV_RD( omx - 2, omy, 2 );
-         COST_MV_RD( omx + 2, omy, 3 );
+         COST_MV_SATD( omx, omy - 2, satds[0] );
+         COST_MV_SATD( omx, omy + 2, satds[1] );
+         COST_MV_SATD( omx - 2, omy, satds[2] );
+         COST_MV_SATD( omx + 2, omy, satds[3] );
+         COST_MV_RD( omx, omy - 2, satds[0], 0 );
+         COST_MV_RD( omx, omy + 2, satds[1], 1 );
+         COST_MV_RD( omx - 2, omy, satds[2], 2 );
+         COST_MV_RD( omx + 2, omy, satds[3], 3 );
          if( bmx == omx && bmy == omy )
             break;
     }
-    
-    /* qpel */
+
+    /* qpel diamond */
     bdir = -1;
     for( i = 0; i < 2; i++ )
     {
          omx = bmx;
          omy = bmy;
          odir = bdir;
-         COST_MV_RD( omx, omy - 1, 0 );
-         COST_MV_RD( omx, omy + 1, 1 );
-         COST_MV_RD( omx - 1, omy, 2 );
-         COST_MV_RD( omx + 1, omy, 3 );
+         COST_MV_SATD( omx, omy - 1, satds[0] );
+         COST_MV_SATD( omx, omy + 1, satds[1] );
+         COST_MV_SATD( omx - 1, omy, satds[2] );
+         COST_MV_SATD( omx + 1, omy, satds[3] );
+         COST_MV_RD( omx, omy - 1, satds[0], 0 );
+         COST_MV_RD( omx, omy + 1, satds[1], 1 );
+         COST_MV_RD( omx - 1, omy, satds[2], 2 );
+         COST_MV_RD( omx + 1, omy, satds[3], 3 );
          if( bmx == omx && bmy == omy )
             break;
     }
@@ -885,6 +901,6 @@ void x264_me_refine_qpel_rd( x264_t *h, x264_me_t *m, int i_lambda2, int i8 )
     m->mv[1] = bmy;
 
     x264_macroblock_cache_mv ( h, 2*(i8&1), i8&2, bw, bh, 0, bmx, bmy );
-    x264_macroblock_cache_mvd( h, 2*(i8&1), i8&2, bw, bh, 0, bmx - m->mvp[0], bmy - m->mvp[1] );
+    x264_macroblock_cache_mvd( h, 2*(i8&1), i8&2, bw, bh, 0, bmx - pmx, bmy - pmy );
 }
 
index 8c640a9733118e8cba0ca82286c4512a186315ed..967787ff86a9213ed88b3fa450e50425a1504d86 100644 (file)
@@ -56,4 +56,23 @@ void x264_me_refine_qpel_rd( x264_t *h, x264_me_t *m, int i_lambda2, int i8 );
 int x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_weight );
 int x264_rd_cost_part( x264_t *h, int i_lambda2, int i8, int i_pixel );
 
+#define COPY1_IF_LT(x,y) /* x = y if y < x. Expands to a bare if statement that already ends in ';' (not wrapped in do/while), and evaluates y twice when the condition holds: avoid side effects in the arguments and unbraced if/else around the call. */\
+if((y)<(x))\
+    (x)=(y);
+
+#define COPY2_IF_LT(x,y,a,b) /* if y < x: x = y and a = b. Expands to a bare if block; b is evaluated only when the condition holds. */\
+if((y)<(x))\
+{\
+    (x)=(y);\
+    (a)=(b);\
+}
+
+#define COPY3_IF_LT(x,y,a,b,c,d) /* if y < x: x = y, a = b and c = d. Expands to a bare if block; b and d are evaluated only when the condition holds. */\
+if((y)<(x))\
+{\
+    (x)=(y);\
+    (a)=(b);\
+    (c)=(d);\
+}
+
 #endif