From: Alex Wright
Date: Sun, 19 Sep 2010 12:08:22 +0000 (-0700)
Subject: Chroma mode decision/subpel for B-frames
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=b6b8aea6baaac8284a61f5879ba94a26a3cd6156;p=libx264

Chroma mode decision/subpel for B-frames

Improves compression ~0.4-1%. Helps more on videos with lots of chroma detail.
Enabled at subme 9 (preset slower) and higher.
---

diff --git a/common/macroblock.c b/common/macroblock.c
index cc8f097e..b10cd246 100644
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -448,8 +448,9 @@ void x264_macroblock_thread_init( x264_t *h )
     h->mb.i_subpel_refine = h->param.analyse.i_subpel_refine;
     if( h->sh.i_type == SLICE_TYPE_B && (h->mb.i_subpel_refine == 6 || h->mb.i_subpel_refine == 8) )
         h->mb.i_subpel_refine--;
-    h->mb.b_chroma_me = h->param.analyse.b_chroma_me && h->sh.i_type == SLICE_TYPE_P
-                        && h->mb.i_subpel_refine >= 5;
+    h->mb.b_chroma_me = h->param.analyse.b_chroma_me &&
+                        ((h->sh.i_type == SLICE_TYPE_P && h->mb.i_subpel_refine >= 5) ||
+                         (h->sh.i_type == SLICE_TYPE_B && h->mb.i_subpel_refine >= 9));
     h->mb.b_dct_decimate = h->sh.i_type == SLICE_TYPE_B ||
                           (h->param.analyse.b_dct_decimate && h->sh.i_type != SLICE_TYPE_I);
 
diff --git a/encoder/analyse.c b/encoder/analyse.c
index 4b6a8a23..d7d0b635 100644
--- a/encoder/analyse.c
+++ b/encoder/analyse.c
@@ -1679,6 +1679,37 @@ static void x264_mb_analyse_inter_p4x8( x264_t *h, x264_mb_analysis_t *a, int i8
         a->l0.i_cost4x8[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x8 );
 }
 
+static ALWAYS_INLINE int x264_analyse_bi_chroma( x264_t *h, x264_mb_analysis_t *a, int idx, int i_pixel )
+{
+    ALIGNED_ARRAY_8( pixel, pix, [4],[8*8] );
+    ALIGNED_ARRAY_8( pixel, bi, [2],[8*8] );
+    int l0_mvy_offset, l1_mvy_offset;
+    int i_chroma_cost = 0;
+
+#define COST_BI_CHROMA( m0, m1, width, height ) \
+{ \
+    l0_mvy_offset = h->mb.b_interlaced & m0.i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0; \
+    l1_mvy_offset = h->mb.b_interlaced & m1.i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0; \
+    h->mc.mc_chroma( pix[0], pix[1], 8, m0.p_fref[4], m0.i_stride[1], m0.mv[0], m0.mv[1] + l0_mvy_offset, width, height ); \
+    h->mc.mc_chroma( pix[2], pix[3], 8, m1.p_fref[4], m1.i_stride[1], m1.mv[0], m1.mv[1] + l1_mvy_offset, width, height ); \
+    h->mc.avg[i_pixel+3]( bi[0], 8, pix[0], 8, pix[2], 8, h->mb.bipred_weight[m0.i_ref][m1.i_ref] ); \
+    h->mc.avg[i_pixel+3]( bi[1], 8, pix[1], 8, pix[3], 8, h->mb.bipred_weight[m0.i_ref][m1.i_ref] ); \
+    i_chroma_cost = h->pixf.mbcmp[i_pixel+3]( m0.p_fenc[1], FENC_STRIDE, bi[0], 8 ); \
+    i_chroma_cost += h->pixf.mbcmp[i_pixel+3]( m0.p_fenc[2], FENC_STRIDE, bi[1], 8 ); \
+}
+
+    if( i_pixel == PIXEL_16x16 )
+        COST_BI_CHROMA( a->l0.bi16x16, a->l1.bi16x16, 8, 8 )
+    else if( i_pixel == PIXEL_16x8 )
+        COST_BI_CHROMA( a->l0.me16x8[idx], a->l1.me16x8[idx], 8, 4 )
+    else if( i_pixel == PIXEL_8x16 )
+        COST_BI_CHROMA( a->l0.me8x16[idx], a->l1.me8x16[idx], 4, 8 )
+    else
+        COST_BI_CHROMA( a->l0.me8x8[idx], a->l1.me8x8[idx], 4, 4 )
+
+    return i_chroma_cost;
+}
+
 static void x264_mb_analyse_inter_direct( x264_t *h, x264_mb_analysis_t *a )
 {
     /* Assumes that fdec still contains the results of
@@ -1693,15 +1724,29 @@ static void x264_mb_analyse_inter_direct( x264_t *h, x264_mb_analysis_t *a )
         {
             const int x = (i&1)*8;
             const int y = (i>>1)*8;
-            a->i_cost16x16direct +=
-            a->i_cost8x8direct[i] =
-                h->pixf.mbcmp[PIXEL_8x8]( &p_fenc[x+y*FENC_STRIDE], FENC_STRIDE, &p_fdec[x+y*FDEC_STRIDE], FDEC_STRIDE );
+            a->i_cost8x8direct[i] = h->pixf.mbcmp[PIXEL_8x8]( &p_fenc[x+y*FENC_STRIDE], FENC_STRIDE,
+                                                              &p_fdec[x+y*FDEC_STRIDE], FDEC_STRIDE );
+            if( h->mb.b_chroma_me )
+            {
+                a->i_cost8x8direct[i] += h->pixf.mbcmp[PIXEL_4x4]( &h->mb.pic.p_fenc[1][(x>>1)+(y>>1)*FENC_STRIDE], FENC_STRIDE,
+                                                                   &h->mb.pic.p_fdec[1][(x>>1)+(y>>1)*FDEC_STRIDE], FDEC_STRIDE )
+                                       + h->pixf.mbcmp[PIXEL_4x4]( &h->mb.pic.p_fenc[2][(x>>1)+(y>>1)*FENC_STRIDE], FENC_STRIDE,
+                                                                   &h->mb.pic.p_fdec[2][(x>>1)+(y>>1)*FDEC_STRIDE], FDEC_STRIDE );
+            }
+            a->i_cost16x16direct += a->i_cost8x8direct[i];
 
             /* mb type cost */
            a->i_cost8x8direct[i] += a->i_lambda * i_sub_mb_b_cost_table[D_DIRECT_8x8];
         }
         else
+        {
             a->i_cost16x16direct += h->pixf.mbcmp[PIXEL_16x16]( p_fenc, FENC_STRIDE, p_fdec, FDEC_STRIDE );
+            if( h->mb.b_chroma_me )
+            {
+                a->i_cost16x16direct += h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE )
+                                      + h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE );
+            }
+        }
 }
 
 static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
@@ -1807,6 +1852,9 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
                      + a->l0.bi16x16.cost_mv + a->l1.bi16x16.cost_mv;
 
+    if( h->mb.b_chroma_me )
+        a->i_cost16x16bi += x264_analyse_bi_chroma( h, a, 0, PIXEL_16x16 );
+
     /* Always try the 0,0,0,0 vector; helps avoid errant motion vectors in fades */
     if( M32( a->l0.bi16x16.mv ) | M32( a->l1.bi16x16.mv ) )
     {
@@ -1819,6 +1867,39 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
                                h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
         int cost00 = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix0, 16 )
                    + ref_costs + l0_mv_cost + l1_mv_cost;
+
+        if( h->mb.b_chroma_me )
+        {
+            ALIGNED_ARRAY_8( pixel, pixuv, [2],[8*FENC_STRIDE] );
+            ALIGNED_ARRAY_8( pixel, bi, [8*FENC_STRIDE] );
+
+            if( h->mb.b_interlaced & a->l0.bi16x16.i_ref )
+            {
+                int l0_mvy_offset = h->mb.b_interlaced & a->l0.bi16x16.i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
+                h->mc.mc_chroma( pixuv[0], pixuv[0]+8, FENC_STRIDE, h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][4],
+                                 h->mb.pic.i_stride[1], 0, 0 + l0_mvy_offset, 8, 8 );
+            }
+            else
+                h->mc.load_deinterleave_8x8x2_fenc( pixuv[0], h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][4], h->mb.pic.i_stride[1] );
+
+            if( h->mb.b_interlaced & a->l1.bi16x16.i_ref )
+            {
+                int l1_mvy_offset = h->mb.b_interlaced & a->l1.bi16x16.i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
+                h->mc.mc_chroma( pixuv[1], pixuv[1]+8, FENC_STRIDE, h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][4],
+                                 h->mb.pic.i_stride[1], 0, 0 + l1_mvy_offset, 8, 8 );
+            }
+            else
+                h->mc.load_deinterleave_8x8x2_fenc( pixuv[1], h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][4], h->mb.pic.i_stride[1] );
+
+            h->mc.avg[PIXEL_8x8]( bi, FENC_STRIDE, pixuv[0], FENC_STRIDE, pixuv[1], FENC_STRIDE,
+                                  h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
+            h->mc.avg[PIXEL_8x8]( bi+8, FENC_STRIDE, pixuv[0]+8, FENC_STRIDE, pixuv[1]+8, FENC_STRIDE,
+                                  h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
+
+            cost00 += h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fenc[1], FENC_STRIDE, bi, FENC_STRIDE )
+                    + h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fenc[2], FENC_STRIDE, bi+8, FENC_STRIDE );
+        }
+
         if( cost00 < a->i_cost16x16bi )
         {
             M32( a->l0.bi16x16.mv ) = 0;
@@ -2017,6 +2098,13 @@ static void x264_mb_analyse_inter_b8x8_mixed_ref( x264_t *h, x264_mb_analysis_t
                              + a->l0.me8x8[i].i_ref_cost + a->l1.me8x8[i].i_ref_cost
                              + a->i_lambda * i_sub_mb_b_cost_table[D_BI_8x8];
 
+            if( h->mb.b_chroma_me )
+            {
+                int i_chroma_cost = x264_analyse_bi_chroma( h, a, i, PIXEL_8x8 );
+                i_part_cost_bi += i_chroma_cost;
+                a->i_satd8x8[2][i] += i_chroma_cost;
+            }
+
             a->l0.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
             a->l1.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
 
@@ -2090,6 +2178,13 @@ static void x264_mb_analyse_inter_b8x8( x264_t *h, x264_mb_analysis_t *a )
         a->l0.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
         a->l1.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
 
+        if( h->mb.b_chroma_me )
+        {
+            int i_chroma_cost = x264_analyse_bi_chroma( h, a, i, PIXEL_8x8 );
+            i_part_cost_bi += i_chroma_cost;
+            a->i_satd8x8[2][i] += i_chroma_cost;
+        }
+
         i_part_cost = a->l0.me8x8[i].cost;
         h->mb.i_sub_partition[i] = D_L0_8x8;
         COPY2_IF_LT( i_part_cost, a->l1.me8x8[i].cost, h->mb.i_sub_partition[i], D_L1_8x8 );
@@ -2162,6 +2257,9 @@ static void x264_mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a, int i
                          + a->l0.me16x8[i].cost_mv + a->l1.me16x8[i].cost_mv
                          + a->l0.me16x8[i].i_ref_cost + a->l1.me16x8[i].i_ref_cost;
 
+        if( h->mb.b_chroma_me )
+            i_part_cost_bi += x264_analyse_bi_chroma( h, a, i, PIXEL_16x8 );
+
         i_part_cost = a->l0.me16x8[i].cost;
         a->i_mb_partition16x8[i] = D_L0_8x8; /* not actually 8x8, only the L0 matters */
 
@@ -2252,6 +2350,9 @@ static void x264_mb_analyse_inter_b8x16( x264_t *h, x264_mb_analysis_t *a, int i
                          + a->l0.me8x16[i].cost_mv + a->l1.me8x16[i].cost_mv
                          + a->l0.me8x16[i].i_ref_cost + a->l1.me8x16[i].i_ref_cost;
 
+        if( h->mb.b_chroma_me )
+            i_part_cost_bi += x264_analyse_bi_chroma( h, a, i, PIXEL_8x16 );
+
         i_part_cost = a->l0.me8x16[i].cost;
         a->i_mb_partition8x16[i] = D_L0_8x8;
 
@@ -3249,7 +3350,16 @@ intra_analysis:
             h->mb.i_partition = i_partition;
         }
 
-        x264_mb_analyse_intra( h, &analysis, i_satd_inter );
+        if( h->mb.b_chroma_me )
+        {
+            x264_mb_analyse_intra_chroma( h, &analysis );
+            x264_mb_analyse_intra( h, &analysis, i_satd_inter - analysis.i_satd_i8x8chroma );
+            analysis.i_satd_i16x16 += analysis.i_satd_i8x8chroma;
+            analysis.i_satd_i8x8 += analysis.i_satd_i8x8chroma;
+            analysis.i_satd_i4x4 += analysis.i_satd_i8x8chroma;
+        }
+        else
+            x264_mb_analyse_intra( h, &analysis, i_satd_inter );
 
         if( analysis.i_mbrd )
         {
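
Note on the new cost computation: the x264_analyse_bi_chroma() helper added above scores a bi-predicted partition by motion-compensating the chroma planes from both reference lists (h->mc.mc_chroma), blending them with the bipred weights (h->mc.avg), and comparing the blend against the source chroma with the mbcmp metric. The standalone sketch below illustrates the same cost on plain byte arrays; the 8-bit samples, the plain SAD metric and the simple weight convention are simplifying assumptions for illustration, not the actual x264 implementation.

#include <stdint.h>
#include <stdlib.h>

/* Sketch of the bi-predictive chroma cost idea from this patch: blend the two
 * motion-compensated chroma predictions with weights summing to 64 (as in
 * H.264 weighted bi-prediction), then compare the blend against the source
 * chroma block.  x264 itself does this through its mc_chroma/avg/mbcmp
 * function pointers and per-reference-pair bipred weights. */
static int bi_chroma_cost( const uint8_t *fenc, int fenc_stride,
                           const uint8_t *pred_l0, const uint8_t *pred_l1,
                           int pred_stride, int width, int height, int weight_l0 )
{
    int weight_l1 = 64 - weight_l0;
    int cost = 0;
    for( int y = 0; y < height; y++ )
        for( int x = 0; x < width; x++ )
        {
            int bi = ( pred_l0[y*pred_stride+x] * weight_l0
                     + pred_l1[y*pred_stride+x] * weight_l1 + 32 ) >> 6;
            cost += abs( fenc[y*fenc_stride+x] - bi );  /* SAD; x264 normally uses SATD here */
        }
    return cost;
}

In the patch this cost is added to i_part_cost_bi for every bi-predicted partition size (16x16, 16x8, 8x16 and 8x8), and chroma distortion is likewise folded into the direct and intra candidates, so the B-frame mode decision weighs chroma alongside luma. As the commit message notes, the path is active when chroma ME is enabled (the default) and subpel refinement is at least 9, e.g. --subme 9 or --preset slower.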