From 48c2e935e3638a38c988b11204ff52a85bf48fc9 Mon Sep 17 00:00:00 2001
From: Loren Merritt <pengvado@videolan.org>
Date: Sat, 15 Oct 2005 00:27:17 +0000
Subject: [PATCH] Use SAD instead of SATD for halfpel motion search. Move
 multiref termination after halfpel search. Total: 3-7% speedup and +/-.02 dB.
 patch by Alex Wright.

git-svn-id: svn://svn.videolan.org/x264/trunk@329 df754926-b1dd-0310-bc7b-ec298dee348c
---
 encoder/analyse.c |  32 ++++++------
 encoder/me.c      | 130 +++++++++++++++++++++++++++-------------------
 2 files changed, 93 insertions(+), 69 deletions(-)

diff --git a/encoder/analyse.c b/encoder/analyse.c
index 6e4c3ab0..a2348ab6 100644
--- a/encoder/analyse.c
+++ b/encoder/analyse.c
@@ -674,8 +674,8 @@ static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
     x264_me_t m;
     int i_ref;
     int mvc[7][2], i_mvc;
-    int i_fullpel_thresh = INT_MAX;
-    int *p_fullpel_thresh = h->i_ref0>1 ? &i_fullpel_thresh : NULL;
+    int i_halfpel_thresh = INT_MAX;
+    int *p_halfpel_thresh = h->i_ref0>1 ? &i_halfpel_thresh : NULL;
 
     /* 16x16 Search on all ref frame */
     m.i_pixel = PIXEL_16x16;
@@ -686,7 +686,7 @@ static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
     for( i_ref = 0; i_ref < h->i_ref0; i_ref++ )
     {
         const int i_ref_cost = REF_COST( 0, i_ref );
-        i_fullpel_thresh -= i_ref_cost;
+        i_halfpel_thresh -= i_ref_cost;
         m.i_ref_cost = i_ref_cost;
         m.i_ref = i_ref;
 
@@ -694,10 +694,10 @@ static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
         LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, 0 );
         x264_mb_predict_mv_16x16( h, 0, i_ref, m.mvp );
         x264_mb_predict_mv_ref16x16( h, 0, i_ref, mvc, &i_mvc );
-        x264_me_search_ref( h, &m, mvc, i_mvc, p_fullpel_thresh );
+        x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );
 
         m.cost += i_ref_cost;
-        i_fullpel_thresh += i_ref_cost;
+        i_halfpel_thresh += i_ref_cost;
 
         if( m.cost < a->l0.me16x16.cost )
             a->l0.me16x16 = m;
@@ -726,8 +726,8 @@ static void x264_mb_analyse_inter_p8x8_mixed_ref( x264_t *h, x264_mb_analysis_t
     x264_me_t m;
     int i_ref;
     uint8_t  **p_fenc = h->mb.pic.p_fenc;
-    int i_fullpel_thresh = INT_MAX;
-    int *p_fullpel_thresh = /*h->i_ref0>1 ? &i_fullpel_thresh : */NULL;
+    int i_halfpel_thresh = INT_MAX;
+    int *p_halfpel_thresh = /*h->i_ref0>1 ? &i_halfpel_thresh : */NULL;
     int i;
     int i_maxref = h->i_ref0-1;
 
@@ -767,17 +767,17 @@ static void x264_mb_analyse_inter_p8x8_mixed_ref( x264_t *h, x264_mb_analysis_t
         for( i_ref = 0; i_ref <= i_maxref; i_ref++ )
         {
              const int i_ref_cost = REF_COST( 0, i_ref );
-             i_fullpel_thresh -= i_ref_cost;
+             i_halfpel_thresh -= i_ref_cost;
              m.i_ref_cost = i_ref_cost;
              m.i_ref = i_ref;
 
              LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 8*x8, 8*y8 );
              x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, i_ref );
              x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
-             x264_me_search_ref( h, &m, a->l0.mvc[i_ref], i+1, p_fullpel_thresh );
+             x264_me_search_ref( h, &m, a->l0.mvc[i_ref], i+1, p_halfpel_thresh );
 
              m.cost += i_ref_cost;
-             i_fullpel_thresh += i_ref_cost;
+             i_halfpel_thresh += i_ref_cost;
              *(uint64_t*)a->l0.mvc[i_ref][i+1] = *(uint64_t*)m.mv;
 
              if( m.cost < l0m->cost )
@@ -1166,8 +1166,8 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
     x264_me_t m;
     int i_ref;
     int mvc[8][2], i_mvc;
-    int i_fullpel_thresh = INT_MAX;
-    int *p_fullpel_thresh = h->i_ref0>1 ? &i_fullpel_thresh : NULL;
+    int i_halfpel_thresh = INT_MAX;
+    int *p_halfpel_thresh = h->i_ref0>1 ? &i_halfpel_thresh : NULL;
 
     /* 16x16 Search on all ref frame */
     m.i_pixel = PIXEL_16x16;
@@ -1182,7 +1182,7 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
         LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, 0 );
         x264_mb_predict_mv_16x16( h, 0, i_ref, m.mvp );
         x264_mb_predict_mv_ref16x16( h, 0, i_ref, mvc, &i_mvc );
-        x264_me_search_ref( h, &m, mvc, i_mvc, p_fullpel_thresh );
+        x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );
 
         /* add ref cost */
         m.cost += REF_COST( 0, i_ref );
@@ -1201,8 +1201,8 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
     a->l0.me16x16.cost -= REF_COST( 0, a->l0.i_ref );
 
     /* ME for list 1 */
-    i_fullpel_thresh = INT_MAX;
-    p_fullpel_thresh = h->i_ref1>1 ? &i_fullpel_thresh : NULL;
+    i_halfpel_thresh = INT_MAX;
+    p_halfpel_thresh = h->i_ref1>1 ? &i_halfpel_thresh : NULL;
     a->l1.me16x16.cost = INT_MAX;
     for( i_ref = 0; i_ref < h->i_ref1; i_ref++ )
     {
@@ -1210,7 +1210,7 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
         LOAD_HPELS( &m, h->mb.pic.p_fref[1][i_ref], 0, 0 );
         x264_mb_predict_mv_16x16( h, 1, i_ref, m.mvp );
         x264_mb_predict_mv_ref16x16( h, 1, i_ref, mvc, &i_mvc );
-        x264_me_search_ref( h, &m, mvc, i_mvc, p_fullpel_thresh );
+        x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );
 
         /* add ref cost */
         m.cost += REF_COST( 1, i_ref );
diff --git a/encoder/me.c b/encoder/me.c
index b2717547..326ee721 100644
--- a/encoder/me.c
+++ b/encoder/me.c
@@ -36,13 +36,13 @@
 static const int subpel_iterations[][4] = 
    {{1,0,0,0},
     {1,1,0,0},
-    {1,2,0,0},
+    {0,1,1,0},
     {0,2,1,0},
     {0,2,1,1},
     {0,2,1,2},
     {0,0,2,3}};
 
-static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_iters );
+static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_iters, int *p_halfpel_thresh, int b_refine_qpel );
 
 #define COST_MV( mx, my ) \
 { \
@@ -58,11 +58,10 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite
     } \
 }
 
-void x264_me_search_ref( x264_t *h, x264_me_t *m, int (*mvc)[2], int i_mvc, int *p_fullpel_thresh )
+void x264_me_search_ref( x264_t *h, x264_me_t *m, int (*mvc)[2], int i_mvc, int *p_halfpel_thresh )
 {
     const int i_pixel = m->i_pixel;
     const int i_me_range = h->param.analyse.i_me_range;
-    const int b_chroma_me = h->mb.b_chroma_me && i_pixel <= PIXEL_8x8;
     int bmx, bmy, bcost;
     int omx, omy, pmx, pmy;
     uint8_t *p_fref = m->p_fref[0];
@@ -86,7 +85,7 @@ void x264_me_search_ref( x264_t *h, x264_me_t *m, int (*mvc)[2], int i_mvc, int
     bmx = pmx = x264_clip3( ( m->mvp[0] + 2 ) >> 2, mv_x_min, mv_x_max );
     bmy = pmy = x264_clip3( ( m->mvp[1] + 2 ) >> 2, mv_y_min, mv_y_max );
     bcost = COST_MAX;
-    COST_MV( bmx, bmy );
+    COST_MV( pmx, pmy );
     /* I don't know why this helps */
     bcost -= p_cost_mvx[ bmx<<2 ] + p_cost_mvy[ bmy<<2 ];
 
@@ -246,39 +245,16 @@ umh_small_hex:
 
     /* compute the real cost */
     m->cost_mv = p_cost_mvx[ m->mv[0] ] + p_cost_mvy[ m->mv[1] ];
-    m->cost = h->pixf.mbcmp[i_pixel]( m->p_fenc[0], m->i_stride[0],
-                    &p_fref[bmy * m->i_stride[0] + bmx], m->i_stride[0] )
-            + m->cost_mv;
-    if( b_chroma_me )
-    {
-        const int bw = x264_pixel_size[m->i_pixel].w;
-        const int bh = x264_pixel_size[m->i_pixel].h;
-        DECLARE_ALIGNED( uint8_t, pix[8*8*2], 16 );
-        h->mc.mc_chroma( m->p_fref[4], m->i_stride[1], pix, 8, m->mv[0], m->mv[1], bw/2, bh/2 );
-        h->mc.mc_chroma( m->p_fref[5], m->i_stride[1], pix+8*8, 8, m->mv[0], m->mv[1], bw/2, bh/2 );
-        m->cost += h->pixf.mbcmp[i_pixel+3]( m->p_fenc[1], m->i_stride[1], pix, 8 )
-                 + h->pixf.mbcmp[i_pixel+3]( m->p_fenc[2], m->i_stride[1], pix+8*8, 8 );
-    }
-
+    m->cost = bcost;
+    if( bmx == pmx && bmy == pmy )
+        m->cost += m->cost_mv;
+    
     /* subpel refine */
-    if( h->mb.i_subpel_refine >= 3 )
+    if( h->mb.i_subpel_refine >= 2 )
     {
-        int hpel, qpel;
-
-        /* early termination (when examining multiple reference frames)
-         * FIXME: this can update fullpel_thresh even if the match
-         *        ref is rejected after subpel refinement */
-        if( p_fullpel_thresh )
-        {
-            if( (m->cost*7)>>3 > *p_fullpel_thresh )
-                return;
-            else if( m->cost < *p_fullpel_thresh )
-                *p_fullpel_thresh = m->cost;
-        }
-
-        hpel = subpel_iterations[h->mb.i_subpel_refine][2];
-        qpel = subpel_iterations[h->mb.i_subpel_refine][3];
-        refine_subpel( h, m, hpel, qpel );
+        int hpel = subpel_iterations[h->mb.i_subpel_refine][2];
+        int qpel = subpel_iterations[h->mb.i_subpel_refine][3];
+        refine_subpel( h, m, hpel, qpel, p_halfpel_thresh, 0 );
     }
 }
 #undef COST_MV
@@ -291,10 +267,24 @@ void x264_me_refine_qpel( x264_t *h, x264_me_t *m )
     if( m->i_pixel <= PIXEL_8x8 && h->sh.i_type == SLICE_TYPE_P )
         m->cost -= m->i_ref_cost;
 	
-    refine_subpel( h, m, hpel, qpel );
+    refine_subpel( h, m, hpel, qpel, NULL, 1 );
 }
 
-#define COST_MV( mx, my ) \
+#define COST_MV_SAD( mx, my ) \
+{ /* Score candidate mv (mx,my): SAD of the fetched reference block vs. m->p_fenc[0], plus the p_cost_mvx/p_cost_mvy table entries. */ \
+    int stride = 16; /* handed to get_ref by address, then used as the stride of src in the SAD call */ \
+    uint8_t *src = h->mc.get_ref( m->p_fref, m->i_stride[0], pix, &stride, mx, my, bw, bh ); /* needs h, m, pix, bw, bh in the expanding scope */ \
+    int cost = h->pixf.sad[i_pixel]( m->p_fenc[0], m->i_stride[0], src, stride ) /* needs i_pixel in scope */ \
+             + p_cost_mvx[ mx ] + p_cost_mvy[ my ]; /* mx/my are expanded several times: pass side-effect-free expressions */ \
+    if( cost < bcost ) /* strict '<': on a tie the current best (bcost, bmx, bmy) is kept */ \
+    {                  \
+        bcost = cost;  \
+        bmx = mx;      \
+        bmy = my;      \
+    } \
+}
+
+#define COST_MV_SATD( mx, my ) \
 { \
     int stride = 16; \
     uint8_t *src = h->mc.get_ref( m->p_fref, m->i_stride[0], pix, &stride, mx, my, bw, bh ); \
@@ -318,7 +308,7 @@ void x264_me_refine_qpel( x264_t *h, x264_me_t *m )
     } \
 }
 
-static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_iters )
+static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_iters, int *p_halfpel_thresh, int b_refine_qpel )
 {
     const int bw = x264_pixel_size[m->i_pixel].w;
     const int bh = x264_pixel_size[m->i_pixel].h;
@@ -328,12 +318,14 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite
     const int b_chroma_me = h->mb.b_chroma_me && i_pixel <= PIXEL_8x8;
 
     DECLARE_ALIGNED( uint8_t, pix[16*16], 16 );
-    int step, i;
+    int omx, omy;
+    int i;
 
     int bmx = m->mv[0];
     int bmy = m->mv[1];
     int bcost = m->cost;
 
+
     /* try the subpel component of the predicted mv if it's close to
      * the result of the fullpel search */
     if( hpel_iters )
@@ -341,22 +333,54 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite
         int mx = X264_ABS(bmx - m->mvp[0]) < 4 ? m->mvp[0] : bmx;
         int my = X264_ABS(bmy - m->mvp[1]) < 4 ? m->mvp[1] : bmy;
         if( mx != bmx || my != bmy )
-            COST_MV( mx, my );
+            COST_MV_SAD( mx, my );
     }
-
-    for( step = 2; step >= 1; step-- )
+    
+    /* hpel search */
+    for( i = hpel_iters; i > 0; i-- )
+    {
+        omx = bmx;
+        omy = bmy;
+        COST_MV_SAD( omx, omy - 2 );
+        COST_MV_SAD( omx, omy + 2 );
+        COST_MV_SAD( omx - 2, omy );
+        COST_MV_SAD( omx + 2, omy );
+        if( bmx == omx && bmy == omy )
+            break;
+    }
+    
+    if( !b_refine_qpel )
     {
-	for( i = step>1 ? hpel_iters : qpel_iters; i > 0; i-- )
+        bcost = COST_MAX;
+        COST_MV_SATD( bmx, bmy );
+    }
+    
+    /* early termination when examining multiple reference frames */
+    if( p_halfpel_thresh )
+    {
+        if( (bcost*7)>>3 > *p_halfpel_thresh )
         {
-            int omx = bmx;
-            int omy = bmy;
-            COST_MV( omx, omy - step );
-            COST_MV( omx, omy + step );
-            COST_MV( omx - step, omy );
-            COST_MV( omx + step, omy );
-            if( bmx == omx && bmy == omy )
-                break;
-	}
+            m->cost = bcost;
+            m->mv[0] = bmx;
+            m->mv[1] = bmy;
+            // don't need cost_mv
+            return;
+        }
+        else if( bcost < *p_halfpel_thresh )
+            *p_halfpel_thresh = bcost;
+    }
+
+    /* qpel search */
+    for( i = qpel_iters; i > 0; i-- )
+    {
+        omx = bmx;
+        omy = bmy;
+        COST_MV_SATD( omx, omy - 1 );
+        COST_MV_SATD( omx, omy + 1 );
+        COST_MV_SATD( omx - 1, omy );
+        COST_MV_SATD( omx + 1, omy );
+        if( bmx == omx && bmy == omy )
+            break;
     }
 
     m->cost = bcost;
-- 
2.40.0