From 8d09ebe2e862688ce213d3f098ce7eca719fea23 Mon Sep 17 00:00:00 2001
From: Loren Merritt <pengvado@videolan.org>
Date: Sun, 27 Jan 2008 11:36:11 +0000
Subject: [PATCH] satd exhaustive motion search (--me tesa)

git-svn-id: svn://svn.videolan.org/x264/trunk@728 df754926-b1dd-0310-bc7b-ec298dee348c
---
 common/frame.c    |   2 +-
 common/pixel.c    |  47 ++++++++++++++++++
 common/pixel.h    |   8 +++-
 encoder/analyse.c |   2 +-
 encoder/encoder.c |  16 +++++--
 encoder/me.c      | 118 +++++++++++++++++++++++++++++++++++++++-------
 x264.c            |   3 +-
 x264.h            |   3 +-
 8 files changed, 172 insertions(+), 27 deletions(-)

diff --git a/common/frame.c b/common/frame.c
index 570441bf..ce8af34d 100644
--- a/common/frame.c
+++ b/common/frame.c
@@ -94,7 +94,7 @@ x264_frame_t *x264_frame_new( x264_t *h )
         }
     }
 
-    if( h->param.analyse.i_me_method == X264_ME_ESA )
+    if( h->param.analyse.i_me_method >= X264_ME_ESA )
     {
         CHECKED_MALLOC( frame->buffer[7],
                         2 * frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv) * sizeof(uint16_t) );
diff --git a/common/pixel.c b/common/pixel.c
index 6bfc3a1c..046e01f3 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -323,6 +323,45 @@ SAD_X( 8x16_vis )
 SAD_X( 8x8_vis )
 #endif
 
+/****************************************************************************
+ * pixel_satd_x3 / pixel_satd_x4: score one fenc block against 3 or 4 candidates
+ * no faster than single satd, but needed for satd to be a drop-in replacement for sad
+ ****************************************************************************/
+
+#define SATD_X( size, cpu ) /* emits the _x3 and _x4 wrappers for one block size and function suffix */ \
+static void x264_pixel_satd_x3_##size##cpu( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, int i_stride, int scores[3] )\
+{\
+    scores[0] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix0, i_stride );\
+    scores[1] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix1, i_stride );\
+    scores[2] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix2, i_stride );\
+}\
+static void x264_pixel_satd_x4_##size##cpu( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, uint8_t *pix3, int i_stride, int scores[4] )\
+{\
+    scores[0] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix0, i_stride );\
+    scores[1] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix1, i_stride );\
+    scores[2] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix2, i_stride );\
+    scores[3] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix3, i_stride );\
+}
+#define SATD_X_DECL5( cpu ) /* wrappers for the five sizes 16x16 .. 8x4 */\
+SATD_X( 16x16, cpu )\
+SATD_X( 16x8, cpu )\
+SATD_X( 8x16, cpu )\
+SATD_X( 8x8, cpu )\
+SATD_X( 8x4, cpu )
+#define SATD_X_DECL7( cpu ) /* DECL5 plus the 4x8 and 4x4 sizes */\
+SATD_X_DECL5( cpu )\
+SATD_X( 4x8, cpu )\
+SATD_X( 4x4, cpu )
+
+SATD_X_DECL7() /* empty suffix: wraps the unsuffixed x264_pixel_satd_* functions */
+#ifdef HAVE_MMX
+SATD_X_DECL7( _mmxext )
+SATD_X_DECL5( _sse2 ) /* only five sizes are wrapped for this suffix */
+#ifdef HAVE_SSE3
+SATD_X_DECL5( _ssse3 ) /* only five sizes are wrapped for this suffix */
+#endif
+#endif
+
 /****************************************************************************
  * structural similarity metric
  ****************************************************************************/
@@ -487,6 +526,8 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
     INIT7( sad_x4, );
     INIT7( ssd, );
     INIT7( satd, );
+    INIT7( satd_x3, );
+    INIT7( satd_x4, );
     INIT4( sa8d, );
     INIT_ADS( );
 
@@ -505,6 +546,8 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
         INIT7( sad_x3, _mmxext );
         INIT7( sad_x4, _mmxext );
         INIT7( satd, _mmxext );
+        INIT7( satd_x3, _mmxext );
+        INIT7( satd_x4, _mmxext );
         INIT_ADS( _mmxext );
 
 #ifdef ARCH_X86
@@ -552,6 +595,8 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
         INIT2( sad_x3, _sse2 );
         INIT2( sad_x4, _sse2 );
         INIT5( satd, _sse2 );
+        INIT5( satd_x3, _sse2 );
+        INIT5( satd_x4, _sse2 );
         INIT_ADS( _sse2 );
 
 #ifdef ARCH_X86
@@ -588,6 +633,8 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
     if( cpu&X264_CPU_SSSE3 )
     {
         INIT5( satd, _ssse3 );
+        INIT5( satd_x3, _ssse3 );
+        INIT5( satd_x4, _ssse3 );
         INIT_ADS( _ssse3 );
 #ifdef ARCH_X86_64
         pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3;
diff --git a/common/pixel.h b/common/pixel.h
index 1a8d0ac2..fb5f99ec 100644
--- a/common/pixel.h
+++ b/common/pixel.h
@@ -69,15 +69,19 @@ typedef struct
     x264_pixel_cmp_t ssim[7];
     x264_pixel_cmp_t sa8d[4];
     x264_pixel_cmp_t mbcmp[7]; /* either satd or sad for subpel refine and mode decision */
-    x264_pixel_cmp_t rdcmp[7]; /* either ssd or ssim for rate-distortion */
+    x264_pixel_cmp_t fpelcmp[7]; /* either satd or sad for fullpel motion search */
+    x264_pixel_cmp_x3_t fpelcmp_x3[7];
+    x264_pixel_cmp_x4_t fpelcmp_x4[7];
 
     void (*ssim_4x4x2_core)( const uint8_t *pix1, int stride1,
                              const uint8_t *pix2, int stride2, int sums[2][4] );
     float (*ssim_end4)( int sum0[5][4], int sum1[5][4], int width );
 
-    /* multiple parallel calls to sad. */
+    /* multiple parallel calls to cmp. */
     x264_pixel_cmp_x3_t sad_x3[7];
     x264_pixel_cmp_x4_t sad_x4[7];
+    x264_pixel_cmp_x3_t satd_x3[7];
+    x264_pixel_cmp_x4_t satd_x4[7];
 
     /* abs-diff-sum for successive elimination.
      * may round width up to a multiple of 16. */
diff --git a/encoder/analyse.c b/encoder/analyse.c
index 62971585..fbf2b92b 100644
--- a/encoder/analyse.c
+++ b/encoder/analyse.c
@@ -190,7 +190,7 @@ static void x264_mb_analyse_load_costs( x264_t *h, x264_mb_analysis_t *a )
     a->p_cost_mv = p_cost_mv[a->i_qp];
 
     /* FIXME is this useful for all me methods? */
-    if( h->param.analyse.i_me_method == X264_ME_ESA && !x264_cost_mv_fpel[a->i_qp][0] )
+    if( h->param.analyse.i_me_method >= X264_ME_ESA && !x264_cost_mv_fpel[a->i_qp][0] )
     {
         for( j=0; j<4; j++ )
         {
diff --git a/encoder/encoder.c b/encoder/encoder.c
index 1715c68f..b7baf938 100644
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -363,7 +363,7 @@ static int x264_validate_parameters( x264_t *h )
 
     if( h->param.b_interlaced )
     {
-        if( h->param.analyse.i_me_method == X264_ME_ESA )
+        if( h->param.analyse.i_me_method >= X264_ME_ESA )
         {
             x264_log( h, X264_LOG_WARNING, "interlace + me=esa is not implemented\n" );
             h->param.analyse.i_me_method = X264_ME_UMH;
@@ -449,12 +449,15 @@ static int x264_validate_parameters( x264_t *h )
         h->param.i_cqm_preset = X264_CQM_FLAT;
 
     if( h->param.analyse.i_me_method < X264_ME_DIA ||
-        h->param.analyse.i_me_method > X264_ME_ESA )
+        h->param.analyse.i_me_method > X264_ME_TESA )
         h->param.analyse.i_me_method = X264_ME_HEX;
     if( h->param.analyse.i_me_range < 4 )
         h->param.analyse.i_me_range = 4;
     if( h->param.analyse.i_me_range > 16 && h->param.analyse.i_me_method <= X264_ME_HEX )
         h->param.analyse.i_me_range = 16;
+    if( h->param.analyse.i_me_method == X264_ME_TESA &&
+        (h->mb.b_lossless || h->param.analyse.i_subpel_refine <= 1) )
+        h->param.analyse.i_me_method = X264_ME_ESA;
     h->param.analyse.i_subpel_refine = x264_clip3( h->param.analyse.i_subpel_refine, 1, 7 );
     h->param.analyse.b_bframe_rdo = h->param.analyse.b_bframe_rdo && h->param.analyse.i_subpel_refine >= 6;
     h->param.analyse.b_mixed_references = h->param.analyse.b_mixed_references && h->param.i_frame_reference > 1;
@@ -546,9 +549,12 @@ static int x264_validate_parameters( x264_t *h )
 
 static void mbcmp_init( x264_t *h )
 {
-    memcpy( h->pixf.mbcmp,
-            ( h->mb.b_lossless || h->param.analyse.i_subpel_refine <= 1 ) ? h->pixf.sad : h->pixf.satd,
-            sizeof(h->pixf.mbcmp) );
+    int satd = !h->mb.b_lossless && h->param.analyse.i_subpel_refine > 1; /* same condition as before, inverted: SATD unless lossless or subpel_refine <= 1 */
+    memcpy( h->pixf.mbcmp, satd ? h->pixf.satd : h->pixf.sad, sizeof(h->pixf.mbcmp) );
+    satd &= h->param.analyse.i_me_method == X264_ME_TESA; /* the fullpel tables below get SATD only when the method is also TESA */
+    memcpy( h->pixf.fpelcmp, satd ? h->pixf.satd : h->pixf.sad, sizeof(h->pixf.fpelcmp) );
+    memcpy( h->pixf.fpelcmp_x3, satd ? h->pixf.satd_x3 : h->pixf.sad_x3, sizeof(h->pixf.fpelcmp_x3) ); /* x3/x4 tables follow the same choice as fpelcmp */
+    memcpy( h->pixf.fpelcmp_x4, satd ? h->pixf.satd_x4 : h->pixf.sad_x4, sizeof(h->pixf.fpelcmp_x4) );
 }
 
 /****************************************************************************
diff --git a/encoder/me.c b/encoder/me.c
index 9706cd45..61c6db2a 100644
--- a/encoder/me.c
+++ b/encoder/me.c
@@ -54,7 +54,7 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite
 
 #define COST_MV( mx, my )\
 {\
-    int cost = h->pixf.sad[i_pixel]( m->p_fenc[0], FENC_STRIDE,\
+    int cost = h->pixf.fpelcmp[i_pixel]( m->p_fenc[0], FENC_STRIDE,\
                    &p_fref[(my)*m->i_stride[0]+(mx)], m->i_stride[0] )\
              + BITS_MVD(mx,my);\
     COPY3_IF_LT( bcost, cost, bmx, mx, bmy, my );\
@@ -64,7 +64,7 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite
 { \
     int stride = 16; \
     uint8_t *src = h->mc.get_ref( pix, &stride, m->p_fref, m->i_stride[0], mx, my, bw, bh ); \
-    int cost = h->pixf.sad[i_pixel]( m->p_fenc[0], FENC_STRIDE, src, stride ) \
+    int cost = h->pixf.fpelcmp[i_pixel]( m->p_fenc[0], FENC_STRIDE, src, stride ) \
              + p_cost_mvx[ mx ] + p_cost_mvy[ my ]; \
     COPY3_IF_LT( bpred_cost, cost, bpred_mx, mx, bpred_my, my ); \
 }
@@ -72,7 +72,7 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite
 #define COST_MV_X3_DIR( m0x, m0y, m1x, m1y, m2x, m2y, costs )\
 {\
     uint8_t *pix_base = p_fref + bmx + bmy*m->i_stride[0];\
-    h->pixf.sad_x3[i_pixel]( m->p_fenc[0],\
+    h->pixf.fpelcmp_x3[i_pixel]( m->p_fenc[0],\
         pix_base + (m0x) + (m0y)*m->i_stride[0],\
         pix_base + (m1x) + (m1y)*m->i_stride[0],\
         pix_base + (m2x) + (m2y)*m->i_stride[0],\
@@ -85,7 +85,7 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite
 #define COST_MV_X4( m0x, m0y, m1x, m1y, m2x, m2y, m3x, m3y )\
 {\
     uint8_t *pix_base = p_fref + omx + omy*m->i_stride[0];\
-    h->pixf.sad_x4[i_pixel]( m->p_fenc[0],\
+    h->pixf.fpelcmp_x4[i_pixel]( m->p_fenc[0],\
         pix_base + (m0x) + (m0y)*m->i_stride[0],\
         pix_base + (m1x) + (m1y)*m->i_stride[0],\
         pix_base + (m2x) + (m2y)*m->i_stride[0],\
@@ -103,7 +103,7 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite
 
 #define COST_MV_X3_ABS( m0x, m0y, m1x, m1y, m2x, m2y )\
 {\
-    h->pixf.sad_x3[i_pixel]( m->p_fenc[0],\
+    h->pixf.fpelcmp_x3[i_pixel]( m->p_fenc[0],\
         p_fref + (m0x) + (m0y)*m->i_stride[0],\
         p_fref + (m1x) + (m1y)*m->i_stride[0],\
         p_fref + (m2x) + (m2y)*m->i_stride[0],\
@@ -450,6 +450,7 @@ me_hex2:
         }
 
     case X264_ME_ESA:
+    case X264_ME_TESA:
         {
             const int min_x = X264_MAX( bmx - i_me_range, mv_x_min );
             const int min_y = X264_MAX( bmy - i_me_range, mv_y_min );
@@ -488,16 +489,101 @@ me_hex2:
             if( i_pixel == PIXEL_8x16 || i_pixel == PIXEL_4x8 )
                 enc_dc[1] = enc_dc[2];
 
-            for( my = min_y; my <= max_y; my++ )
+            if( h->mb.i_me_method == X264_ME_TESA )
+            {
+                // TESA: prune with an ADS threshold, then a SAD threshold, keep the best few SADs, then re-score those via COST_MV
+                typedef struct {
+                    int sad;
+                    int16_t mx, my;
+                } mvsad_t;
+                mvsad_t *mvsads = x264_malloc( width*(max_y-min_y+1)*sizeof(mvsad_t) ); // at most `width` candidates per row; may be NULL
+                int nmvsad = 0, limit;
+                int sad_thresh = i_me_range <= 16 ? 10 : i_me_range <= 24 ? 11 : 12; // in 1/8 units, see bsad*sad_thresh>>3 below
+                int bsad = h->pixf.sad[i_pixel]( m->p_fenc[0], FENC_STRIDE, p_fref+bmy*stride+bmx, stride )
+                         + BITS_MVD( bmx, bmy );
+                for( my = min_y; mvsads && my <= max_y; my++ ) // allocation failed: skip the scan, nmvsad stays 0 and the current best mv is kept
+                {
+                    int ycost = p_cost_mvy[my<<2];
+                    bsad -= ycost; // compare against per-row costs that exclude the vertical mv cost
+                    xn = h->pixf.ads[i_pixel]( enc_dc, sums_base + min_x + my * stride, delta,
+                                               cost_fpel_mvx+min_x, xs, width, bsad*17/16 );
+                    for( i=0; i<xn-2; i+=3 )
+                    {
+                        uint8_t *ref = p_fref+min_x+my*stride;
+                        int sads[3];
+                        h->pixf.sad_x3[i_pixel]( m->p_fenc[0], ref+xs[i], ref+xs[i+1], ref+xs[i+2], stride, sads );
+                        for( j=0; j<3; j++ )
+                        {
+                            int sad = sads[j] + cost_fpel_mvx[xs[i+j]];
+                            if( sad < bsad*sad_thresh>>3 )
+                            {
+                                COPY1_IF_LT( bsad, sad );
+                                mvsads[nmvsad].sad = sad + ycost; // stored with the vertical cost added back
+                                mvsads[nmvsad].mx = min_x+xs[i+j];
+                                mvsads[nmvsad].my = my;
+                                nmvsad++;
+                            }
+                        }
+                    }
+                    for( ; i<xn; i++ ) // leftover candidates that did not fill a group of three
+                    {
+                        int mx = min_x+xs[i];
+                        int sad = h->pixf.sad[i_pixel]( m->p_fenc[0], FENC_STRIDE, p_fref+mx+my*stride, stride )
+                                + cost_fpel_mvx[xs[i]];
+                        if( sad < bsad*sad_thresh>>3 )
+                        {
+                            COPY1_IF_LT( bsad, sad );
+                            mvsads[nmvsad].sad = sad + ycost;
+                            mvsads[nmvsad].mx = mx;
+                            mvsads[nmvsad].my = my;
+                            nmvsad++;
+                        }
+                    }
+                    bsad += ycost; // back to the full cost form, including the vertical mv cost
+                }
+
+                limit = i_me_range / 2; // number of candidates that get re-scored with COST_MV
+                if( nmvsad > limit*2 )
+                {
+                    // too many candidates: re-filter with the cutoff moved halfway from bsad*sad_thresh/8 toward the final bsad
+                    bsad = bsad*(sad_thresh+8)>>4;
+                    for( i=0; i<nmvsad && mvsads[i].sad <= bsad; i++ );
+                    for( j=i; j<nmvsad; j++ )
+                        if( mvsads[j].sad <= bsad )
+                            mvsads[i++] = mvsads[j];
+                    nmvsad = i;
+                }
+                if( nmvsad > limit )
+                {
+                    for( i=0; i<limit; i++ ) // partial selection sort: move the `limit` smallest SADs to the front
+                    {
+                        int bj = i;
+                        int bsad = mvsads[bj].sad;
+                        for( j=i+1; j<nmvsad; j++ )
+                            COPY2_IF_LT( bsad, mvsads[j].sad, bj, j );
+                        if( bj > i )
+                            XCHG( mvsad_t, mvsads[i], mvsads[bj] );
+                    }
+                    nmvsad = limit;
+                }
+                for( i=0; i<nmvsad; i++ )
+                    COST_MV( mvsads[i].mx, mvsads[i].my );
+                if( mvsads ) x264_free( mvsads ); // nothing to free when the allocation failed
+            }
+            else
             {
-                bcost -= p_cost_mvy[my<<2];
-                xn = h->pixf.ads[i_pixel]( enc_dc, sums_base + min_x + my * stride, delta,
-                                           cost_fpel_mvx+min_x, xs, width, bcost );
-                for( i=0; i<xn-2; i+=3 )
-                    COST_MV_X3_ABS( min_x+xs[i],my, min_x+xs[i+1],my, min_x+xs[i+2],my );
-                bcost += p_cost_mvy[my<<2];
-                for( ; i<xn; i++ )
-                    COST_MV( min_x+xs[i], my );
+                // just ADS and SAD
+                for( my = min_y; my <= max_y; my++ )
+                {
+                    bcost -= p_cost_mvy[my<<2];
+                    xn = h->pixf.ads[i_pixel]( enc_dc, sums_base + min_x + my * stride, delta,
+                                               cost_fpel_mvx+min_x, xs, width, bcost );
+                    for( i=0; i<xn-2; i+=3 )
+                        COST_MV_X3_ABS( min_x+xs[i],my, min_x+xs[i+1],my, min_x+xs[i+2],my );
+                    bcost += p_cost_mvy[my<<2];
+                    for( ; i<xn; i++ )
+                        COST_MV( min_x+xs[i], my );
+                }
             }
 
             if( xs != xs_buf )
@@ -553,7 +639,7 @@ void x264_me_refine_qpel( x264_t *h, x264_me_t *m )
 { \
     int stride = 16; \
     uint8_t *src = h->mc.get_ref( pix[0], &stride, m->p_fref, m->i_stride[0], mx, my, bw, bh ); \
-    int cost = h->pixf.sad[i_pixel]( m->p_fenc[0], FENC_STRIDE, src, stride ) \
+    int cost = h->pixf.fpelcmp[i_pixel]( m->p_fenc[0], FENC_STRIDE, src, stride ) \
              + p_cost_mvx[ mx ] + p_cost_mvy[ my ]; \
     COPY3_IF_LT( bcost, cost, bmx, mx, bmy, my ); \
 }
@@ -623,7 +709,7 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite
         src2 = h->mc.get_ref( pix[1], &stride, m->p_fref, m->i_stride[0], omx-2, omy, bw+4, bh );
         src1 = src0 + stride;
         src3 = src2 + 1;
-        h->pixf.sad_x4[i_pixel]( m->p_fenc[0], src0, src1, src2, src3, stride, costs );
+        h->pixf.fpelcmp_x4[i_pixel]( m->p_fenc[0], src0, src1, src2, src3, stride, costs );
         COPY2_IF_LT( bcost, costs[0] + p_cost_mvx[omx  ] + p_cost_mvy[omy-2], bmy, omy-2 );
         COPY2_IF_LT( bcost, costs[1] + p_cost_mvx[omx  ] + p_cost_mvy[omy+2], bmy, omy+2 );
         COPY3_IF_LT( bcost, costs[2] + p_cost_mvx[omx-2] + p_cost_mvy[omy  ], bmx, omx-2, bmy, omy );
diff --git a/x264.c b/x264.c
index 73877c9a..f68755d9 100644
--- a/x264.c
+++ b/x264.c
@@ -225,7 +225,8 @@ static void Help( x264_param_t *defaults, int b_longhelp )
     H1( "                                  - dia: diamond search, radius 1 (fast)\n"
         "                                  - hex: hexagonal search, radius 2\n"
         "                                  - umh: uneven multi-hexagon search\n"
-        "                                  - esa: exhaustive search (slow)\n" );
+        "                                  - esa: exhaustive search\n"
+        "                                  - tesa: hadamard exhaustive search (slow)\n" );
     else H0( "                                  - dia, hex, umh\n" );
     H0( "      --merange <integer>     Maximum motion vector search range [%d]\n", defaults->analyse.i_me_range );
     H1( "      --mvrange <integer>     Maximum motion vector length [-1 (auto)]\n" );
diff --git a/x264.h b/x264.h
index 6d2ee753..f4fd1d13 100644
--- a/x264.h
+++ b/x264.h
@@ -74,6 +74,7 @@ typedef struct x264_t x264_t;
 #define X264_ME_HEX                  1
 #define X264_ME_UMH                  2
 #define X264_ME_ESA                  3
+#define X264_ME_TESA                 4
 #define X264_CQM_FLAT                0
 #define X264_CQM_JVT                 1
 #define X264_CQM_CUSTOM              2
@@ -83,7 +84,7 @@ typedef struct x264_t x264_t;
 #define X264_RC_ABR                  2
 
 static const char * const x264_direct_pred_names[] = { "none", "spatial", "temporal", "auto", 0 };
-static const char * const x264_motion_est_names[] = { "dia", "hex", "umh", "esa", 0 };
+static const char * const x264_motion_est_names[] = { "dia", "hex", "umh", "esa", "tesa", 0 };
 static const char * const x264_overscan_names[] = { "undef", "show", "crop", 0 };
 static const char * const x264_vidformat_names[] = { "component", "pal", "ntsc", "secam", "mac", "undef", 0 };
 static const char * const x264_fullrange_names[] = { "off", "on", 0 };
-- 
2.40.0