From 0f4c0eb836912fcbd2376c920a9dd7bf438f4e43 Mon Sep 17 00:00:00 2001
From: Loren Merritt <pengvado@videolan.org>
Date: Mon, 10 Apr 2006 17:56:02 +0000
Subject: [PATCH] more interleaved SAD. 1% faster umh, 6% faster esa.

git-svn-id: svn://svn.videolan.org/x264/trunk@491 df754926-b1dd-0310-bc7b-ec298dee348c
---
 encoder/me.c | 141 ++++++++++++++++++++++++++-------------------------
 1 file changed, 73 insertions(+), 68 deletions(-)

diff --git a/encoder/me.c b/encoder/me.c
index 479949b8..c4eddcf1 100644
--- a/encoder/me.c
+++ b/encoder/me.c
@@ -44,35 +44,6 @@ static const int subpel_iterations[][4] =
 
 static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_iters, int *p_halfpel_thresh, int b_refine_qpel );
 
-#define BITS_MVD( mx, my )\
-     (p_cost_mvx[(mx)<<2] + p_cost_mvy[(my)<<2])
-
-#define COST_MV( mx, my ) \
-{ \
-    int cost = h->pixf.sad[i_pixel]( m->p_fenc[0], FENC_STRIDE, \
-                   &p_fref[(my)*m->i_stride[0]+(mx)], m->i_stride[0] ) \
-             + BITS_MVD(mx,my); \
-    if( cost < bcost ) \
-    {                  \
-        bcost = cost;  \
-        bmx = mx;      \
-        bmy = my;      \
-    } \
-}
-
-#define COST_MV_PDE( mx, my ) \
-{ \
-    int cost = h->pixf.sad_pde[i_pixel]( m->p_fenc[0], FENC_STRIDE, \
-                   &p_fref[(my)*m->i_stride[0]+(mx)], m->i_stride[0], \
-                   bcost - p_cost_mvx[ (mx)<<2 ] - p_cost_mvy[ (my)<<2 ] ); \
-    if( cost < bcost - BITS_MVD(mx,my) ) \
-    { \
-        bcost = cost + BITS_MVD(mx,my); \
-        bmx = mx; \
-        bmy = my; \
-    } \
-}
-
 #define COPY2_IF_LT(x,y,a,b)\
 if((y)<(x))\
 {\
@@ -88,6 +59,17 @@ if((y)<(x))\
     (c)=(d);\
 }
 
+#define BITS_MVD( mx, my )\
+    (p_cost_mvx[(mx)<<2] + p_cost_mvy[(my)<<2])
+
+#define COST_MV( mx, my )\
+{\
+    int cost = h->pixf.sad[i_pixel]( m->p_fenc[0], FENC_STRIDE,\
+                   &p_fref[(my)*m->i_stride[0]+(mx)], m->i_stride[0] )\
+             + BITS_MVD(mx,my);\
+    COPY3_IF_LT( bcost, cost, bmx, mx, bmy, my );\
+}
+
 #define COST_MV_X3_DIR( m0x, m0y, m1x, m1y, m2x, m2y, costs )\
 {\
     uint8_t *pix_base = p_fref + bmx + bmy*m->i_stride[0];\
@@ -120,6 +102,24 @@ if((y)<(x))\
     COPY3_IF_LT( bcost, costs[3], bmx, omx+(m3x), bmy, omy+(m3y) );\
 }
 
+#define COST_MV_X4_ABS( m0x, m0y, m1x, m1y, m2x, m2y, m3x, m3y )\
+{\
+    h->pixf.sad_x4[i_pixel]( m->p_fenc[0],\
+        p_fref + (m0x) + (m0y)*m->i_stride[0],\
+        p_fref + (m1x) + (m1y)*m->i_stride[0],\
+        p_fref + (m2x) + (m2y)*m->i_stride[0],\
+        p_fref + (m3x) + (m3y)*m->i_stride[0],\
+        m->i_stride[0], costs );\
+    costs[0] += BITS_MVD( m0x, m0y );\
+    costs[1] += BITS_MVD( m1x, m1y );\
+    costs[2] += BITS_MVD( m2x, m2y );\
+    costs[3] += BITS_MVD( m3x, m3y );\
+    COPY3_IF_LT( bcost, costs[0], bmx, m0x, bmy, m0y );\
+    COPY3_IF_LT( bcost, costs[1], bmx, m1x, bmy, m1y );\
+    COPY3_IF_LT( bcost, costs[2], bmx, m2x, bmy, m2y );\
+    COPY3_IF_LT( bcost, costs[3], bmx, m3x, bmy, m3y );\
+}
+
 /*  1  */
 /* 101 */
 /*  1  */
@@ -129,23 +129,31 @@ if((y)<(x))\
     COST_MV_X4( 0,-1, 0,1, -1,0, 1,0 );\
 }
 
-#define CROSS( start, x_max, y_max ) \
-    { \
-        for( i = start; i < x_max; i+=2 ) \
-        { \
-            if( omx + i <= mv_x_max ) \
-                COST_MV( omx + i, omy ); \
-            if( omx - i >= mv_x_min ) \
-                COST_MV( omx - i, omy ); \
-        } \
-        for( i = start; i < y_max; i+=2 ) \
-        { \
-            if( omy + i <= mv_y_max ) \
-                COST_MV( omx, omy + i ); \
-            if( omy - i >= mv_y_min ) \
-                COST_MV( omx, omy - i ); \
-        } \
-    }
+#define CROSS( start, x_max, y_max )\
+{\
+    i = start;\
+    if( x_max <= X264_MIN(mv_x_max-omx, omx-mv_x_min) )\
+        for( ; i < x_max-2; i+=4 )\
+            COST_MV_X4( i,0, -i,0, i+2,0, -i-2,0 );\
+    for( ; i < x_max; i+=2 )\
+    {\
+        if( omx+i <= mv_x_max )\
+            COST_MV( omx+i, omy );\
+        if( omx-i >= mv_x_min )\
+            COST_MV( omx-i, omy );\
+    }\
+    i = start;\
+    if( y_max <= X264_MIN(mv_y_max-omy, omy-mv_y_min) )\
+        for( ; i < y_max-2; i+=4 )\
+            COST_MV_X4( 0,i, 0,-i, 0,i+2, 0,-i-2 );\
+    for( ; i < y_max; i+=2 )\
+    {\
+        if( omy+i <= mv_y_max )\
+            COST_MV( omx, omy+i );\
+        if( omy-i >= mv_y_min )\
+            COST_MV( omx, omy-i );\
+    }\
+}
 
 
 void x264_me_search_ref( x264_t *h, x264_me_t *m, int (*mvc)[2], int i_mvc, int *p_halfpel_thresh )
@@ -441,31 +449,28 @@ me_hex2:
             const int enc_dc = h->pixf.sad[i_pixel]( m->p_fenc[0], FENC_STRIDE, zero, 16 );
             const uint16_t *integral_base = &m->integral[ -1 - 1*stride ];
 
-            if( h->pixf.sad_pde[i_pixel] )
-            {
-                for( my = min_y; my <= max_y; my++ )
-                    for( mx = min_x; mx <= max_x; mx++ )
-                    {
-                        const uint16_t *integral = &integral_base[ mx + my * stride ];
-                        const uint16_t ref_dc = integral[  0 ] + integral[ dh + dw ]
-                                              - integral[ dw ] - integral[ dh ];
-                        const int bsad = bcost - BITS_MVD(mx,my);
-                        if( abs( ref_dc - enc_dc ) < bsad )
-                            COST_MV_PDE( mx, my );
-                    }
-            }
-            else
+            for( my = min_y; my <= max_y; my++ )
             {
-                for( my = min_y; my <= max_y; my++ )
-                    for( mx = min_x; mx <= max_x; mx++ )
+                int mvs[3], i_mvs=0;
+                for( mx = min_x; mx <= max_x; mx++ )
+                {
+                    const uint16_t *integral = &integral_base[ mx + my * stride ];
+                    const uint16_t ref_dc = integral[  0 ] + integral[ dh + dw ]
+                                          - integral[ dw ] - integral[ dh ];
+                    const int bsad = bcost - BITS_MVD(mx,my);
+                    if( abs( ref_dc - enc_dc ) < bsad )
                     {
-                        const uint16_t *integral = &integral_base[ mx + my * stride ];
-                        const uint16_t ref_dc = integral[  0 ] + integral[ dh + dw ]
-                                              - integral[ dw ] - integral[ dh ];
-                        const int bsad = bcost - BITS_MVD(mx,my);
-                        if( abs( ref_dc - enc_dc ) < bsad )
-                            COST_MV( mx, my );
+                        if( i_mvs == 3 )
+                        {
+                            COST_MV_X4_ABS( mvs[0],my, mvs[1],my, mvs[2],my, mx,my );
+                            i_mvs = 0;
+                        }
+                        else
+                            mvs[i_mvs++] = mx;
                     }
+                }
+                for( i=0; i<i_mvs; i++ )
+                    COST_MV( mvs[i], my );
             }
 #endif
         }
-- 
2.40.0