From: Loren Merritt
Date: Tue, 12 Sep 2006 22:18:29 +0000 (+0000)
Subject: faster ESA
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=f8652aab3dda281aa446ead0674d7e1f1c6d6e74;p=libx264

faster ESA

git-svn-id: svn://svn.videolan.org/x264/trunk@561 df754926-b1dd-0310-bc7b-ec298dee348c
---

diff --git a/common/frame.c b/common/frame.c
index b64cbb58..8afbc9a4 100644
--- a/common/frame.c
+++ b/common/frame.c
@@ -92,7 +92,7 @@ x264_frame_t *x264_frame_new( x264_t *h )
     if( h->param.analyse.i_me_method == X264_ME_ESA )
     {
         CHECKED_MALLOC( frame->buffer[11],
-                        frame->i_stride[0] * (frame->i_lines[0] + 64) * sizeof(uint16_t) );
+                        2 * frame->i_stride[0] * (frame->i_lines[0] + 64) * sizeof(uint16_t) );
         frame->integral = (uint16_t*)frame->buffer[11] + frame->i_stride[0] * 32 + 32;
     }
diff --git a/common/mc.c b/common/mc.c
index ed9a5cfd..68802690 100644
--- a/common/mc.c
+++ b/common/mc.c
@@ -417,27 +417,31 @@ void x264_frame_filter( int cpu, x264_frame_t *frame )
     }
 
     /* generate integral image:
-     * each entry in frame->integral is the sum of all luma samples above and
-     * to the left of its location (inclusive).
-     * this allows us to calculate the DC of any rectangle by looking only
-     * at the corner entries.
-     * individual entries will overflow 16 bits, but that's ok:
-     * we only need the differences between entries, and those will be correct
-     * as long as we don't try to evaluate a rectangle bigger than 16x16.
-     * likewise, we don't really have to init the edges to 0, leaving garbage
-     * there wouldn't affect the results.*/
+     * frame->integral contains 2 planes. in the upper plane, each element is
+     * the sum of an 8x8 pixel region with top-left corner on that point.
+     * in the lower plane, 4x4 sums (needed only with --analyse p4x4). */
 
     if( frame->integral )
     {
         memset( frame->integral - 32 * stride - 32, 0, stride * sizeof(uint16_t) );
-        for( y = -31; y < frame->i_lines[0] + 32; y++ )
+        for( y = -32; y < frame->i_lines[0] + 31; y++ )
         {
             uint8_t  *ref  = frame->plane[0] + y * stride - 32;
-            uint16_t *line = frame->integral + y * stride - 32;
+            uint16_t *line = frame->integral + (y+1) * stride - 31;
             uint16_t v = line[0] = 0;
-            for( x = 1; x < stride; x++ )
+            for( x = 0; x < stride-1; x++ )
                 line[x] = v += ref[x] + line[x-stride] - line[x-stride-1];
         }
+        for( y = -31; y < frame->i_lines[0] + 24; y++ )
+        {
+            uint16_t *line = frame->integral + y * stride - 31;
+            uint16_t *sum4 = line + frame->i_stride[0] * (frame->i_lines[0] + 64);
+            for( x = -31; x < stride - 40; x++, line++, sum4++ )
+            {
+                sum4[0] = line[4+4*stride] - line[4] - line[4*stride] + line[0];
+                line[0] += line[8+8*stride] - line[8] - line[8*stride];
+            }
+        }
     }
 }
diff --git a/encoder/me.c b/encoder/me.c
index e0e8e24d..d113e0c7 100644
--- a/encoder/me.c
+++ b/encoder/me.c
@@ -104,10 +104,10 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite
               p_fref + (m2x) + (m2y)*m->i_stride[0],\
               p_fref + (m3x) + (m3y)*m->i_stride[0],\
               m->i_stride[0], costs );\
-    costs[0] += BITS_MVD( m0x, m0y );\
-    costs[1] += BITS_MVD( m1x, m1y );\
-    costs[2] += BITS_MVD( m2x, m2y );\
-    costs[3] += BITS_MVD( m3x, m3y );\
+    costs[0] += p_cost_mvx[m0x<<2]; /* no cost_mvy */\
+    costs[1] += p_cost_mvx[m1x<<2];\
+    costs[2] += p_cost_mvx[m2x<<2];\
+    costs[3] += p_cost_mvx[m3x<<2];\
     COPY3_IF_LT( bcost, costs[0], bmx, m0x, bmy, m0y );\
     COPY3_IF_LT( bcost, costs[1], bmx, m1x, bmy, m1y );\
     COPY3_IF_LT( bcost, costs[2], bmx, m2x, bmy, m2y );\
@@ -462,35 +462,64 @@ me_hex2:
         /* successive elimination by comparing DC before a full SAD,
          * because sum(abs(diff)) >= abs(diff(sum)). */
         const int stride = m->i_stride[0];
-        const int dw = x264_pixel_size[i_pixel].w;
-        const int dh = x264_pixel_size[i_pixel].h * stride;
+        const uint16_t *integral_base = m->integral;
         static uint8_t zero[16*16] = {0,};
-        const int enc_dc = h->pixf.sad[i_pixel]( m->p_fenc[0], FENC_STRIDE, zero, 16 );
-        const uint16_t *integral_base = &m->integral[ -1 - 1*stride ];
+        int enc_dc[4];
+        int sad_size = i_pixel <= PIXEL_8x8 ? PIXEL_8x8 : PIXEL_4x4;
+        int sad_w = x264_pixel_size[sad_size].w;
+        h->pixf.sad_x4[sad_size]( zero, m->p_fenc[0], m->p_fenc[0]+sad_w,
+            m->p_fenc[0]+sad_w*FENC_STRIDE, m->p_fenc[0]+sad_w+sad_w*FENC_STRIDE,
+            FENC_STRIDE, enc_dc );
+        if( sad_w == 4 )
+            integral_base += stride * (h->fenc->i_lines[0] + 64);
+
+#define ESA(ADS) \
+        for( my = min_y; my <= max_y; my++ )\
+        {\
+            int mvs[3], i_mvs=0;\
+            bcost -= p_cost_mvy[my<<2];\
+            for( mx = min_x; mx <= max_x; mx++ )\
+            {\
+                const uint16_t *integral = &integral_base[ mx + my * stride ];\
+                if( ADS < bcost - p_cost_mvx[mx<<2] )\
+                {\
+                    if( i_mvs == 3 )\
+                    {\
+                        COST_MV_X4_ABS( mvs[0],my, mvs[1],my, mvs[2],my, mx,my );\
+                        i_mvs = 0;\
+                    }\
+                    else\
+                        mvs[i_mvs++] = mx;\
+                }\
+            }\
+            bcost += p_cost_mvy[my<<2];\
+            for( i=0; i
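
For context on the common/mc.c change: the old frame->integral was a conventional summed-area table, combined at its four corners per lookup; the patched layout instead stores, at every pixel position, the already-collapsed 8x8 block sum (plus a second plane of 4x4 sums for p4x4 analysis), so the motion search reads a single value per candidate. The following is a minimal, hypothetical sketch of that relationship, not x264 code; the helper name build_sum8_plane and the scratch SAT buffer are assumptions made for illustration.

    #include <stdint.h>

    /* Illustrative helper (not part of x264): derive the per-position 8x8
     * block sums -- the layout of the patched frame->integral upper plane --
     * from a conventional summed-area table.  16-bit wraparound in the SAT
     * is harmless because an 8x8 sum of 8-bit samples fits in 16 bits, so
     * the truncated four-corner difference is exact. */
    static void build_sum8_plane( const uint8_t *pix, int stride, int w, int h,
                                  uint16_t *sat, uint16_t *sum8 )
    {
        /* sat[y*stride+x] = sum of pix over rows 0..y, columns 0..x */
        for( int y = 0; y < h; y++ )
        {
            uint16_t row = 0;
            for( int x = 0; x < w; x++ )
            {
                row += pix[y*stride + x];
                sat[y*stride + x] = row + (y ? sat[(y-1)*stride + x] : 0);
            }
        }
        /* 8x8 sum with top-left corner at (x,y): four SAT lookups collapse
         * into the single value the search will read per candidate. */
        for( int y = 0; y + 8 <= h; y++ )
            for( int x = 0; x + 8 <= w; x++ )
                sum8[y*stride + x] = sat[(y+7)*stride + x+7]
                                   - (x ? sat[(y+7)*stride + x-1] : 0)
                                   - (y ? sat[(y-1)*stride + x+7] : 0)
                                   + (x && y ? sat[(y-1)*stride + x-1] : 0);
    }

The patched loop in x264_frame_filter reaches the same result incrementally: it builds one summed-area row at a time and immediately folds it into 8x8 (and 4x4) sums in place, avoiding a separate scratch plane.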
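The elimination test in encoder/me.c rests on the triangle inequality quoted in the comment: for any candidate block, sum|enc - ref| >= |sum(enc) - sum(ref)|, so the absolute difference of block sums (enc_dc from a SAD against a zero block, the reference sum from one integral-plane read) is a lower bound on the true SAD. A candidate whose bound plus MV cost already reaches bcost cannot win and is skipped. Below is a minimal sketch of just that test, with hypothetical names (enc_sum and ref_sum stand for the 8x8 or 4x4 block sums described above); it is an illustration, not the patch's macro.

    #include <stdlib.h>

    /* Successive-elimination filter: keep a candidate only if its SAD lower
     * bound could still beat the best cost found so far. */
    static inline int candidate_survives( int enc_sum, int ref_sum,
                                          int mv_cost, int bcost )
    {
        int lower_bound = abs( enc_sum - ref_sum );  /* <= true SAD */
        return lower_bound + mv_cost < bcost;
    }

In the patch itself, surviving mx values are buffered in mvs[] and flushed four at a time through COST_MV_X4_ABS, matching the sad_x4 primitive used earlier to fill enc_dc, so the remaining full SAD evaluations stay batched.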