From 8850b6faaf55b83ed3aa86ff9fcb5e35c439b236 Mon Sep 17 00:00:00 2001
From: Loren Merritt
Date: Tue, 12 Sep 2006 22:21:23 +0000
Subject: [PATCH] faster ESA

git-svn-id: svn://svn.videolan.org/x264/trunk@562 df754926-b1dd-0310-bc7b-ec298dee348c
---
 common/amd64/pixel-a.asm |  84 ++++++++++++++++++++++++++++++++
 common/i386/pixel-a.asm  | 100 +++++++++++++++++++++++++++++++++++++++
 common/i386/pixel.h      |   7 +++
 common/pixel.c           |  45 ++++++++++++++++++
 common/pixel.h           |   5 ++
 encoder/me.c             |  80 ++++++++++++------------------
 tools/checkasm.c         |  21 +++++++-
 7 files changed, 292 insertions(+), 50 deletions(-)

diff --git a/common/amd64/pixel-a.asm b/common/amd64/pixel-a.asm
index f954a325..705596b7 100644
--- a/common/amd64/pixel-a.asm
+++ b/common/amd64/pixel-a.asm
@@ -456,6 +456,10 @@ cglobal x264_intra_satd_x3_4x4_mmxext
 cglobal x264_intra_satd_x3_8x8c_mmxext
 cglobal x264_intra_satd_x3_16x16_mmxext
 
+cglobal x264_pixel_ads4_mmxext
+cglobal x264_pixel_ads2_mmxext
+cglobal x264_pixel_ads1_mmxext
+
 
 %macro SAD_START 0
     pxor    mm0, mm0
@@ -1110,3 +1114,83 @@ x264_intra_satd_x3_8x8c_mmxext:
     movd    [parm3q+4], mm1 ; i8x8c_h satd
     movd    [parm3q+8], mm2 ; i8x8c_v satd
     ret
+
+
+
+;-----------------------------------------------------------------------------
+; void x264_pixel_ads4_mmxext( int enc_dc[4], uint16_t *sums, int delta,
+;                              uint16_t *res, int width )
+;-----------------------------------------------------------------------------
+ALIGN 16
+x264_pixel_ads4_mmxext:
+    movq    mm6, [parm1q]
+    movq    mm4, [parm1q+8]
+    pshufw  mm7, mm6, 0
+    pshufw  mm6, mm6, 0xAA
+    pshufw  mm5, mm4, 0
+    pshufw  mm4, mm4, 0xAA
+    shl     parm3q, 1
+.loop:
+    movq    mm0, [parm2q]
+    movq    mm1, [parm2q+16]
+    psubw   mm0, mm7
+    psubw   mm1, mm6
+    MMX_ABS mm0, mm2
+    MMX_ABS mm1, mm3
+    movq    mm2, [parm2q+parm3q]
+    movq    mm3, [parm2q+parm3q+16]
+    psubw   mm2, mm5
+    psubw   mm3, mm4
+    paddw   mm0, mm1
+    MMX_ABS mm2, mm1
+    MMX_ABS mm3, mm1
+    paddw   mm0, mm2
+    paddw   mm0, mm3
+    movq    [parm4q], mm0
+    add     parm2q, 8
+    add     parm4q, 8
+    sub     parm5d, 4
+    jg      .loop
+    nop
+    ret
+
+ALIGN 16
+x264_pixel_ads2_mmxext:
+    movq    mm6, [parm1q]
+    pshufw  mm7, mm6, 0
+    pshufw  mm6, mm6, 0xAA
+    shl     parm3q, 1
+.loop:
+    movq    mm0, [parm2q]
+    movq    mm1, [parm2q+parm3q]
+    psubw   mm0, mm7
+    psubw   mm1, mm6
+    MMX_ABS mm0, mm2
+    MMX_ABS mm1, mm3
+    paddw   mm0, mm1
+    movq    [parm4q], mm0
+    add     parm2q, 8
+    add     parm4q, 8
+    sub     parm5d, 4
+    jg      .loop
+    nop
+    ret
+
+ALIGN 16
+x264_pixel_ads1_mmxext:
+    pshufw  mm7, [parm1q], 0
+.loop:
+    movq    mm0, [parm2q]
+    movq    mm1, [parm2q+8]
+    psubw   mm0, mm7
+    psubw   mm1, mm7
+    MMX_ABS mm0, mm2
+    MMX_ABS mm1, mm3
+    movq    [parm4q], mm0
+    movq    [parm4q+8], mm1
+    add     parm2q, 16
+    add     parm4q, 16
+    sub     parm5d, 8
+    jg      .loop
+    nop
+    ret
diff --git a/common/i386/pixel-a.asm b/common/i386/pixel-a.asm
index 66ee5cd0..dad09d99 100644
--- a/common/i386/pixel-a.asm
+++ b/common/i386/pixel-a.asm
@@ -492,6 +492,10 @@ cglobal x264_intra_sa8d_x3_8x8_core_mmxext
 
 cglobal x264_pixel_ssim_4x4x2_core_mmxext
 
+cglobal x264_pixel_ads4_mmxext
+cglobal x264_pixel_ads2_mmxext
+cglobal x264_pixel_ads1_mmxext
+
 
 %macro SAD_START 0
     push    ebx
@@ -1635,3 +1639,99 @@ x264_pixel_ssim_4x4x2_core_mmxext:
     pop     ebx
     emms
     ret
+
+
+
+;-----------------------------------------------------------------------------
+; void x264_pixel_ads4_mmxext( int enc_dc[4], uint16_t *sums, int delta,
+;                              uint16_t *res, int width )
+;-----------------------------------------------------------------------------
+ALIGN 16
+x264_pixel_ads4_mmxext:
+    push    ebx
+    mov     eax, [esp+8]
+    movq    mm6, [eax]
+    movq    mm4, [eax+8]
+    pshufw  mm7, mm6, 0
+    pshufw  mm6, mm6, 0xAA
+    pshufw  mm5, mm4, 0
+    pshufw  mm4, mm4, 0xAA
+    mov     eax, [esp+12]
+    mov     ebx, [esp+16]
+    mov     ecx, [esp+20]
+    mov     edx, [esp+24]
+    shl     ebx, 1
+.loop:
+    movq    mm0, [eax]
+    movq    mm1, [eax+16]
+    psubw   mm0, mm7
+    psubw   mm1, mm6
+    MMX_ABS mm0, mm2
+    MMX_ABS mm1, mm3
+    movq    mm2, [eax+ebx]
+    movq    mm3, [eax+ebx+16]
+    psubw   mm2, mm5
+    psubw   mm3, mm4
+    paddw   mm0, mm1
+    MMX_ABS mm2, mm1
+    MMX_ABS mm3, mm1
+    paddw   mm0, mm2
+    paddw   mm0, mm3
+    movq    [ecx], mm0
+    add     eax, 8
+    add     ecx, 8
+    sub     edx, 4
+    jg      .loop
+    pop     ebx
+    ret
+
+ALIGN 16
+x264_pixel_ads2_mmxext:
+    push    ebx
+    mov     eax, [esp+8]
+    movq    mm6, [eax]
+    pshufw  mm7, mm6, 0
+    pshufw  mm6, mm6, 0xAA
+    mov     eax, [esp+12]
+    mov     ebx, [esp+16]
+    mov     ecx, [esp+20]
+    mov     edx, [esp+24]
+    shl     ebx, 1
+.loop:
+    movq    mm0, [eax]
+    movq    mm1, [eax+ebx]
+    psubw   mm0, mm7
+    psubw   mm1, mm6
+    MMX_ABS mm0, mm2
+    MMX_ABS mm1, mm3
+    paddw   mm0, mm1
+    movq    [ecx], mm0
+    add     eax, 8
+    add     ecx, 8
+    sub     edx, 4
+    jg      .loop
+    pop     ebx
+    ret
+
+ALIGN 16
+x264_pixel_ads1_mmxext:
+    mov     eax, [esp+4]
+    pshufw  mm7, [eax], 0
+    mov     eax, [esp+8]
+    mov     ecx, [esp+16]
+    mov     edx, [esp+20]
+.loop:
+    movq    mm0, [eax]
+    movq    mm1, [eax+8]
+    psubw   mm0, mm7
+    psubw   mm1, mm7
+    MMX_ABS mm0, mm2
+    MMX_ABS mm1, mm3
+    movq    [ecx], mm0
+    movq    [ecx+8], mm1
+    add     eax, 16
+    add     ecx, 16
+    sub     edx, 8
+    jg      .loop
+    nop
+    ret
diff --git a/common/i386/pixel.h b/common/i386/pixel.h
index f33b22d7..fb06cccf 100644
--- a/common/i386/pixel.h
+++ b/common/i386/pixel.h
@@ -104,4 +104,11 @@ void x264_pixel_ssim_4x4x2_core_sse2( const uint8_t *pix1, int stride1,
                                       const uint8_t *pix2, int stride2, int sums[2][4] );
 float x264_pixel_ssim_end4_sse2( int sum0[5][4], int sum1[5][4], int width );
 
+void x264_pixel_ads4_mmxext( int enc_dc[4], uint16_t *sums, int delta,
+                             uint16_t *res, int width );
+void x264_pixel_ads2_mmxext( int enc_dc[2], uint16_t *sums, int delta,
+                             uint16_t *res, int width );
+void x264_pixel_ads1_mmxext( int enc_dc[1], uint16_t *sums, int delta,
+                             uint16_t *res, int width );
+
 #endif
diff --git a/common/pixel.c b/common/pixel.c
index aabc69c3..5ab6f726 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -399,6 +399,38 @@ float x264_pixel_ssim_wxh( x264_pixel_function_t *pf,
 }
 
+/****************************************************************************
+ * successive elimination
+ ****************************************************************************/
+static void pixel_ads4( int enc_dc[4], uint16_t *sums, int delta,
+                        uint16_t *res, int width )
+{
+    int i;
+    for( i=0; i<width; i++, sums++ )
+        res[i] = abs( enc_dc[0] - sums[0] )
+               + abs( enc_dc[1] - sums[8] )
+               + abs( enc_dc[2] - sums[delta] )
+               + abs( enc_dc[3] - sums[delta+8] );
+}
+
+static void pixel_ads2( int enc_dc[2], uint16_t *sums, int delta,
+                        uint16_t *res, int width )
+{
+    int i;
+    for( i=0; i<width; i++, sums++ )
+        res[i] = abs( enc_dc[0] - sums[0] )
+               + abs( enc_dc[1] - sums[delta] );
+}
+
+static void pixel_ads1( int enc_dc[1], uint16_t *sums, int delta,
+                        uint16_t *res, int width )
+{
+    int i;
+    for( i=0; i<width; i++ )
+        res[i] = abs( enc_dc[0] - sums[i] );
+}
+
+
 /****************************************************************************
  * x264_pixel_init:
  ****************************************************************************/
@@ -434,6 +466,10 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
     pixf->ssim_4x4x2_core = ssim_4x4x2_core;
     pixf->ssim_end4 = ssim_end4;
 
+    pixf->ads[PIXEL_16x16] = pixel_ads4;
+    pixf->ads[PIXEL_16x8] = pixel_ads2;
+    pixf->ads[PIXEL_8x8] = pixel_ads1;
+
 #ifdef HAVE_MMXEXT
     if( cpu&X264_CPU_MMX )
     {
@@ -445,6 +481,10 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
     pixf->sad_pde[PIXEL_16x8 ] = x264_pixel_sad_pde_16x8_mmxext;
     pixf->sad_pde[PIXEL_8x16 ] = x264_pixel_sad_pde_8x16_mmxext;
 
+    pixf->ads[PIXEL_16x16] = x264_pixel_ads4_mmxext;
+    pixf->ads[PIXEL_16x8 ] = x264_pixel_ads2_mmxext;
+    pixf->ads[PIXEL_8x8 ] = x264_pixel_ads1_mmxext;
+
 #ifdef ARCH_X86
     pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_mmxext;
     pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_mmxext;
@@ -516,5 +556,10 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
     pixf->sad_x4[PIXEL_16x8] = x264_pixel_sad_x4_16x8_vis;
     pixf->sad_x4[PIXEL_16x16] = x264_pixel_sad_x4_16x16_vis;
 #endif
+
+    pixf->ads[PIXEL_8x16] =
+    pixf->ads[PIXEL_8x4] =
+    pixf->ads[PIXEL_4x8] = pixf->ads[PIXEL_16x8];
+    pixf->ads[PIXEL_4x4] = pixf->ads[PIXEL_8x8];
 }
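
Aside (not part of the patch): the C fallback above can be exercised on its own. The harness below is a sketch under assumed conditions -- the sums-plane layout, DELTA, and the sample values are all made up -- meant only to show how pixel_ads4 indexes sums[0], sums[8], sums[delta], and sums[delta+8] as the DC sums of the four 8x8 quadrants of a 16x16 block, one candidate position per array element.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Same body as the C fallback pixel_ads4 above. */
static void pixel_ads4( int enc_dc[4], uint16_t *sums, int delta,
                        uint16_t *res, int width )
{
    int i;
    for( i=0; i<width; i++, sums++ )
        res[i] = abs( enc_dc[0] - sums[0] )
               + abs( enc_dc[1] - sums[8] )
               + abs( enc_dc[2] - sums[delta] )
               + abs( enc_dc[3] - sums[delta+8] );
}

int main( void )
{
    /* Two rows of fake 8x8 DC sums, DELTA entries apart; in x264 the
     * real buffer comes from m->integral, and for 16x16 blocks delta
     * is scaled by the stride so sums[delta] is the sum 8 rows down. */
    enum { DELTA = 32, WIDTH = 16 };
    uint16_t sums[2*DELTA];
    uint16_t res[WIDTH];
    int enc_dc[4] = { 1000, 1100, 1200, 1300 };
    int i;

    for( i = 0; i < 2*DELTA; i++ )
        sums[i] = (uint16_t)(1000 + 3*i);

    pixel_ads4( enc_dc, sums, DELTA, res, WIDTH );
    for( i = 0; i < WIDTH; i++ )
        printf( "mx=%2d ads=%u\n", i, res[i] );
    return 0;
}

The lower the returned value, the more promising the candidate; the me.c loop below only spends a full SAD on positions whose bound beats the current best cost.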
diff --git a/common/pixel.h b/common/pixel.h
index d6b014cf..c1d4fca1 100644
--- a/common/pixel.h
+++ b/common/pixel.h
@@ -85,6 +85,11 @@ typedef struct
     x264_pixel_cmp_x3_t sad_x3[7];
     x264_pixel_cmp_x4_t sad_x4[7];
 
+    /* abs-diff-sum for successive elimination.
+     * may round width up to a multiple of 8. */
+    void (*ads[7])( int enc_dc[4], uint16_t *sums, int delta,
+                    uint16_t *res, int width );
+
     /* calculate satd of V, H, and DC modes.
      * may be NULL, in which case just use pred+satd instead. */
     void (*intra_satd_x3_16x16)( uint8_t *fenc, uint8_t *fdec, int res[3] );
diff --git a/encoder/me.c b/encoder/me.c
index d113e0c7..035a58bb 100644
--- a/encoder/me.c
+++ b/encoder/me.c
@@ -462,64 +462,46 @@ me_hex2:
         /* successive elimination by comparing DC before a full SAD,
          * because sum(abs(diff)) >= abs(diff(sum)). */
         const int stride = m->i_stride[0];
-        const uint16_t *integral_base = m->integral;
         static uint8_t zero[16*16] = {0,};
+        uint16_t *sums_base = m->integral;
         int enc_dc[4];
         int sad_size = i_pixel <= PIXEL_8x8 ? PIXEL_8x8 : PIXEL_4x4;
-        int sad_w = x264_pixel_size[sad_size].w;
-        h->pixf.sad_x4[sad_size]( zero, m->p_fenc[0], m->p_fenc[0]+sad_w,
-                m->p_fenc[0]+sad_w*FENC_STRIDE, m->p_fenc[0]+sad_w+sad_w*FENC_STRIDE,
+        int delta = x264_pixel_size[sad_size].w;
+        uint16_t *ads = alloca((max_x-min_x+8) * sizeof(uint16_t));
+
+        h->pixf.sad_x4[sad_size]( zero, m->p_fenc[0], m->p_fenc[0]+delta,
+                m->p_fenc[0]+delta*FENC_STRIDE, m->p_fenc[0]+delta+delta*FENC_STRIDE,
                 FENC_STRIDE, enc_dc );
-        if( sad_w == 4 )
-            integral_base += stride * (h->fenc->i_lines[0] + 64);
-
-#define ESA(ADS) \
-    for( my = min_y; my <= max_y; my++ )\
-    {\
-        int mvs[3], i_mvs=0;\
-        bcost -= p_cost_mvy[my<<2];\
-        for( mx = min_x; mx <= max_x; mx++ )\
-        {\
-            const uint16_t *integral = &integral_base[ mx + my * stride ];\
-            if( ADS < bcost - p_cost_mvx[mx<<2] )\
-            {\
-                if( i_mvs == 3 )\
-                {\
-                    COST_MV_X4_ABS( mvs[0],my, mvs[1],my, mvs[2],my, mx,my );\
-                    i_mvs = 0;\
-                }\
-                else\
-                    mvs[i_mvs++] = mx;\
-            }\
-        }\
-        bcost += p_cost_mvy[my<<2];\
-        for( i=0; i<i_mvs; i++ )\
-            COST_MV( mvs[i], my );\
-    }
-
+        if( delta == 4 )
+            sums_base += stride * (h->fenc->i_lines[0] + 64);
+        if( i_pixel == PIXEL_16x16 || i_pixel == PIXEL_8x16 || i_pixel == PIXEL_4x8 )
+            delta *= stride;
+        if( i_pixel == PIXEL_8x16 || i_pixel == PIXEL_4x8 )
+            enc_dc[1] = enc_dc[2];
 
-        if( i_pixel == PIXEL_16x16 )
-        {
-            ESA( abs( enc_dc[0] - integral[0] )
-               + abs( enc_dc[1] - integral[8] )
-               + abs( enc_dc[2] - integral[8*stride] )
-               + abs( enc_dc[3] - integral[8*stride+8] ) );
-        }
-        else if( i_pixel == PIXEL_8x8 || i_pixel == PIXEL_4x4 )
-        {
-            ESA( abs( enc_dc[0] - integral[0] ) );
-        }
-        else
+        for( my = min_y; my <= max_y; my++ )
         {
-            int dw = i_pixel < PIXEL_8x8 ? 8 : 4;
-            if( i_pixel == PIXEL_8x16 || i_pixel == PIXEL_4x8 )
+            int mvs[3], i_mvs=0;
+            bcost -= p_cost_mvy[my<<2];
+            h->pixf.ads[i_pixel]( enc_dc, sums_base + min_x + my * stride, delta,
+                                  ads, max_x-min_x+1 );
+            for( mx = min_x; mx <= max_x; mx++ )
             {
-                dw *= stride;
-                enc_dc[1] = enc_dc[2];
+                if( ads[mx-min_x] < bcost - p_cost_mvx[mx<<2] )
+                {
+                    if( i_mvs == 3 )
+                    {
+                        COST_MV_X4_ABS( mvs[0],my, mvs[1],my, mvs[2],my, mx,my );
+                        i_mvs = 0;
+                    }
+                    else
+                        mvs[i_mvs++] = mx;
+                }
             }
-            ESA( abs( enc_dc[0] - integral[0] )
-               + abs( enc_dc[1] - integral[dw] ) );
+            bcost += p_cost_mvy[my<<2];
+            for( i=0; i<i_mvs; i++ )
+                COST_MV( mvs[i], my );
         }
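
Aside (not part of the patch): the comment in me.c states the invariant that makes this pruning safe, sum(abs(diff)) >= abs(diff(sum)), which is the triangle inequality. A minimal self-contained check with made-up block data:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* DC sum of an n-pixel block. */
static int block_dc( const uint8_t *p, int n )
{
    int i, s = 0;
    for( i = 0; i < n; i++ )
        s += p[i];
    return s;
}

/* True sum of absolute differences. */
static int block_sad( const uint8_t *a, const uint8_t *b, int n )
{
    int i, s = 0;
    for( i = 0; i < n; i++ )
        s += abs( a[i] - b[i] );
    return s;
}

int main( void )
{
    /* Toy 4x4 blocks; the values are arbitrary. */
    uint8_t enc[16] = { 10, 20, 30, 40, 50, 60, 70, 80,
                        90, 80, 70, 60, 50, 40, 30, 20 };
    uint8_t ref[16] = { 12, 18, 33, 38, 55, 54, 77, 75,
                        85, 88, 66, 64, 45, 47, 28, 25 };
    /* |dc(enc) - dc(ref)| is what the ads kernels compute per candidate;
     * by the triangle inequality it can never exceed the true SAD. */
    int bound = abs( block_dc( enc, 16 ) - block_dc( ref, 16 ) );
    int sad   = block_sad( enc, ref, 16 );
    printf( "lower bound %d <= SAD %d\n", bound, sad );
    return 0;
}

Because the bound never exceeds the true SAD, skipping a candidate whose ads value already fails the bcost test can never discard the best match, so the speedup is lossless.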