From bc29c635327d79f6a5372df30477db28635e3846 Mon Sep 17 00:00:00 2001
From: Loren Merritt <pengvado@akuvian.org>
Date: Thu, 11 Dec 2008 19:47:17 +0000
Subject: [PATCH] faster ESA init

reduce memory if using ESA and not p4x4
---
 common/common.h      |  1 +
 common/frame.c       |  2 +-
 common/mc.c          | 71 +++++++++++++++++++++++++-------
 common/mc.h          |  6 +++
 common/x86/mc-a2.asm | 98 ++++++++++++++++++++++++++++++++++++++++++++
 common/x86/mc-c.c    | 16 ++++++++
 encoder/encoder.c    |  3 ++
 tools/checkasm.c     | 66 +++++++++++++++++++----------
 8 files changed, 227 insertions(+), 36 deletions(-)

diff --git a/common/common.h b/common/common.h
index f2a0c54a..1668a630 100644
--- a/common/common.h
+++ b/common/common.h
@@ -338,6 +338,7 @@ struct x264_t
         int i_max_ref1;
         int i_delay;        /* Number of frames buffered for B reordering */
         int b_have_lowres;  /* Whether 1/2 resolution luma planes are being used */
+        int b_have_sub8x8_esa;
     } frames;
 
     /* current frame being encoded */
diff --git a/common/frame.c b/common/frame.c
index 482992d2..021242f1 100644
--- a/common/frame.c
+++ b/common/frame.c
@@ -99,7 +99,7 @@ x264_frame_t *x264_frame_new( x264_t *h )
     if( h->param.analyse.i_me_method >= X264_ME_ESA )
     {
         CHECKED_MALLOC( frame->buffer[3],
-                        2 * frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv) * sizeof(uint16_t) );
+                        frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv) * sizeof(uint16_t) << h->frames.b_have_sub8x8_esa );
         frame->integral = (uint16_t*)frame->buffer[3] + frame->i_stride[0] * i_padv + PADH;
     }
 
diff --git a/common/mc.c b/common/mc.c
index fe37c470..7422ba4e 100644
--- a/common/mc.c
+++ b/common/mc.c
@@ -269,6 +269,42 @@ static void memzero_aligned( void * dst, int n )
     memset( dst, 0, n );
 }
 
+static void integral_init4h( uint16_t *sum, uint8_t *pix, int stride )
+{
+    int x, v = pix[0]+pix[1]+pix[2]+pix[3];
+    for( x=0; x<stride-4; x++ )
+    {
+        sum[x] = v + sum[x-stride];
+        v += pix[x+4] - pix[x];
+    }
+}
+
+static void integral_init8h( uint16_t *sum, uint8_t *pix, int stride )
+{
+    int x, v = pix[0]+pix[1]+pix[2]+pix[3]+pix[4]+pix[5]+pix[6]+pix[7];
+    for( x=0; x<stride-8; x++ )
+    {
+        sum[x] = v + sum[x-stride];
+        v += pix[x+8] - pix[x];
+    }
+}
+
+static void integral_init4v( uint16_t *sum8, uint16_t *sum4, int stride )
+{
+    int x;
+    for( x=0; x<stride-8; x++ )
+        sum4[x] = sum8[x+4*stride] - sum8[x];
+    for( x=0; x<stride-8; x++ )
+        sum8[x] = sum8[x+8*stride] + sum8[x+8*stride+4] - sum8[x] - sum8[x+4];
+}
+
+static void integral_init8v( uint16_t *sum8, int stride )
+{
+    int x;
+    for( x=0; x<stride-8; x++ )
+        sum8[x] = sum8[x+8*stride] - sum8[x];
+}
+
 void x264_frame_init_lowres( x264_t *h, x264_frame_t *frame )
 {
     uint8_t *src = frame->plane[0];
@@ -353,6 +389,11 @@ void x264_mc_init( int cpu, x264_mc_functions_t *pf )
     pf->memzero_aligned = memzero_aligned;
     pf->frame_init_lowres_core = frame_init_lowres_core;
 
+    pf->integral_init4h = integral_init4h;
+    pf->integral_init8h = integral_init8h;
+    pf->integral_init4v = integral_init4v;
+    pf->integral_init8v = integral_init8v;
+
 #ifdef HAVE_MMX
     x264_mc_init_mmx( cpu, pf );
 #endif
@@ -370,7 +411,7 @@ void x264_frame_filter( x264_t *h, x264_frame_t *frame, int mb_y, int b_end )
     int start = (mb_y*16 >> b_interlaced) - 8; // buffer = 4 for deblock + 3 for 6tap, rounded to 8
     int height = ((b_end ? frame->i_lines[0] : mb_y*16) >> b_interlaced) + 8;
     int offs = start*stride - 8; // buffer = 3 for 6tap, aligned to 8 for simd
-    int x, y;
+    int y;
     if( mb_y & b_interlaced )
         return;
 
@@ -401,20 +442,22 @@ void x264_frame_filter( x264_t *h, x264_frame_t *frame, int mb_y, int b_end )
         height += PADV-8;
         for( y = start; y < height; y++ )
         {
-            uint8_t *ref = frame->plane[0] + y * stride - PADH;
-            uint16_t *line = frame->integral + (y+1) * stride - PADH + 1;
-            uint16_t v = line[0] = 0;
-            for( x = 1; x < stride-1; x++ )
-                line[x] = v += ref[x] + line[x-stride] - line[x-stride-1];
-            line -= 8*stride;
-            if( y >= 9-PADV )
+            uint8_t *pix = frame->plane[0] + y * stride - PADH;
+            uint16_t *sum8 = frame->integral + (y+1) * stride - PADH;
+            uint16_t *sum4;
+            if( h->frames.b_have_sub8x8_esa )
+            {
+                h->mc.integral_init4h( sum8, pix, stride );
+                sum8 -= 8*stride;
+                sum4 = sum8 + stride * (frame->i_lines[0] + PADV*2);
+                if( y >= 8-PADV )
+                    h->mc.integral_init4v( sum8, sum4, stride );
+            }
+            else
             {
-                uint16_t *sum4 = line + stride * (frame->i_lines[0] + PADV*2);
-                for( x = 1; x < stride-8; x++, line++, sum4++ )
-                {
-                    sum4[0] = line[4+4*stride] - line[4] - line[4*stride] + line[0];
-                    line[0] += line[8+8*stride] - line[8] - line[8*stride];
-                }
+                h->mc.integral_init8h( sum8, pix, stride );
+                if( y >= 8-PADV )
+                    h->mc.integral_init8v( sum8-8*stride, stride );
             }
         }
     }
diff --git a/common/mc.h b/common/mc.h
index 57c596cf..884d0165 100644
--- a/common/mc.h
+++ b/common/mc.h
@@ -66,6 +66,12 @@ typedef struct
     void *(*memcpy_aligned)( void *dst, const void *src, size_t n );
     void (*memzero_aligned)( void *dst, int n );
 
+    /* successive elimination prefilter */
+    void (*integral_init4h)( uint16_t *sum, uint8_t *pix, int stride );
+    void (*integral_init8h)( uint16_t *sum, uint8_t *pix, int stride );
+    void (*integral_init4v)( uint16_t *sum8, uint16_t *sum4, int stride );
+    void (*integral_init8v)( uint16_t *sum8, int stride );
+
     void (*frame_init_lowres_core)( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
                                     int src_stride, int dst_stride, int width, int height );
 } x264_mc_functions_t;
diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm
index fa3e3bd9..82daf2ce 100644
--- a/common/x86/mc-a2.asm
+++ b/common/x86/mc-a2.asm
@@ -694,6 +694,104 @@
 MEMZERO sse2
 
 
+;-----------------------------------------------------------------------------
+; void x264_integral_init4h_sse4( uint16_t *sum, uint8_t *pix, int stride )
+;-----------------------------------------------------------------------------
+cglobal x264_integral_init4h_sse4, 3,4
+    lea     r3, [r0+r2*2]
+    add     r1, r2
+    neg     r2
+    pxor    m4, m4
+.loop:
+    movdqa  m0, [r1+r2]
+    movdqu  m1, [r1+r2+8]
+    mpsadbw m0, m4, 0
+    mpsadbw m1, m4, 0
+    paddw   m0, [r0+r2*2]
+    paddw   m1, [r0+r2*2+16]
+    movdqa  [r3+r2*2   ], m0
+    movdqa  [r3+r2*2+16], m1
+    add     r2, 16
+    jl .loop
+    REP_RET
+
+cglobal x264_integral_init8h_sse4, 3,4
+    lea     r3, [r0+r2*2]
+    add     r1, r2
+    neg     r2
+    pxor    m4, m4
+.loop:
+    movdqa  m0, [r1+r2]
+    movdqu  m1, [r1+r2+8]
+    movdqa  m2, m0
+    movdqa  m3, m1
+    mpsadbw m0, m4, 0
+    mpsadbw m1, m4, 0
+    mpsadbw m2, m4, 4
+    mpsadbw m3, m4, 4
+    paddw   m0, [r0+r2*2]
+    paddw   m1, [r0+r2*2+16]
+    paddw   m0, m2
+    paddw   m1, m3
+    movdqa  [r3+r2*2   ], m0
+    movdqa  [r3+r2*2+16], m1
+    add     r2, 16
+    jl .loop
+    REP_RET
+
+%macro INTEGRAL_INIT 1
+;-----------------------------------------------------------------------------
+; void x264_integral_init4v_mmx( uint16_t *sum8, uint16_t *sum4, int stride )
+;-----------------------------------------------------------------------------
+cglobal x264_integral_init4v_%1, 3,5
+    shl     r2, 1
+    add     r0, r2
+    add     r1, r2
+    lea     r3, [r0+r2*4]
+    lea     r4, [r0+r2*8]
+    neg     r2
+.loop:
+    movu    m0, [r0+r2+8]
+    mova    m2, [r0+r2]
+    movu    m1, [r4+r2+8]
+    paddw   m0, m2
+    paddw   m1, [r4+r2]
+    mova    m3, [r3+r2]
+    psubw   m1, m0
+    psubw   m3, m2
+    mova    [r0+r2], m1
+    mova    [r1+r2], m3
+    add     r2, mmsize
+    jl .loop
+    REP_RET
+
+;-----------------------------------------------------------------------------
+; void x264_integral_init8v_mmx( uint16_t *sum8, int stride )
+;-----------------------------------------------------------------------------
+cglobal x264_integral_init8v_%1, 3,3
+    shl     r1, 1
+    add     r0, r1
+    lea     r2, [r0+r1*8]
+    neg     r1
+.loop:
+    mova    m0, [r2+r1]
+    mova    m1, [r2+r1+mmsize]
+    psubw   m0, [r0+r1]
+    psubw   m1, [r0+r1+mmsize]
+    mova    [r0+r1], m0
+    mova    [r0+r1+mmsize], m1
+    add     r1, 2*mmsize
+    jl .loop
+    REP_RET
+%endmacro
+
+INIT_MMX
+INTEGRAL_INIT mmx
+INIT_XMM
+INTEGRAL_INIT sse2
+
+
+
 %macro FILT8x4 7
     mova      %3, [r0+%7]
     mova      %4, [r0+r5+%7]
diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c
index 0ec7adef..56ca4c4e 100644
--- a/common/x86/mc-c.c
+++ b/common/x86/mc-c.c
@@ -64,6 +64,12 @@ extern void *x264_memcpy_aligned_mmx( void * dst, const void * src, size_t n );
 extern void *x264_memcpy_aligned_sse2( void * dst, const void * src, size_t n );
 extern void x264_memzero_aligned_mmx( void * dst, int n );
 extern void x264_memzero_aligned_sse2( void * dst, int n );
+extern void x264_integral_init4h_sse4( uint16_t *sum, uint8_t *pix, int stride );
+extern void x264_integral_init8h_sse4( uint16_t *sum, uint8_t *pix, int stride );
+extern void x264_integral_init4v_mmx( uint16_t *sum8, uint16_t *sum4, int stride );
+extern void x264_integral_init4v_sse2( uint16_t *sum8, uint16_t *sum4, int stride );
+extern void x264_integral_init8v_mmx( uint16_t *sum8, int stride );
+extern void x264_integral_init8v_sse2( uint16_t *sum8, int stride );
 #define LOWRES(cpu) \
 extern void x264_frame_init_lowres_core_##cpu( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,\
                                                int src_stride, int dst_stride, int width, int height );
@@ -242,6 +248,8 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
     pf->copy[PIXEL_4x4] = x264_mc_copy_w4_mmx;
     pf->memcpy_aligned = x264_memcpy_aligned_mmx;
     pf->memzero_aligned = x264_memzero_aligned_mmx;
+    pf->integral_init4v = x264_integral_init4v_mmx;
+    pf->integral_init8v = x264_integral_init8v_mmx;
 
     if( !(cpu&X264_CPU_MMXEXT) )
         return;
@@ -286,6 +294,8 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
 
     pf->memcpy_aligned = x264_memcpy_aligned_sse2;
     pf->memzero_aligned = x264_memzero_aligned_sse2;
+    pf->integral_init4v = x264_integral_init4v_sse2;
+    pf->integral_init8v = x264_integral_init8v_sse2;
     pf->hpel_filter = x264_hpel_filter_sse2_amd;
 
     if( cpu&X264_CPU_SSE2_IS_SLOW )
@@ -331,4 +341,10 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
     pf->hpel_filter = x264_hpel_filter_ssse3;
     pf->frame_init_lowres_core = x264_frame_init_lowres_core_ssse3;
     pf->mc_chroma = x264_mc_chroma_ssse3;
+
+    if( !(cpu&X264_CPU_SSE4) )
+        return;
+
+    pf->integral_init4h = x264_integral_init4h_sse4;
+    pf->integral_init8h = x264_integral_init8h_sse4;
 }
diff --git a/encoder/encoder.c b/encoder/encoder.c
index 73d33f4a..0a91134e 100644
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -713,6 +713,7 @@ x264_t *x264_encoder_open ( x264_param_t *param )
           || h->param.i_bframe_adaptive
           || h->param.b_pre_scenecut );
     h->frames.b_have_lowres |= (h->param.rc.b_stat_read && h->param.rc.i_vbv_buffer_size > 0);
+    h->frames.b_have_sub8x8_esa = !!(h->param.analyse.inter & X264_ANALYSE_PSUB8x8);
 
     h->frames.i_last_idr = - h->param.i_keyint_max;
     h->frames.i_input = 0;
@@ -839,6 +840,8 @@ int x264_encoder_reconfig( x264_t *h, x264_param_t *param )
     // can only twiddle these if they were enabled to begin with:
     if( h->param.analyse.i_me_method >= X264_ME_ESA || param->analyse.i_me_method < X264_ME_ESA )
         COPY( analyse.i_me_method );
+    if( h->param.analyse.i_me_method >= X264_ME_ESA && !h->frames.b_have_sub8x8_esa )
+        h->param.analyse.inter &= ~X264_ANALYSE_PSUB8x8;
     if( h->pps->b_transform_8x8_mode )
         COPY( analyse.b_transform_8x8 );
     if( h->frames.i_max_ref1 > 1 )
diff --git a/tools/checkasm.c b/tools/checkasm.c
index e810cdcc..1c173199 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -822,33 +822,57 @@ static int check_mc( int cpu_ref, int cpu_new )
         uint8_t *dstc[4] = { buf3, buf3+1024, buf3+2048, buf3+3072 };
         uint8_t *dsta[4] = { buf4, buf4+1024, buf4+2048, buf3+3072 };
         set_func_name( "lowres_init" );
+        ok = 1; used_asm = 1;
         for( w=40; w<=48; w+=8 )
-            if( mc_a.frame_init_lowres_core != mc_ref.frame_init_lowres_core )
+        {
+            int stride = (w+8)&~15;
+            call_c( mc_c.frame_init_lowres_core, buf1, dstc[0], dstc[1], dstc[2], dstc[3], w*2, stride, w, 16 );
+            call_a( mc_a.frame_init_lowres_core, buf1, dsta[0], dsta[1], dsta[2], dsta[3], w*2, stride, w, 16 );
+            for( i=0; i<16; i++)
             {
-                int stride = (w+8)&~15;
-                used_asm = 1;
-                call_c( mc_c.frame_init_lowres_core, buf1, dstc[0], dstc[1], dstc[2], dstc[3], w*2, stride, w, 16 );
-                call_a( mc_a.frame_init_lowres_core, buf1, dsta[0], dsta[1], dsta[2], dsta[3], w*2, stride, w, 16 );
-                for( i=0; i<16; i++)
-                {
-                    for( j=0; j<4; j++)
-                        if( memcmp( dstc[j]+i*stride, dsta[j]+i*stride, w ) )
-                        {
-                            ok = 0;
-                            fprintf( stderr, "frame_init_lowres differs at plane %d line %d\n", j, i );
-                            for( k=0; k<w; k++ )
-                                printf( "%d ", dstc[j][k+i*stride] );
-                            printf("\n");
-                            for( k=0; k<w; k++ )
-                                printf( "%d ", dsta[j][k+i*stride] );
-                            printf("\n");
-                            break;
-                        }
-                }
+                for( j=0; j<4; j++)
+                    if( memcmp( dstc[j]+i*stride, dsta[j]+i*stride, w ) )
+                    {
+                        ok = 0;
+                        fprintf( stderr, "frame_init_lowres differs at plane %d line %d\n", j, i );
+                        for( k=0; k<w; k++ )
+                            printf( "%d ", dstc[j][k+i*stride] );
+                        printf("\n");
+                        for( k=0; k<w; k++ )
+                            printf( "%d ", dsta[j][k+i*stride] );
+                        printf("\n");
+                        break;
+                    }
             }
+        }
         report( "lowres init :" );
     }
 
+#define INTEGRAL_INIT( name, size, ... )\
+    if( mc_a.name != mc_ref.name )\
+    {\
+        int stride = 80;\
+        set_func_name( #name );\
+        used_asm = 1;\
+        memcpy( buf3, buf1, size*2*stride );\
+        memcpy( buf4, buf1, size*2*stride );\
+        uint16_t *sum = (uint16_t*)buf3;\
+        call_c1( mc_c.name, __VA_ARGS__ );\
+        sum = (uint16_t*)buf4;\
+        call_a1( mc_a.name, __VA_ARGS__ );\
+        if( memcmp( buf3, buf4, (stride-8)*2 )\
+            || (size>9 && memcmp( buf3+18*stride, buf4+18*stride, (stride-8)*2 )))\
+            ok = 0;\
+        call_c2( mc_c.name, __VA_ARGS__ );\
+        call_a2( mc_a.name, __VA_ARGS__ );\
+    }
+    ok = 1; used_asm = 0;
+    INTEGRAL_INIT( integral_init4h, 2, sum+stride, buf2, stride );
+    INTEGRAL_INIT( integral_init8h, 2, sum+stride, buf2, stride );
+    INTEGRAL_INIT( integral_init4v, 14, sum, sum+9*stride, stride );
+    INTEGRAL_INIT( integral_init8v, 9, sum, stride );
+    report( "integral init :" );
+
     return ret;
 }
 
-- 
2.40.0
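
The integral_init*h/*v pairs added above split the old single running-sum loop in
x264_frame_filter into two passes: the horizontal pass keeps a sliding 4- or 8-pixel row sum
and adds it to the row above, and the vertical pass subtracts rows 4 or 8 apart, leaving
plain 4x4 or 8x8 block sums for the ESA/successive-elimination prefilter, which uses them as
a cheap lower bound on SAD.  The 4x4 plane (and the doubled buffer in frame.c) is only needed
when PSUB8x8 partitions are analysed, which is what b_have_sub8x8_esa gates.  Below is a
minimal standalone sketch of the 8x8 path that reuses the scalar functions from this patch on
an unpadded toy plane and cross-checks one block by brute force; the plane size, stride and
main() harness are illustrative assumptions, not x264's PADH/PADV buffer layout.

/* toy_integral8.c: build 8x8 block sums with the patch's scalar passes */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define W 32   /* toy plane width == stride (no padding) */
#define H 24   /* toy plane height */

static void integral_init8h( uint16_t *sum, uint8_t *pix, int stride )
{
    /* v = sum of the 8 pixels starting at x; each output also adds the
     * value one row above, so columns accumulate downwards. */
    int x, v = pix[0]+pix[1]+pix[2]+pix[3]+pix[4]+pix[5]+pix[6]+pix[7];
    for( x=0; x<stride-8; x++ )
    {
        sum[x] = v + sum[x-stride];
        v += pix[x+8] - pix[x];
    }
}

static void integral_init8v( uint16_t *sum8, int stride )
{
    /* differencing rows 8 apart turns the accumulated values into plain 8x8 sums */
    int x;
    for( x=0; x<stride-8; x++ )
        sum8[x] = sum8[x+8*stride] - sum8[x];
}

int main( void )
{
    static uint8_t  pix[H*W];
    static uint16_t sum[(H+1)*W]; /* row 0 stays zero: it is "the row above" row 1 */
    int x, y, bx = 5, by = 3, ref = 0;

    for( y = 0; y < H; y++ )
        for( x = 0; x < W; x++ )
            pix[y*W+x] = (x*7 + y*13) & 63;

    memset( sum, 0, sizeof(sum) );
    /* horizontal pass: sum row y+1 holds 8-wide row sums of pixel rows 0..y, accumulated */
    for( y = 0; y < H; y++ )
        integral_init8h( sum + (y+1)*W, pix + y*W, W );
    /* vertical pass: sum row y becomes the 8x8 block sum whose top-left pixel row is y */
    for( y = 0; y+8 <= H; y++ )
        integral_init8v( sum + y*W, W );

    /* brute-force check of one block against the prefilter value */
    for( y = 0; y < 8; y++ )
        for( x = 0; x < 8; x++ )
            ref += pix[(by+y)*W + bx+x];
    printf( "8x8 sum at (%d,%d): integral=%d  brute force=%d\n", bx, by, sum[by*W+bx], ref );
    return 0;
}

Built with any C99 compiler, the two printed values should match for every block whose
top-left (bx,by) satisfies bx < W-8 and by <= H-8.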
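
The SSE4 kernels rely on MPSADBW with an all-zero second operand: each 4-byte sum of absolute
differences then degenerates into a plain sum of 4 consecutive pixels, and bit 2 of the
immediate (the 0 vs 4 in the asm above) shifts the source window by 4 bytes, so two MPSADBW
results added together give the 8-pixel sums of integral_init8h.  The following intrinsics
sketch shows just that trick on one 16-byte load producing 8 outputs; the test values and the
standalone harness are illustrative, and unlike the asm above it neither processes 16 outputs
per iteration nor adds in the row above.  Compile with -msse4.1 or equivalent.

#include <smmintrin.h> /* SSE4.1: _mm_mpsadbw_epu8 */
#include <stdint.h>
#include <stdio.h>

int main( void )
{
    uint8_t  pix[16];
    uint16_t out4[8], out8[8];
    int j;

    for( j = 0; j < 16; j++ )
        pix[j] = j + 1;

    __m128i src  = _mm_loadu_si128( (const __m128i*)pix );
    __m128i zero = _mm_setzero_si128();

    /* |pix - 0| summed over 4 bytes == pix[j]+pix[j+1]+pix[j+2]+pix[j+3] */
    __m128i sum4 = _mm_mpsadbw_epu8( src, zero, 0 );
    /* immediate 4 offsets the first operand by 4 bytes: pix[j+4]..pix[j+7] */
    __m128i sum8 = _mm_add_epi16( sum4, _mm_mpsadbw_epu8( src, zero, 4 ) );

    _mm_storeu_si128( (__m128i*)out4, sum4 );
    _mm_storeu_si128( (__m128i*)out8, sum8 );
    for( j = 0; j < 8; j++ )
        printf( "x=%d: sum4=%2d sum8=%2d\n", j, out4[j], out8[j] );
    return 0;
}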