From 4d84a45d7e505e4929a0110e047aa29a752e3253 Mon Sep 17 00:00:00 2001
From: Fiona Glaser <fiona@x264.com>
Date: Wed, 3 Sep 2008 15:32:16 -0700
Subject: [PATCH] Add merged SAD for i16x16 analysis

Roughly 30% faster i16x16 analysis under subme=1.

---
 common/pixel.c         |  4 +-
 common/pixel.h         | 12 +++---
 common/x86/pixel.h     | 23 ++++++-----
 common/x86/sad-a.asm   | 92 ++++++++++++++++++++++++++++++++++++++++++
 common/x86/x86util.asm | 17 ++++++++
 encoder/analyse.c      |  4 +-
 encoder/encoder.c      |  1 +
 tools/checkasm.c       | 18 +++++----
 8 files changed, 145 insertions(+), 26 deletions(-)

diff --git a/common/pixel.c b/common/pixel.c
index 3e28d857..86388fa0 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -619,6 +619,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
         }
 #endif
         pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_mmxext;
+        pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_mmxext;
         pixf->intra_satd_x3_8x8c  = x264_intra_satd_x3_8x8c_mmxext;
         pixf->intra_satd_x3_4x4   = x264_intra_satd_x3_4x4_mmxext;
     }
@@ -630,7 +631,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
         INIT2( sad_x4, _sse2 );
         INIT_ADS( _sse2 );
         pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_sse2;
-
+        pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_sse2;
 #ifdef ARCH_X86
         if( cpu&X264_CPU_CACHELINE_64 )
         {
@@ -673,6 +674,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
         pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3;
         pixf->sa8d[PIXEL_8x8]  = x264_pixel_sa8d_8x8_ssse3;
         pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_ssse3;
+        pixf->intra_sad_x3_16x16  = x264_intra_sad_x3_16x16_ssse3;
         pixf->intra_satd_x3_8x8c  = x264_intra_satd_x3_8x8c_ssse3;
         pixf->intra_satd_x3_4x4   = x264_intra_satd_x3_4x4_ssse3;
 #ifdef ARCH_X86_64
diff --git a/common/pixel.h b/common/pixel.h
index 127f89dd..0ed1ef4c 100644
--- a/common/pixel.h
+++ b/common/pixel.h
@@ -91,12 +91,14 @@ typedef struct
     int (*ads[7])( int enc_dc[4], uint16_t *sums, int delta,
                    uint16_t *cost_mvx, int16_t *mvs, int width, int thresh );
 
-    /* calculate satd of V, H, and DC modes.
+    /* calculate satd or sad of V, H, and DC modes.
      * may be NULL, in which case just use pred+satd instead. */
-    void (*intra_satd_x3_16x16)( uint8_t *fenc, uint8_t *fdec, int res[3] );
-    void (*intra_satd_x3_8x8c)( uint8_t *fenc, uint8_t *fdec, int res[3] );
-    void (*intra_satd_x3_4x4)( uint8_t *fenc, uint8_t *fdec, int res[3] );
-    void (*intra_sa8d_x3_8x8)( uint8_t *fenc, uint8_t edge[33], int res[3] );
+    void (*intra_mbcmp_x3_16x16)( uint8_t *fenc, uint8_t *fdec  , int res[3] );
+    void (*intra_satd_x3_16x16) ( uint8_t *fenc, uint8_t *fdec  , int res[3] );
+    void (*intra_sad_x3_16x16)  ( uint8_t *fenc, uint8_t *fdec  , int res[3] );
+    void (*intra_satd_x3_8x8c)  ( uint8_t *fenc, uint8_t *fdec  , int res[3] );
+    void (*intra_satd_x3_4x4)   ( uint8_t *fenc, uint8_t *fdec  , int res[3] );
+    void (*intra_sa8d_x3_8x8)   ( uint8_t *fenc, uint8_t edge[33], int res[3] );
 } x264_pixel_function_t;
 
 void x264_pixel_init( int cpu, x264_pixel_function_t *pixf );
diff --git a/common/x86/pixel.h b/common/x86/pixel.h
index cbb37dda..f0df2dd2 100644
--- a/common/x86/pixel.h
+++ b/common/x86/pixel.h
@@ -74,18 +74,21 @@ int x264_pixel_var_16x16_sse2  ( uint8_t *pix, int i_stride, uint32_t *sad );
 int x264_pixel_var_8x8_mmxext  ( uint8_t *pix, int i_stride, uint32_t *sad );
 int x264_pixel_var_8x8_sse2    ( uint8_t *pix, int i_stride, uint32_t *sad );
 
-void x264_intra_satd_x3_4x4_mmxext( uint8_t *, uint8_t *, int * );
-void x264_intra_satd_x3_4x4_ssse3( uint8_t *, uint8_t *, int * );
-void x264_intra_satd_x3_8x8c_mmxext( uint8_t *, uint8_t *, int * );
-void x264_intra_satd_x3_8x8c_ssse3( uint8_t *, uint8_t *, int * );
+void x264_intra_satd_x3_4x4_mmxext  ( uint8_t *, uint8_t *, int * );
+void x264_intra_satd_x3_4x4_ssse3   ( uint8_t *, uint8_t *, int * );
+void x264_intra_satd_x3_8x8c_mmxext ( uint8_t *, uint8_t *, int * );
+void x264_intra_satd_x3_8x8c_ssse3  ( uint8_t *, uint8_t *, int * );
 void x264_intra_satd_x3_16x16_mmxext( uint8_t *, uint8_t *, int * );
-void x264_intra_satd_x3_16x16_ssse3( uint8_t *, uint8_t *, int * );
-void x264_intra_sa8d_x3_8x8_mmxext( uint8_t *, uint8_t *, int * );
-void x264_intra_sa8d_x3_8x8_sse2( uint8_t *, uint8_t *, int * );
-void x264_intra_sa8d_x3_8x8_ssse3( uint8_t *, uint8_t *, int * );
+void x264_intra_satd_x3_16x16_ssse3 ( uint8_t *, uint8_t *, int * );
+void x264_intra_sad_x3_16x16_mmxext ( uint8_t *, uint8_t *, int * );
+void x264_intra_sad_x3_16x16_sse2   ( uint8_t *, uint8_t *, int * );
+void x264_intra_sad_x3_16x16_ssse3  ( uint8_t *, uint8_t *, int * );
+void x264_intra_sa8d_x3_8x8_mmxext  ( uint8_t *, uint8_t *, int * );
+void x264_intra_sa8d_x3_8x8_sse2    ( uint8_t *, uint8_t *, int * );
+void x264_intra_sa8d_x3_8x8_ssse3   ( uint8_t *, uint8_t *, int * );
 void x264_intra_sa8d_x3_8x8_core_mmxext( uint8_t *, int16_t [2][8], int * );
-void x264_intra_sa8d_x3_8x8_core_sse2( uint8_t *, int16_t [2][8], int * );
-void x264_intra_sa8d_x3_8x8_core_ssse3( uint8_t *, int16_t [2][8], int * );
+void x264_intra_sa8d_x3_8x8_core_sse2  ( uint8_t *, int16_t [2][8], int * );
+void x264_intra_sa8d_x3_8x8_core_ssse3 ( uint8_t *, int16_t [2][8], int * );
 
 void x264_pixel_ssim_4x4x2_core_mmxext( const uint8_t *pix1, int stride1,
                                         const uint8_t *pix2, int stride2, int sums[2][4] );
diff --git a/common/x86/sad-a.asm b/common/x86/sad-a.asm
index 2c167221..49db7892 100644
--- a/common/x86/sad-a.asm
+++ b/common/x86/sad-a.asm
@@ -23,8 +23,10 @@
 ;*****************************************************************************
 
 %include "x86inc.asm"
+%include "x86util.asm"
 
 SECTION_RODATA
+pb_3: times 16 db 3
 sw_64: dd 64
 
 SECTION .text
@@ -221,6 +223,96 @@ SAD_W16 sse2_aligned
 
 
 
+;-----------------------------------------------------------------------------
+; void intra_sad_x3_16x16 ( uint8_t *fenc, uint8_t *fdec, int res[3] );
+;-----------------------------------------------------------------------------
+
+;m7: DC prediction    m6: H prediction  m5: V prediction (MMX build: m1 = right half of V)
+;m4: DC pred score    m3: H pred score  m2: V pred score  -> stored as res[] = { V, H, DC }
+%macro INTRA_SAD16 1 ; %1 = function name suffix (mmxext, sse2 or ssse3)
+cglobal x264_intra_sad_x3_16x16_%1,3,5
+    pxor    mm0, mm0
+    pxor    mm1, mm1
+    psadbw  mm0, [r1-FDEC_STRIDE+0] ; SAD against zero = sum of the 8 bytes: top row, left half
+    psadbw  mm1, [r1-FDEC_STRIDE+8] ; top row, right half
+    paddw   mm0, mm1
+    movd    r3d, mm0                ; r3d = sum of the 16 pixels in the row above
+%ifidn %1, ssse3
+    mova  m1, [pb_3 GLOBAL]         ; pshufb mask for SPLATB: every lane selects source byte 3
+%endif
+%assign n 0
+%rep 16
+    movzx   r4d, byte [r1-1+FDEC_STRIDE*n]
+    add     r3d, r4d                ; accumulate the 16 pixels of the column to the left
+%assign n n+1
+%endrep
+    add     r3d, 16
+    shr     r3d, 5                  ; DC = (top sum + left sum + 16) >> 5
+    imul    r3d, 0x01010101         ; replicate the DC byte into all 4 bytes of the dword
+    movd    m7, r3d
+    mova    m5, [r1-FDEC_STRIDE]    ; V prediction = the row above the block
+%if mmsize==16
+    pshufd  m7, m7, 0               ; broadcast the DC dword to all 16 bytes
+%else
+    mova    m1, [r1-FDEC_STRIDE+8]  ; MMX: right 8 bytes of the V prediction
+    punpckldq m7, m7                ; broadcast the DC dword to all 8 bytes
+%endif
+    pxor    m4, m4
+    pxor    m3, m3
+    pxor    m2, m2
+    mov     r3d, 15*FENC_STRIDE     ; r3 = fenc offset of the last row; rows are walked bottom-up
+.vloop:
+    SPLATB  m6, r1+r3*2-1, m1       ; H prediction: broadcast this row's left pixel (r3*2 assumes FDEC_STRIDE == 2*FENC_STRIDE)
+    mova    m0, [r0+r3]
+    psadbw  m0, m7
+    paddw   m4, m0                  ; DC score += SAD(row, DC)
+    mova    m0, [r0+r3]
+    psadbw  m0, m5
+    paddw   m2, m0                  ; V score += SAD(row, top row)
+%if mmsize==8
+    mova    m0, [r0+r3]
+    psadbw  m0, m6
+    paddw   m3, m0                  ; H score, left 8 pixels
+    mova    m0, [r0+r3+8]
+    psadbw  m0, m7
+    paddw   m4, m0                  ; DC score, right 8 pixels
+    mova    m0, [r0+r3+8]
+    psadbw  m0, m1
+    paddw   m2, m0                  ; V score, right 8 pixels
+    psadbw  m6, [r0+r3+8]
+    paddw   m3, m6                  ; H score, right 8 pixels (m6 is reloaded next iteration)
+%else
+    psadbw  m6, [r0+r3]
+    paddw   m3, m6                  ; H score += SAD(row, left pixel)
+%endif
+    add     r3d, -FENC_STRIDE
+    jge .vloop                      ; 16 iterations: exits once the row offset goes negative
+%if mmsize==16
+    pslldq  m3, 4                   ; shift the two H partial sums up into dwords 1 and 3
+    por     m3, m2                  ; m3 dwords = { V_lo, H_lo, V_hi, H_hi }
+    movhlps m1, m3
+    paddw   m3, m1                  ; fold the high-half partial sums into the low half
+    movq  [r2+0], m3                ; res[0] = V score, res[1] = H score
+    movhlps m1, m4
+    paddw   m4, m1                  ; fold the two DC partial sums
+%else
+    movd  [r2+0], m2                ; res[0] = V score
+    movd  [r2+4], m3                ; res[1] = H score
+%endif
+    movd  [r2+8], m4                ; res[2] = DC score
+    RET
+%endmacro
+
+INIT_MMX ; 8-byte build (the mmsize==8 paths) -- NOTE(review): relies on x86inc's INIT_MMX, not shown here
+%define SPLATB SPLATB_MMX
+INTRA_SAD16 mmxext
+INIT_XMM ; 16-byte builds follow; the sse2 one still expands SPLATB_MMX (its mmsize==16 path)
+INTRA_SAD16 sse2
+%define SPLATB SPLATB_SSSE3
+INTRA_SAD16 ssse3
+
+
+
 ;=============================================================================
 ; SAD x3/x4 MMX
 ;=============================================================================
diff --git a/common/x86/x86util.asm b/common/x86/x86util.asm
index d4eec429..e61d1381 100644
--- a/common/x86/x86util.asm
+++ b/common/x86/x86util.asm
@@ -131,6 +131,22 @@
     ABS2 %3, %4, %5, %6
 %endmacro
 
+%macro SPLATB_MMX 3 ; %1 = dst reg, %2 = address of the byte to broadcast, %3 = unused here (SPLATB_SSSE3 uses it as a mask)
+    movd      %1, [%2-3] ; load the 4 bytes ending at %2 (wanted byte lands in byte 3) to avoid crossing a cacheline
+    punpcklbw %1, %1     ; duplicate each byte, so word 3 holds the wanted byte twice
+%if mmsize==16
+    pshuflw   %1, %1, 0xff ; broadcast word 3 across the low 4 words
+    movlhps   %1, %1       ; copy the low 8 bytes into the high 8 bytes
+%else
+    pshufw    %1, %1, 0xff ; broadcast word 3 across all 4 words
+%endif
+%endmacro
+
+%macro SPLATB_SSSE3 3 ; %1 = dst reg, %2 = address of the byte to broadcast, %3 = pshufb mask register
+    movd      %1, [%2-3] ; load the 4 bytes ending at %2, so the wanted byte lands in byte 3
+    pshufb    %1, %3     ; broadcasts byte 3 only if every byte of %3 is 3 (the caller must supply such a mask)
+%endmacro
+
 %macro PALIGNR_MMX 4
     %ifnidn %4, %2
     mova    %4, %2
@@ -221,3 +237,4 @@
     packuswb   %1, %1
     movh       %4, %1
 %endmacro
+
diff --git a/encoder/analyse.c b/encoder/analyse.c
index d9e77a32..10399690 100644
--- a/encoder/analyse.c
+++ b/encoder/analyse.c
@@ -544,7 +544,7 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_
     int i, idx;
     int i_max;
     int predict_mode[9];
-    int b_merged_satd = h->pixf.intra_satd_x3_16x16 && h->pixf.mbcmp[0] == h->pixf.satd[0];
+    int b_merged_satd = !!h->pixf.intra_mbcmp_x3_16x16;
 
     /*---------------- Try all mode and calculate their score ---------------*/
 
@@ -553,7 +553,7 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_
 
     if( b_merged_satd && i_max == 4 )
     {
-        h->pixf.intra_satd_x3_16x16( p_src, p_dst, a->i_satd_i16x16_dir );
+        h->pixf.intra_mbcmp_x3_16x16( p_src, p_dst, a->i_satd_i16x16_dir );
         h->predict_16x16[I_PRED_16x16_P]( p_dst );
         a->i_satd_i16x16_dir[I_PRED_16x16_P] =
             h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE );
diff --git a/encoder/encoder.c b/encoder/encoder.c
index 584aef65..92bc699c 100644
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -585,6 +585,7 @@ static void mbcmp_init( x264_t *h )
     int satd = !h->mb.b_lossless && h->param.analyse.i_subpel_refine > 1;
     memcpy( h->pixf.mbcmp, satd ? h->pixf.satd : h->pixf.sad_aligned, sizeof(h->pixf.mbcmp) );
     memcpy( h->pixf.mbcmp_unaligned, satd ? h->pixf.satd : h->pixf.sad, sizeof(h->pixf.mbcmp_unaligned) );
+    h->pixf.intra_mbcmp_x3_16x16 = satd ? h->pixf.intra_satd_x3_16x16 : h->pixf.intra_sad_x3_16x16;
     satd &= h->param.analyse.i_me_method == X264_ME_TESA;
     memcpy( h->pixf.fpelcmp, satd ? h->pixf.satd : h->pixf.sad, sizeof(h->pixf.fpelcmp) );
     memcpy( h->pixf.fpelcmp_x3, satd ? h->pixf.satd_x3 : h->pixf.sad_x3, sizeof(h->pixf.fpelcmp_x3) );
diff --git a/tools/checkasm.c b/tools/checkasm.c
index 9f892859..8700df2e 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -324,7 +324,7 @@ static int check_pixel( int cpu_ref, int cpu_new )
     TEST_PIXEL_VAR( PIXEL_8x8 );
     report( "pixel var :" );
 
-#define TEST_INTRA_SATD( name, pred, satd, i8x8, ... ) \
+#define TEST_INTRA_MBCMP( name, pred, satd, i8x8, ... ) \
     if( pixel_asm.name && pixel_asm.name != pixel_ref.name ) \
     { \
         int res_c[3], res_asm[3]; \
@@ -333,10 +333,10 @@ static int check_pixel( int cpu_ref, int cpu_new )
         memcpy( buf3, buf2, 1024 ); \
         for( i=0; i<3; i++ ) \
         { \
-            pred[i]( buf3+40, ##__VA_ARGS__ ); \
-            res_c[i] = pixel_c.satd( buf1+40, 16, buf3+40, 32 ); \
+            pred[i]( buf3+48, ##__VA_ARGS__ ); \
+            res_c[i] = pixel_c.satd( buf1+48, 16, buf3+48, 32 ); \
         } \
-        call_a( pixel_asm.name, buf1+40, i8x8 ? edge : buf3+40, res_asm ); \
+        call_a( pixel_asm.name, buf1+48, i8x8 ? edge : buf3+48, res_asm ); \
         if( memcmp(res_c, res_asm, sizeof(res_c)) ) \
         { \
             ok = 0; \
@@ -347,11 +347,13 @@ static int check_pixel( int cpu_ref, int cpu_new )
     }
 
     ok = 1; used_asm = 0;
-    TEST_INTRA_SATD( intra_satd_x3_16x16, predict_16x16, satd[PIXEL_16x16], 0 );
-    TEST_INTRA_SATD( intra_satd_x3_8x8c, predict_8x8c, satd[PIXEL_8x8], 0 );
-    TEST_INTRA_SATD( intra_satd_x3_4x4, predict_4x4, satd[PIXEL_4x4], 0 );
-    TEST_INTRA_SATD( intra_sa8d_x3_8x8, predict_8x8, sa8d[PIXEL_8x8], 1, edge );
+    TEST_INTRA_MBCMP( intra_satd_x3_16x16, predict_16x16, satd[PIXEL_16x16], 0 );
+    TEST_INTRA_MBCMP( intra_satd_x3_8x8c , predict_8x8c , satd[PIXEL_8x8]  , 0 );
+    TEST_INTRA_MBCMP( intra_satd_x3_4x4  , predict_4x4  , satd[PIXEL_4x4]  , 0 );
+    TEST_INTRA_MBCMP( intra_sa8d_x3_8x8  , predict_8x8  , sa8d[PIXEL_8x8]  , 1, edge );
     report( "intra satd_x3 :" );
+    TEST_INTRA_MBCMP( intra_sad_x3_16x16 , predict_16x16, sad [PIXEL_16x16], 0 );
+    report( "intra sad_x3 :" );
 
     if( pixel_asm.ssim_4x4x2_core != pixel_ref.ssim_4x4x2_core ||
         pixel_asm.ssim_end4 != pixel_ref.ssim_end4 )
-- 
2.40.0