Add sad_aligned for faster subme=1 mbcmp

author Fiona Glaser <fiona@x264.com>
Wed, 3 Sep 2008 22:15:17 +0000 (15:15 -0700)
committer Fiona Glaser <fiona@x264.com>
Fri, 5 Sep 2008 18:34:20 +0000 (11:34 -0700)
diff --git a/common/pixel.c b/common/pixel.c

index 27575b5ca0f40f2bb2ffae94a33b3f1d99f2396f..3e28d8576235b9c3320d25051e1be24cd28e4336 100644 (file)
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -529,20 +529,24 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
  {
      memset( pixf, 0, sizeof(*pixf) );
  
-#define INIT2( name, cpu ) \
-    pixf->name[PIXEL_16x16] = x264_pixel_##name##_16x16##cpu;\
-    pixf->name[PIXEL_16x8]  = x264_pixel_##name##_16x8##cpu;
-#define INIT4( name, cpu ) \
-    INIT2( name, cpu ) \
-    pixf->name[PIXEL_8x16]  = x264_pixel_##name##_8x16##cpu;\
-    pixf->name[PIXEL_8x8]   = x264_pixel_##name##_8x8##cpu;
-#define INIT5( name, cpu ) \
-    INIT4( name, cpu ) \
-    pixf->name[PIXEL_8x4]   = x264_pixel_##name##_8x4##cpu;
-#define INIT7( name, cpu ) \
-    INIT5( name, cpu ) \
-    pixf->name[PIXEL_4x8]   = x264_pixel_##name##_4x8##cpu;\
-    pixf->name[PIXEL_4x4]   = x264_pixel_##name##_4x4##cpu;
+#define INIT2_NAME( name1, name2, cpu ) \
+    pixf->name1[PIXEL_16x16] = x264_pixel_##name2##_16x16##cpu;\
+    pixf->name1[PIXEL_16x8]  = x264_pixel_##name2##_16x8##cpu;
+#define INIT4_NAME( name1, name2, cpu ) \
+    INIT2_NAME( name1, name2, cpu ) \
+    pixf->name1[PIXEL_8x16]  = x264_pixel_##name2##_8x16##cpu;\
+    pixf->name1[PIXEL_8x8]   = x264_pixel_##name2##_8x8##cpu;
+#define INIT5_NAME( name1, name2, cpu ) \
+    INIT4_NAME( name1, name2, cpu ) \
+    pixf->name1[PIXEL_8x4]   = x264_pixel_##name2##_8x4##cpu;
+#define INIT7_NAME( name1, name2, cpu ) \
+    INIT5_NAME( name1, name2, cpu ) \
+    pixf->name1[PIXEL_4x8]   = x264_pixel_##name2##_4x8##cpu;\
+    pixf->name1[PIXEL_4x4]   = x264_pixel_##name2##_4x4##cpu;
+#define INIT2( name, cpu ) INIT2_NAME( name, name, cpu )
+#define INIT4( name, cpu ) INIT4_NAME( name, name, cpu )
+#define INIT5( name, cpu ) INIT5_NAME( name, name, cpu )
+#define INIT7( name, cpu ) INIT7_NAME( name, name, cpu )
  
  #define INIT_ADS( cpu ) \
      pixf->ads[PIXEL_16x16] = x264_pixel_ads4##cpu;\
@@ -550,6 +554,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
      pixf->ads[PIXEL_8x8] = x264_pixel_ads1##cpu;
  
      INIT7( sad, );
+    INIT7_NAME( sad_aligned, sad, );
      INIT7( sad_x3, );
      INIT7( sad_x4, );
      INIT7( ssd, );
@@ -574,6 +579,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
      if( cpu&X264_CPU_MMXEXT )
      {
          INIT7( sad, _mmxext );
+        INIT7_NAME( sad_aligned, sad, _mmxext );
          INIT7( sad_x3, _mmxext );
          INIT7( sad_x4, _mmxext );
          INIT7( satd, _mmxext );
@@ -640,6 +646,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
          INIT5( satd, _sse2 );
          INIT5( satd_x3, _sse2 );
          INIT5( satd_x4, _sse2 );
+        INIT2_NAME( sad_aligned, sad, _sse2_aligned );
          pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_sse2;
          pixf->ssim_4x4x2_core  = x264_pixel_ssim_4x4x2_core_sse2;
          pixf->ssim_end4        = x264_pixel_ssim_end4_sse2;
diff --git a/common/pixel.h b/common/pixel.h

index fd23680df206353c268f82dd4a254d6d3c68dbbf..127f89ddb9cb6c6a4b97e9e9c05d7022d6998ce5 100644 (file)
--- a/common/pixel.h
+++ b/common/pixel.h
@@ -69,10 +69,12 @@ typedef struct
      x264_pixel_cmp_t ssim[7];
      x264_pixel_cmp_t sa8d[4];
      x264_pixel_cmp_t mbcmp[7]; /* either satd or sad for subpel refine and mode decision */
+    x264_pixel_cmp_t mbcmp_unaligned[7]; /* unaligned mbcmp for subpel */
      x264_pixel_cmp_t fpelcmp[7]; /* either satd or sad for fullpel motion search */
      x264_pixel_cmp_x3_t fpelcmp_x3[7];
      x264_pixel_cmp_x4_t fpelcmp_x4[7];
      x264_pixel_var_t var[4];
+    x264_pixel_cmp_t sad_aligned[7]; /* Aligned SAD for mbcmp */
  
      void (*ssim_4x4x2_core)( const uint8_t *pix1, int stride1,
                               const uint8_t *pix2, int stride2, int sums[2][4] );
diff --git a/common/x86/pixel.h b/common/x86/pixel.h

index 9326a84a720d8008147d5fd939a571eefdd49f3a..cbb37dda6409a1372385d927e2becac4d4d34be1 100644 (file)
--- a/common/x86/pixel.h
+++ b/common/x86/pixel.h
@@ -43,6 +43,7 @@
  DECL_X1( sad, mmxext )
  DECL_X1( sad, sse2 )
  DECL_X1( sad, sse3 )
+DECL_X1( sad, sse2_aligned )
  DECL_X4( sad, mmxext )
  DECL_X4( sad, sse2 )
  DECL_X4( sad, sse3 )
diff --git a/common/x86/sad-a.asm b/common/x86/sad-a.asm

index a95450067ea6e4e1c5a59180113114c4395b31f6..2c167221671159f46d1de1e38e2c6f19c8be6333 100644 (file)
--- a/common/x86/sad-a.asm
+++ b/common/x86/sad-a.asm
@@ -215,7 +215,9 @@ cglobal x264_pixel_sad_16x8_%1, 4,4
  SAD_W16 sse2
  %define movdqu lddqu
  SAD_W16 sse3
-%undef movdqu
+%define movdqu movdqa
+SAD_W16 sse2_aligned
+%define movdqu movups
  
  
  
diff --git a/encoder/analyse.c b/encoder/analyse.c

index cd7f745a80d7238bc4694261022f961538f35fbc..d9e77a32937f18184fecb2470c4fb4a8e32b58ee 100644 (file)
--- a/encoder/analyse.c
+++ b/encoder/analyse.c
@@ -588,7 +588,7 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_
      if( flags & X264_ANALYSE_I8x8 )
      {
          DECLARE_ALIGNED_16( uint8_t edge[33] );
-        x264_pixel_cmp_t sa8d = (*h->pixf.mbcmp == *h->pixf.sad) ? h->pixf.sad[PIXEL_8x8] : h->pixf.sa8d[PIXEL_8x8];
+        x264_pixel_cmp_t sa8d = (h->pixf.mbcmp[0] == h->pixf.satd[0]) ? h->pixf.sa8d[PIXEL_8x8] : h->pixf.mbcmp[PIXEL_8x8];
          int i_satd_thresh = a->b_mbrd ? COST_MAX : X264_MIN( i_satd_inter, a->i_satd_i16x16 );
          int i_cost = 0;
          b_merged_satd = h->pixf.intra_sa8d_x3_8x8 && h->pixf.mbcmp[0] == h->pixf.satd[0];
diff --git a/encoder/encoder.c b/encoder/encoder.c

index d991a5e29c28551d4d9d079c29311a741c1d1a13..584aef6595f8efb3b3cb0edd236e18730ea13264 100644 (file)
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -583,7 +583,8 @@ static int x264_validate_parameters( x264_t *h )
  static void mbcmp_init( x264_t *h )
  {
      int satd = !h->mb.b_lossless && h->param.analyse.i_subpel_refine > 1;
-    memcpy( h->pixf.mbcmp, satd ? h->pixf.satd : h->pixf.sad, sizeof(h->pixf.mbcmp) );
+    memcpy( h->pixf.mbcmp, satd ? h->pixf.satd : h->pixf.sad_aligned, sizeof(h->pixf.mbcmp) );
+    memcpy( h->pixf.mbcmp_unaligned, satd ? h->pixf.satd : h->pixf.sad, sizeof(h->pixf.mbcmp_unaligned) );
      satd &= h->param.analyse.i_me_method == X264_ME_TESA;
      memcpy( h->pixf.fpelcmp, satd ? h->pixf.satd : h->pixf.sad, sizeof(h->pixf.fpelcmp) );
      memcpy( h->pixf.fpelcmp_x3, satd ? h->pixf.satd_x3 : h->pixf.sad_x3, sizeof(h->pixf.fpelcmp_x3) );
diff --git a/encoder/me.c b/encoder/me.c

index 44f6f6db227fd3674faa47bbbcbf0cfbc8b52249..63c57863b51ecb5c5771507769e54cb978b03626 100644 (file)
--- a/encoder/me.c
+++ b/encoder/me.c
@@ -663,7 +663,7 @@ if( b_refine_qpel || (dir^1) != odir ) \
  { \
      int stride = 16; \
      uint8_t *src = h->mc.get_ref( pix[0], &stride, m->p_fref, m->i_stride[0], mx, my, bw, bh ); \
-    int cost = h->pixf.mbcmp[i_pixel]( m->p_fenc[0], FENC_STRIDE, src, stride ) \
+    int cost = h->pixf.mbcmp_unaligned[i_pixel]( m->p_fenc[0], FENC_STRIDE, src, stride ) \
               + p_cost_mvx[ mx ] + p_cost_mvy[ my ]; \
      if( b_chroma_me && cost < bcost ) \
      { \
@@ -904,7 +904,7 @@ int x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_weight
  { \
      int stride = 16; \
      uint8_t *src = h->mc.get_ref( pix, &stride, m->p_fref, m->i_stride[0], mx, my, bw*4, bh*4 ); \
-    dst = h->pixf.mbcmp[i_pixel]( m->p_fenc[0], FENC_STRIDE, src, stride ) \
+    dst = h->pixf.mbcmp_unaligned[i_pixel]( m->p_fenc[0], FENC_STRIDE, src, stride ) \
          + p_cost_mvx[mx] + p_cost_mvy[my]; \
      COPY1_IF_LT( bsatd, dst ); \
  }
diff --git a/tools/checkasm.c b/tools/checkasm.c

index f13d6e05be050c50aa0a94ed97fa56e02e0aa6b1..9f8928598265e822efbc9d25f81c013c3f02cbdc 100644 (file)
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -258,6 +258,7 @@ static int check_pixel( int cpu_ref, int cpu_new )
      report( "pixel " #name " :" );
  
      TEST_PIXEL( sad, 0 );
+    TEST_PIXEL( sad_aligned, 1 );
      TEST_PIXEL( ssd, 1 );
      TEST_PIXEL( satd, 0 );
      TEST_PIXEL( sa8d, 0 );
author	Fiona Glaser <fiona@x264.com>
	Wed, 3 Sep 2008 22:15:17 +0000 (15:15 -0700)
committer	Fiona Glaser <fiona@x264.com>
	Fri, 5 Sep 2008 18:34:20 +0000 (11:34 -0700)
common/pixel.c		patch | blob | history
common/pixel.h		patch | blob | history
common/x86/pixel.h		patch | blob | history
common/x86/sad-a.asm		patch | blob | history
encoder/analyse.c		patch | blob | history
encoder/encoder.c		patch | blob | history
encoder/me.c		patch | blob | history
tools/checkasm.c		patch | blob | history