Modify pixel_var to return both the sum and the sum of squared pixels (packed in a uint64_t), and use it for weight analysis instead of separate sad/ssd calls.
Various minor cosmetics.
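
A sketch of the new convention (illustrative only, not part of the patch): pixel_var now packs the sum of pixels into the low 32 bits of the returned uint64_t and the sum of squared pixels into the high 32 bits, and callers derive variance or AC energy from those two values. The helper name below is hypothetical; ac_energy_plane in ratecontrol.c does exactly this inline, while weights_plane_analyse in slicetype.c accumulates the two sums over a whole plane first.

    #include <stdint.h>

    /* hypothetical helper: unpack a pixel_var result and compute the block's AC energy */
    static inline uint32_t var_to_ac_energy( uint64_t res, int shift )
    {
        uint32_t sum = (uint32_t)res;         /* sum of pixels */
        uint32_t sqr = (uint32_t)(res >> 32); /* sum of squared pixels */
        /* shift = log2(pixel count): 8 for 16x16, 6 for 8x8 */
        return sqr - (sum * sum >> shift);
    }
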
VAR_SQR_SUM q1, q9, q14, d24
vld1.64 {d26}, [r0,:64], r1
VAR_SQR_SUM q2, q10, q15, d26
-
- mov r2, #6
b x264_var_end
.endfunc
VAR_SQR_SUM q2, q13, q15, d19, vpaddl.u16
mov ip, #7
- mov r2, #8
var16_loop:
subs ip, ip, #1
vld1.64 {d16-d17}, [r0,:128], r1
vpadd.u32 d0, d0, d2
vmov r0, r1, d0
- mul r0, r0, r0
- sub r0, r1, r0, lsr r2
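+ @ r0 = sum, r1 = sqr: the packed uint64_t return value ((sqr << 32) + sum) in r0/r1 per the AAPCS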
bx lr
.endfunc
int x264_pixel_sa8d_8x8_neon( uint8_t *, int, uint8_t *, int );
int x264_pixel_sa8d_16x16_neon( uint8_t *, int, uint8_t *, int );
-int x264_pixel_var_8x8_neon( uint8_t *, int );
-int x264_pixel_var_16x16_neon( uint8_t *, int );
+uint64_t x264_pixel_var_8x8_neon( uint8_t *, int );
+uint64_t x264_pixel_var_16x16_neon( uint8_t *, int );
int x264_pixel_var2_8x8_neon( uint8_t *, int, uint8_t *, int, int * );
uint64_t x264_pixel_hadamard_ac_8x8_neon( uint8_t *, int );
/****************************************************************************
* pixel_var_wxh
****************************************************************************/
-#define PIXEL_VAR_C( name, w, shift ) \
-static int name( uint8_t *pix, int i_stride ) \
+#define PIXEL_VAR_C( name, w ) \
+static uint64_t name( uint8_t *pix, int i_stride ) \
{ \
- uint32_t var = 0, sum = 0, sqr = 0; \
+ uint32_t sum = 0, sqr = 0; \
int x, y; \
for( y = 0; y < w; y++ ) \
{ \
} \
pix += i_stride; \
} \
- var = sqr - (sum * sum >> shift); \
- return var; \
+ return sum + ((uint64_t)sqr << 32); \
}
-PIXEL_VAR_C( x264_pixel_var_16x16, 16, 8 )
-PIXEL_VAR_C( x264_pixel_var_8x8, 8, 6 )
+PIXEL_VAR_C( x264_pixel_var_16x16, 16 )
+PIXEL_VAR_C( x264_pixel_var_8x8, 8 )
/****************************************************************************
* pixel_var2_wxh
x264_pixel_cmp_t sad_aligned[7]; /* Aligned SAD for mbcmp */
int (*var2_8x8)( uint8_t *, int, uint8_t *, int, int * );
- int (*var[4])( uint8_t *pix, int stride );
+ uint64_t (*var[4])( uint8_t *pix, int stride );
uint64_t (*hadamard_ac[4])( uint8_t *pix, int stride );
void (*ssim_4x4x2_core)( const uint8_t *pix1, int stride1,
/****************************************************************************
* variance
****************************************************************************/
-static int x264_pixel_var_16x16_altivec( uint8_t *pix, int i_stride )
+static uint64_t x264_pixel_var_16x16_altivec( uint8_t *pix, int i_stride )
{
ALIGNED_16(uint32_t sum_tab[4]);
ALIGNED_16(uint32_t sqr_tab[4]);
uint32_t sum = sum_tab[3];
uint32_t sqr = sqr_tab[3];
- uint32_t var = sqr - (sum * sum >> 8);
- return var;
+ return sum + ((uint64_t)sqr<<32);
}
-static int x264_pixel_var_8x8_altivec( uint8_t *pix, int i_stride )
+static uint64_t x264_pixel_var_8x8_altivec( uint8_t *pix, int i_stride )
{
ALIGNED_16(uint32_t sum_tab[4]);
ALIGNED_16(uint32_t sqr_tab[4]);
uint32_t sum = sum_tab[3];
uint32_t sqr = sqr_tab[3];
- uint32_t var = sqr - (sum * sum >> 6);
- return var;
+ return sum + ((uint64_t)sqr<<32);
}
%endif
%endmacro
-%macro VAR_END 1
+%macro VAR_END 0
HADDW m5, m7
- movd r1d, m5
- imul r1d, r1d
+ movd eax, m5
HADDD m6, m1
- shr r1d, %1
- movd eax, m6
- sub eax, r1d ; sqr - (sum * sum >> shift)
+ movd edx, m6
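+ ; sum is in eax, sqr in edx: on x86_32 this is already the uint64_t return in edx:eax; on x86_64 pack it into rax below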
+%ifdef ARCH_X86_64
+ shl rdx, 32
+ add rax, rdx
+%endif
RET
%endmacro
cglobal x264_pixel_var_16x16_mmxext, 2,3
VAR_START 0
VAR_2ROW 8, 16
- VAR_END 8
+ VAR_END
cglobal x264_pixel_var_8x8_mmxext, 2,3
VAR_START 0
VAR_2ROW r1, 4
- VAR_END 6
+ VAR_END
INIT_XMM
cglobal x264_pixel_var_16x16_sse2, 2,3,8
VAR_CORE
dec r2d
jg .loop
- VAR_END 8
+ VAR_END
cglobal x264_pixel_var_8x8_sse2, 2,4,8
VAR_START 1
VAR_CORE
dec r2d
jg .loop
- VAR_END 6
+ VAR_END
%macro VAR2_END 0
HADDW m5, m7
DECL_X4( sad, cache64_sse2 );
DECL_X4( sad, cache64_ssse3 );
-DECL_PIXELS( int, var, mmxext, ( uint8_t *pix, int i_stride ))
-DECL_PIXELS( int, var, sse2, ( uint8_t *pix, int i_stride ))
+DECL_PIXELS( uint64_t, var, mmxext, ( uint8_t *pix, int i_stride ))
+DECL_PIXELS( uint64_t, var, sse2, ( uint8_t *pix, int i_stride ))
DECL_PIXELS( uint64_t, hadamard_ac, mmxext, ( uint8_t *pix, int i_stride ))
DECL_PIXELS( uint64_t, hadamard_ac, sse2, ( uint8_t *pix, int i_stride ))
DECL_PIXELS( uint64_t, hadamard_ac, ssse3, ( uint8_t *pix, int i_stride ))
+ rce->misc_bits;
}
+static ALWAYS_INLINE uint32_t ac_energy_plane( x264_t *h, int mb_x, int mb_y, x264_frame_t *frame, int i )
+{
+ int w = i ? 8 : 16;
+ int shift = i ? 6 : 8;
+ int stride = frame->i_stride[i];
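+ /* interlaced: start at the MB pair, offset one row for the bottom-field MB; stride is doubled below so var reads a single field */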
+ int offset = h->mb.b_interlaced
+ ? w * (mb_x + (mb_y&~1) * stride) + (mb_y&1) * stride
+ : w * (mb_x + mb_y * stride);
+ int pix = i ? PIXEL_8x8 : PIXEL_16x16;
+ stride <<= h->mb.b_interlaced;
+ uint64_t res = h->pixf.var[pix]( frame->plane[i] + offset, stride );
+ uint32_t sum = (uint32_t)res;
+ uint32_t sqr = res >> 32;
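+ /* AC energy = sum of squares minus the DC term (sum^2 >> log2(pixel count)) */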
+ return sqr - (sum * sum >> shift);
+}
+
// Find the total AC energy of the block in all planes.
static NOINLINE uint32_t ac_energy_mb( x264_t *h, int mb_x, int mb_y, x264_frame_t *frame )
{
* and putting it after floating point ops. As a result, we put the emms at the end of the
* function and make sure that it's always called before the float math. Noinline makes
* sure no reordering goes on. */
- uint32_t var = 0, i;
- for( i = 0; i < 3; i++ )
- {
- int w = i ? 8 : 16;
- int stride = frame->i_stride[i];
- int offset = h->mb.b_interlaced
- ? w * (mb_x + (mb_y&~1) * stride) + (mb_y&1) * stride
- : w * (mb_x + mb_y * stride);
- int pix = i ? PIXEL_8x8 : PIXEL_16x16;
- stride <<= h->mb.b_interlaced;
- var += h->pixf.var[pix]( frame->plane[i]+offset, stride );
- }
+ uint32_t var = ac_energy_plane( h, mb_x, mb_y, frame, 0 );
+ var += ac_energy_plane( h, mb_x, mb_y, frame, 1 );
+ var += ac_energy_plane( h, mb_x, mb_y, frame, 2 );
x264_emms();
return var;
}
}
w->i_scale = X264_MIN( w->i_scale, 127 );
}
-/* due to a GCC bug on some platforms (win32), flat[16] may not actually be aligned. */
-ALIGNED_16( static uint8_t flat[17] ) = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1};
static NOINLINE void weights_plane_analyse( x264_t *h, uint8_t *plane, int width, int height, int stride, unsigned int *sum, uint64_t *var )
{
int x,y;
- unsigned int sad = 0;
+ uint64_t sad = 0;
uint64_t ssd = 0;
uint8_t *p = plane;
for( y = 0; y < height>>4; y++, p += stride*16 )
for( x = 0; x < width; x+=16 )
{
- sad += h->pixf.sad_aligned[PIXEL_16x16]( p + x, stride, flat, 0 );
- ssd += h->pixf.ssd[PIXEL_16x16]( p + x, stride, flat, 0 );
+ uint64_t res = h->pixf.var[PIXEL_16x16]( p + x, stride );
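+ /* low 32 bits: sum of pixels; high 32 bits: sum of squared pixels */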
+ sad += (uint32_t)res;
+ ssd += res >> 32;
}
*sum = sad;
- *var = ssd - (uint64_t) sad * sad / (width * height);
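+ /* "sad" now holds the pixel sum; variance = ssd - sum^2 / num_pixels, rounded to nearest */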
+ *var = ssd - ((uint64_t)sad * sad + width * height / 2) / (width * height);
x264_emms();
}
int i_lines = fenc->i_lines_lowres;
int i_width = fenc->i_width_lowres;
uint8_t *fenc_plane = fenc->lowres[0];
- ALIGNED_ARRAY_16( uint8_t, buf, [8*8] );
+ ALIGNED_8( uint8_t buf[8*8] );
int pixoff = 0;
int i_mb = 0;
if( w )
+ {
for( y = 0; y < i_lines; y += 8, pixoff = y*i_stride )
for( x = 0; x < i_width; x += 8, i_mb++, pixoff += 8)
{
w->weightfn[8>>2]( buf, 8, &src[pixoff], i_stride, w, 8 );
cost += X264_MIN( h->pixf.mbcmp[PIXEL_8x8]( buf, 8, &fenc_plane[pixoff], i_stride ), fenc->i_intra_cost[i_mb] );
}
- else
- for( y = 0; y < i_lines; y += 8, pixoff = y*i_stride )
- for( x = 0; x < i_width; x += 8, i_mb++, pixoff += 8 )
- cost += X264_MIN( h->pixf.mbcmp[PIXEL_8x8]( &src[pixoff], i_stride, &fenc_plane[pixoff], i_stride ), fenc->i_intra_cost[i_mb] );
-
- if( w )
- {
+ /* Add cost of weights in the slice header. */
int numslices;
if( h->param.i_slice_count )
numslices = h->param.i_slice_count;
else if( h->param.i_slice_max_mbs )
numslices = (h->sps->i_mb_width * h->sps->i_mb_height + h->param.i_slice_max_mbs-1) / h->param.i_slice_max_mbs;
else
numslices = 1;
- // FIXME still need to calculate for --slice-max-size
- // Multiply by 2 as there will be a duplicate. 10 bits added as if there is a weighted frame, then an additional duplicate is used.
- // Since using lowres frames, assume lambda = 1.
+ /* FIXME: find a way to account for --slice-max-size?
+ * Multiply by 2 as there will be a duplicate; 10 bits are added because, if there is a weighted frame, an additional duplicate is used.
+ * Since using lowres frames, assume lambda = 1. */
cost += numslices * ( 10 + 2 * ( bs_size_ue( w[0].i_denom ) + bs_size_se( w[0].i_scale ) + bs_size_se( w[0].i_offset ) ) );
}
+ else
+ for( y = 0; y < i_lines; y += 8, pixoff = y*i_stride )
+ for( x = 0; x < i_width; x += 8, i_mb++, pixoff += 8 )
+ cost += X264_MIN( h->pixf.mbcmp[PIXEL_8x8]( &src[pixoff], i_stride, &fenc_plane[pixoff], i_stride ), fenc->i_intra_cost[i_mb] );
x264_emms();
return cost;
}
int i_delta_index = fenc->i_frame - ref->i_frame - 1;
/* epsilon is chosen to require at least a numerator of 127 (with denominator = 128) */
const float epsilon = 1.0/128.0;
-
float guess_scale;
int found;
x264_weight_t *weights = fenc->weight[0];
weights_plane_analyse( h, fenc->plane[0], fenc->i_width[0], fenc->i_lines[0], fenc->i_stride[0], &fenc_sum, &fenc_var );
- weights_plane_analyse( h, ref->plane[0], ref->i_width[0], ref->i_lines[0], ref->i_stride[0], &ref_sum, &ref_var );
+ weights_plane_analyse( h, ref->plane[0], ref->i_width[0], ref->i_lines[0], ref->i_stride[0], &ref_sum, &ref_var );
fenc_var = round( sqrt( fenc_var ) );
- ref_var = round( sqrt( ref_var ) );
+ ref_var = round( sqrt( ref_var ) );
fenc_mean = (float)fenc_sum / (fenc->i_lines[0] * fenc->i_width[0]);
- ref_mean = (float)ref_sum / (fenc->i_lines[0] * fenc->i_width[0]);
+ ref_mean = (float) ref_sum / (fenc->i_lines[0] * fenc->i_width[0]);
//early termination
if( fabs( ref_mean - fenc_mean ) < 0.5 && fabsf( 1 - (float)fenc_var / ref_var ) < epsilon )
x264_emms();
/* FIXME: More analysis can be done here on SAD vs. SATD termination. */
- /* 0.2% termination derived experimentally to avoid weird weights in frames that are mostly intra. */
+ /* 0.2% termination derived experimentally to avoid weird weights in frames that are mostly intra. */
if( !found || (minscale == 1<<mindenom && minoff == 0) || (float)minscore / origscore > 0.998 )
{
SET_WEIGHT( weights[0], 0, 1, 0, 0 );
#define TEST_PIXEL_VAR( i ) \
if( pixel_asm.var[i] != pixel_ref.var[i] ) \
{ \
- int res_c, res_asm; \
set_func_name( "%s_%s", "var", pixel_names[i] ); \
used_asm = 1; \
- res_c = call_c( pixel_c.var[i], buf1, 16 ); \
- res_asm = call_a( pixel_asm.var[i], buf1, 16 ); \
+ /* abi-check wrapper can't return uint64_t, so separate it from return value check */\
+ call_c1( pixel_c.var[i], buf1, 16 ); \
+ call_a1( pixel_asm.var[i], buf1, 16 ); \
+ uint64_t res_c = pixel_c.var[i]( buf1, 16 ); \
+ uint64_t res_asm = pixel_asm.var[i]( buf1, 16 ); \
if( res_c != res_asm ) \
{ \
ok = 0; \
- fprintf( stderr, "var[%d]: %d != %d [FAILED]\n", i, res_c, res_asm ); \
+ fprintf( stderr, "var[%d]: %d %d != %d %d [FAILED]\n", i, (int)res_c, (int)(res_c>>32), (int)res_asm, (int)(res_asm>>32) ); \
} \
+ call_c2( pixel_c.var[i], buf1, 16 ); \
+ call_a2( pixel_asm.var[i], buf1, 16 ); \
}
ok = 1; used_asm = 0;
for( j=0; j<32; j++ )
{
uint8_t *pix = (j&16 ? buf1 : buf3) + (j&15)*256;
+ call_c1( pixel_c.hadamard_ac[i], buf1, 16 );
+ call_a1( pixel_asm.hadamard_ac[i], buf1, 16 );
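+ /* like var above: the checked wrappers can't return a uint64_t, so the results come from direct calls */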
uint64_t rc = pixel_c.hadamard_ac[i]( pix, 16 );
uint64_t ra = pixel_asm.hadamard_ac[i]( pix, 16 );
if( rc != ra )