From: Fiona Glaser Date: Mon, 16 Nov 2009 23:23:58 +0000 (-0800) Subject: Faster weightp analysis X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=63f7147714b37f1779dcf62138f21771368cb8e8;p=libx264 Faster weightp analysis Modify pixel_var slightly to return the necessary information and use it for weight analysis instead of sad/ssd. Various minor cosmetics. --- diff --git a/common/arm/pixel-a.S b/common/arm/pixel-a.S index ca406acd..4dd65ede 100644 --- a/common/arm/pixel-a.S +++ b/common/arm/pixel-a.S @@ -512,8 +512,6 @@ function x264_pixel_var_8x8_neon VAR_SQR_SUM q1, q9, q14, d24 vld1.64 {d26}, [r0,:64], r1 VAR_SQR_SUM q2, q10, q15, d26 - - mov r2, #6 b x264_var_end .endfunc @@ -529,7 +527,6 @@ function x264_pixel_var_16x16_neon VAR_SQR_SUM q2, q13, q15, d19, vpaddl.u16 mov ip, #7 - mov r2, #8 var16_loop: subs ip, ip, #1 vld1.64 {d16-d17}, [r0,:128], r1 @@ -554,8 +551,6 @@ function x264_var_end vpadd.u32 d0, d0, d2 vmov r0, r1, d0 - mul r0, r0, r0 - sub r0, r1, r0, lsr r2 bx lr .endfunc diff --git a/common/arm/pixel.h b/common/arm/pixel.h index 2ef5cea4..06835208 100644 --- a/common/arm/pixel.h +++ b/common/arm/pixel.h @@ -52,8 +52,8 @@ DECL_X1( ssd, neon ) int x264_pixel_sa8d_8x8_neon( uint8_t *, int, uint8_t *, int ); int x264_pixel_sa8d_16x16_neon( uint8_t *, int, uint8_t *, int ); -int x264_pixel_var_8x8_neon( uint8_t *, int ); -int x264_pixel_var_16x16_neon( uint8_t *, int ); +uint64_t x264_pixel_var_8x8_neon( uint8_t *, int ); +uint64_t x264_pixel_var_16x16_neon( uint8_t *, int ); int x264_pixel_var2_8x8_neon( uint8_t *, int, uint8_t *, int, int * ); uint64_t x264_pixel_hadamard_ac_8x8_neon( uint8_t *, int ); diff --git a/common/pixel.c b/common/pixel.c index 292cdf57..7c602371 100644 --- a/common/pixel.c +++ b/common/pixel.c @@ -142,10 +142,10 @@ int64_t x264_pixel_ssd_wxh( x264_pixel_function_t *pf, uint8_t *pix1, int i_pix1 /**************************************************************************** * pixel_var_wxh 
****************************************************************************/ -#define PIXEL_VAR_C( name, w, shift ) \ -static int name( uint8_t *pix, int i_stride ) \ +#define PIXEL_VAR_C( name, w ) \ +static uint64_t name( uint8_t *pix, int i_stride ) \ { \ - uint32_t var = 0, sum = 0, sqr = 0; \ + uint32_t sum = 0, sqr = 0; \ int x, y; \ for( y = 0; y < w; y++ ) \ { \ @@ -156,12 +156,11 @@ static int name( uint8_t *pix, int i_stride ) \ } \ pix += i_stride; \ } \ - var = sqr - (sum * sum >> shift); \ - return var; \ + return sum + ((uint64_t)sqr << 32); \ } -PIXEL_VAR_C( x264_pixel_var_16x16, 16, 8 ) -PIXEL_VAR_C( x264_pixel_var_8x8, 8, 6 ) +PIXEL_VAR_C( x264_pixel_var_16x16, 16 ) +PIXEL_VAR_C( x264_pixel_var_8x8, 8 ) /**************************************************************************** * pixel_var2_wxh diff --git a/common/pixel.h b/common/pixel.h index 53f99566..11026422 100644 --- a/common/pixel.h +++ b/common/pixel.h @@ -75,7 +75,7 @@ typedef struct x264_pixel_cmp_t sad_aligned[7]; /* Aligned SAD for mbcmp */ int (*var2_8x8)( uint8_t *, int, uint8_t *, int, int * ); - int (*var[4])( uint8_t *pix, int stride ); + uint64_t (*var[4])( uint8_t *pix, int stride ); uint64_t (*hadamard_ac[4])( uint8_t *pix, int stride ); void (*ssim_4x4x2_core)( const uint8_t *pix1, int stride1, diff --git a/common/ppc/pixel.c b/common/ppc/pixel.c index 844d7f4f..64d4c493 100644 --- a/common/ppc/pixel.c +++ b/common/ppc/pixel.c @@ -1636,7 +1636,7 @@ static int pixel_ssd_8x8_altivec ( uint8_t *pix1, int i_stride_pix1, /**************************************************************************** * variance ****************************************************************************/ -static int x264_pixel_var_16x16_altivec( uint8_t *pix, int i_stride ) +static uint64_t x264_pixel_var_16x16_altivec( uint8_t *pix, int i_stride ) { ALIGNED_16(uint32_t sum_tab[4]); ALIGNED_16(uint32_t sqr_tab[4]); @@ -1661,11 +1661,10 @@ static int x264_pixel_var_16x16_altivec( uint8_t *pix, int 
i_stride ) uint32_t sum = sum_tab[3]; uint32_t sqr = sqr_tab[3]; - uint32_t var = sqr - (sum * sum >> 8); - return var; + return sum + ((uint64_t)sqr<<32); } -static int x264_pixel_var_8x8_altivec( uint8_t *pix, int i_stride ) +static uint64_t x264_pixel_var_8x8_altivec( uint8_t *pix, int i_stride ) { ALIGNED_16(uint32_t sum_tab[4]); ALIGNED_16(uint32_t sqr_tab[4]); @@ -1700,8 +1699,7 @@ static int x264_pixel_var_8x8_altivec( uint8_t *pix, int i_stride ) uint32_t sum = sum_tab[3]; uint32_t sqr = sqr_tab[3]; - uint32_t var = sqr - (sum * sum >> 6); - return var; + return sum + ((uint64_t)sqr<<32); } diff --git a/common/x86/pixel-a.asm b/common/x86/pixel-a.asm index 0f6ed6c1..72ecad78 100644 --- a/common/x86/pixel-a.asm +++ b/common/x86/pixel-a.asm @@ -316,14 +316,15 @@ SSD 4, 8, ssse3 %endif %endmacro -%macro VAR_END 1 +%macro VAR_END 0 HADDW m5, m7 - movd r1d, m5 - imul r1d, r1d + movd eax, m5 HADDD m6, m1 - shr r1d, %1 - movd eax, m6 - sub eax, r1d ; sqr - (sum * sum >> shift) + movd edx, m6 +%ifdef ARCH_X86_64 + shl rdx, 32 + add rax, rdx +%endif RET %endmacro @@ -370,12 +371,12 @@ INIT_MMX cglobal x264_pixel_var_16x16_mmxext, 2,3 VAR_START 0 VAR_2ROW 8, 16 - VAR_END 8 + VAR_END cglobal x264_pixel_var_8x8_mmxext, 2,3 VAR_START 0 VAR_2ROW r1, 4 - VAR_END 6 + VAR_END INIT_XMM cglobal x264_pixel_var_16x16_sse2, 2,3,8 @@ -389,7 +390,7 @@ cglobal x264_pixel_var_16x16_sse2, 2,3,8 VAR_CORE dec r2d jg .loop - VAR_END 8 + VAR_END cglobal x264_pixel_var_8x8_sse2, 2,4,8 VAR_START 1 @@ -405,7 +406,7 @@ cglobal x264_pixel_var_8x8_sse2, 2,4,8 VAR_CORE dec r2d jg .loop - VAR_END 6 + VAR_END %macro VAR2_END 0 HADDW m5, m7 diff --git a/common/x86/pixel.h b/common/x86/pixel.h index b1e22cee..9bba6830 100644 --- a/common/x86/pixel.h +++ b/common/x86/pixel.h @@ -69,8 +69,8 @@ DECL_X4( sad, cache64_mmxext ); DECL_X4( sad, cache64_sse2 ); DECL_X4( sad, cache64_ssse3 ); -DECL_PIXELS( int, var, mmxext, ( uint8_t *pix, int i_stride )) -DECL_PIXELS( int, var, sse2, ( uint8_t *pix, int 
i_stride )) +DECL_PIXELS( uint64_t, var, mmxext, ( uint8_t *pix, int i_stride )) +DECL_PIXELS( uint64_t, var, sse2, ( uint8_t *pix, int i_stride )) DECL_PIXELS( uint64_t, hadamard_ac, mmxext, ( uint8_t *pix, int i_stride )) DECL_PIXELS( uint64_t, hadamard_ac, sse2, ( uint8_t *pix, int i_stride )) DECL_PIXELS( uint64_t, hadamard_ac, ssse3, ( uint8_t *pix, int i_stride )) diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c index 912ba541..7120ee57 100644 --- a/encoder/ratecontrol.c +++ b/encoder/ratecontrol.c @@ -179,6 +179,22 @@ static inline double qscale2bits(ratecontrol_entry_t *rce, double qscale) + rce->misc_bits; } +static ALWAYS_INLINE uint32_t ac_energy_plane( x264_t *h, int mb_x, int mb_y, x264_frame_t *frame, int i ) +{ + int w = i ? 8 : 16; + int shift = i ? 6 : 8; + int stride = frame->i_stride[i]; + int offset = h->mb.b_interlaced + ? w * (mb_x + (mb_y&~1) * stride) + (mb_y&1) * stride + : w * (mb_x + mb_y * stride); + int pix = i ? PIXEL_8x8 : PIXEL_16x16; + stride <<= h->mb.b_interlaced; + uint64_t res = h->pixf.var[pix]( frame->plane[i] + offset, stride ); + uint32_t sum = (uint32_t)res; + uint32_t sqr = res >> 32; + return sqr - (sum * sum >> shift); +} + // Find the total AC energy of the block in all planes. static NOINLINE uint32_t ac_energy_mb( x264_t *h, int mb_x, int mb_y, x264_frame_t *frame ) { @@ -186,18 +202,9 @@ static NOINLINE uint32_t ac_energy_mb( x264_t *h, int mb_x, int mb_y, x264_frame * and putting it after floating point ops. As a result, we put the emms at the end of the * function and make sure that its always called before the float math. Noinline makes * sure no reordering goes on. */ - uint32_t var = 0, i; - for( i = 0; i < 3; i++ ) - { - int w = i ? 8 : 16; - int stride = frame->i_stride[i]; - int offset = h->mb.b_interlaced - ? w * (mb_x + (mb_y&~1) * stride) + (mb_y&1) * stride - : w * (mb_x + mb_y * stride); - int pix = i ? 
PIXEL_8x8 : PIXEL_16x16; - stride <<= h->mb.b_interlaced; - var += h->pixf.var[pix]( frame->plane[i]+offset, stride ); - } + uint32_t var = ac_energy_plane( h, mb_x, mb_y, frame, 0 ); + var += ac_energy_plane( h, mb_x, mb_y, frame, 1 ); + var += ac_energy_plane( h, mb_x, mb_y, frame, 2 ); x264_emms(); return var; } diff --git a/encoder/slicetype.c b/encoder/slicetype.c index c12e8739..2df7dee7 100644 --- a/encoder/slicetype.c +++ b/encoder/slicetype.c @@ -56,24 +56,23 @@ static void get_h264_weight( unsigned int weight_nonh264, int offset, x264_weigh } w->i_scale = X264_MIN( w->i_scale, 127 ); } -/* due to a GCC bug on some platforms (win32), flat[16] may not actually be aligned. */ -ALIGNED_16( static uint8_t flat[17] ) = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1}; static NOINLINE void weights_plane_analyse( x264_t *h, uint8_t *plane, int width, int height, int stride, unsigned int *sum, uint64_t *var ) { int x,y; - unsigned int sad = 0; + uint64_t sad = 0; uint64_t ssd = 0; uint8_t *p = plane; for( y = 0; y < height>>4; y++, p += stride*16 ) for( x = 0; x < width; x+=16 ) { - sad += h->pixf.sad_aligned[PIXEL_16x16]( p + x, stride, flat, 0 ); - ssd += h->pixf.ssd[PIXEL_16x16]( p + x, stride, flat, 0 ); + uint64_t res = h->pixf.var[PIXEL_16x16]( p + x, stride ); + sad += (uint32_t)res; + ssd += res >> 32; } *sum = sad; - *var = ssd - (uint64_t) sad * sad / (width * height); + *var = ssd - ((uint64_t)sad * sad + width * height / 2) / (width * height); x264_emms(); } @@ -126,24 +125,19 @@ static NOINLINE unsigned int x264_weight_cost( x264_t *h, x264_frame_t *fenc, ui int i_lines = fenc->i_lines_lowres; int i_width = fenc->i_width_lowres; uint8_t *fenc_plane = fenc->lowres[0]; - ALIGNED_ARRAY_16( uint8_t, buf, [8*8] ); + ALIGNED_8( uint8_t buf[8*8] ); int pixoff = 0; int i_mb = 0; if( w ) + { for( y = 0; y < i_lines; y += 8, pixoff = y*i_stride ) for( x = 0; x < i_width; x += 8, i_mb++, pixoff += 8) { w->weightfn[8>>2]( buf, 8, &src[pixoff], i_stride, w, 8 ); cost += 
X264_MIN( h->pixf.mbcmp[PIXEL_8x8]( buf, 8, &fenc_plane[pixoff], i_stride ), fenc->i_intra_cost[i_mb] ); } - else - for( y = 0; y < i_lines; y += 8, pixoff = y*i_stride ) - for( x = 0; x < i_width; x += 8, i_mb++, pixoff += 8 ) - cost += X264_MIN( h->pixf.mbcmp[PIXEL_8x8]( &src[pixoff], i_stride, &fenc_plane[pixoff], i_stride ), fenc->i_intra_cost[i_mb] ); - - if( w ) - { + /* Add cost of weights in the slice header. */ int numslices; if( h->param.i_slice_count ) numslices = h->param.i_slice_count; @@ -151,11 +145,15 @@ static NOINLINE unsigned int x264_weight_cost( x264_t *h, x264_frame_t *fenc, ui numslices = (h->sps->i_mb_width * h->sps->i_mb_height + h->param.i_slice_max_mbs-1) / h->param.i_slice_max_mbs; else numslices = 1; - // FIXME still need to calculate for --slice-max-size - // Multiply by 2 as there will be a duplicate. 10 bits added as if there is a weighted frame, then an additional duplicate is used. - // Since using lowres frames, assume lambda = 1. + /* FIXME: find a way to account for --slice-max-size? + * Multiply by 2 as there will be a duplicate. 10 bits added as if there is a weighted frame, then an additional duplicate is used. + * Since using lowres frames, assume lambda = 1. 
*/ cost += numslices * ( 10 + 2 * ( bs_size_ue( w[0].i_denom ) + bs_size_se( w[0].i_scale ) + bs_size_se( w[0].i_offset ) ) ); } + else + for( y = 0; y < i_lines; y += 8, pixoff = y*i_stride ) + for( x = 0; x < i_width; x += 8, i_mb++, pixoff += 8 ) + cost += X264_MIN( h->pixf.mbcmp[PIXEL_8x8]( &src[pixoff], i_stride, &fenc_plane[pixoff], i_stride ), fenc->i_intra_cost[i_mb] ); x264_emms(); return cost; } @@ -171,17 +169,16 @@ void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int int i_delta_index = fenc->i_frame - ref->i_frame - 1; /* epsilon is chosen to require at least a numerator of 127 (with denominator = 128) */ const float epsilon = 1.0/128.0; - float guess_scale; int found; x264_weight_t *weights = fenc->weight[0]; weights_plane_analyse( h, fenc->plane[0], fenc->i_width[0], fenc->i_lines[0], fenc->i_stride[0], &fenc_sum, &fenc_var ); - weights_plane_analyse( h, ref->plane[0], ref->i_width[0], ref->i_lines[0], ref->i_stride[0], &ref_sum, &ref_var ); + weights_plane_analyse( h, ref->plane[0], ref->i_width[0], ref->i_lines[0], ref->i_stride[0], &ref_sum, &ref_var ); fenc_var = round( sqrt( fenc_var ) ); - ref_var = round( sqrt( ref_var ) ); + ref_var = round( sqrt( ref_var ) ); fenc_mean = (float)fenc_sum / (fenc->i_lines[0] * fenc->i_width[0]); - ref_mean = (float)ref_sum / (fenc->i_lines[0] * fenc->i_width[0]); + ref_mean = (float) ref_sum / (fenc->i_lines[0] * fenc->i_width[0]); //early termination if( fabs( ref_mean - fenc_mean ) < 0.5 && fabsf( 1 - (float)fenc_var / ref_var ) < epsilon ) @@ -220,7 +217,7 @@ void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int x264_emms(); /* FIXME: More analysis can be done here on SAD vs. SATD termination. */ - /* 0.2% termination derived experimentally to avoid weird weights in frames that are mostly intra. */ + /* 0.2% termination derived experimentally to avoid weird weights in frames that are mostly intra. 
*/ if( !found || (minscale == 1<<mindenom && minoff == 0) || (float)minscore / origscore > 0.998 ) { SET_WEIGHT( weights[0], 0, 1, 0, 0 ); diff --git a/tools/checkasm.c b/tools/checkasm.c index 868c9c2b..d82a1304 100644 --- a/tools/checkasm.c +++ b/tools/checkasm.c @@ -344,16 +344,20 @@ static int check_pixel( int cpu_ref, int cpu_new ) #define TEST_PIXEL_VAR( i ) \ if( pixel_asm.var[i] != pixel_ref.var[i] ) \ { \ - int res_c, res_asm; \ set_func_name( "%s_%s", "var", pixel_names[i] ); \ used_asm = 1; \ - res_c = call_c( pixel_c.var[i], buf1, 16 ); \ - res_asm = call_a( pixel_asm.var[i], buf1, 16 ); \ + /* abi-check wrapper can't return uint64_t, so separate it from return value check */\ + call_c1( pixel_c.var[i], buf1, 16 ); \ + call_a1( pixel_asm.var[i], buf1, 16 ); \ + uint64_t res_c = pixel_c.var[i]( buf1, 16 ); \ + uint64_t res_asm = pixel_asm.var[i]( buf1, 16 ); \ if( res_c != res_asm ) \ { \ ok = 0; \ - fprintf( stderr, "var[%d]: %d != %d [FAILED]\n", i, res_c, res_asm ); \ + fprintf( stderr, "var[%d]: %d %d != %d %d [FAILED]\n", i, (int)res_c, (int)(res_c>>32), (int)res_asm, (int)(res_asm>>32) ); \ } \ + call_c2( pixel_c.var[i], buf1, 16 ); \ + call_a2( pixel_asm.var[i], buf1, 16 ); \ } ok = 1; used_asm = 0; @@ -386,6 +390,8 @@ static int check_pixel( int cpu_ref, int cpu_new ) for( j=0; j<32; j++ ) { uint8_t *pix = (j&16 ? buf1 : buf3) + (j&15)*256; + call_c1( pixel_c.hadamard_ac[i], buf1, 16 ); + call_a1( pixel_asm.hadamard_ac[i], buf1, 16 ); uint64_t rc = pixel_c.hadamard_ac[i]( pix, 16 ); uint64_t ra = pixel_asm.hadamard_ac[i]( pix, 16 ); if( rc != ra )