From: Fiona Glaser Date: Mon, 16 Nov 2009 23:23:58 +0000 (-0800) Subject: Faster weightp analysis X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=63f7147714b37f1779dcf62138f21771368cb8e8;p=libx264 Faster weightp analysis Modify pixel_var slightly to return the necessary information and use it for weight analysis instead of sad/ssd. Various minor cosmetics. --- diff --git a/common/arm/pixel-a.S b/common/arm/pixel-a.S index ca406acd..4dd65ede 100644 --- a/common/arm/pixel-a.S +++ b/common/arm/pixel-a.S @@ -512,8 +512,6 @@ function x264_pixel_var_8x8_neon VAR_SQR_SUM q1, q9, q14, d24 vld1.64 {d26}, [r0,:64], r1 VAR_SQR_SUM q2, q10, q15, d26 - - mov r2, #6 b x264_var_end .endfunc @@ -529,7 +527,6 @@ function x264_pixel_var_16x16_neon VAR_SQR_SUM q2, q13, q15, d19, vpaddl.u16 mov ip, #7 - mov r2, #8 var16_loop: subs ip, ip, #1 vld1.64 {d16-d17}, [r0,:128], r1 @@ -554,8 +551,6 @@ function x264_var_end vpadd.u32 d0, d0, d2 vmov r0, r1, d0 - mul r0, r0, r0 - sub r0, r1, r0, lsr r2 bx lr .endfunc diff --git a/common/arm/pixel.h b/common/arm/pixel.h index 2ef5cea4..06835208 100644 --- a/common/arm/pixel.h +++ b/common/arm/pixel.h @@ -52,8 +52,8 @@ DECL_X1( ssd, neon ) int x264_pixel_sa8d_8x8_neon( uint8_t *, int, uint8_t *, int ); int x264_pixel_sa8d_16x16_neon( uint8_t *, int, uint8_t *, int ); -int x264_pixel_var_8x8_neon( uint8_t *, int ); -int x264_pixel_var_16x16_neon( uint8_t *, int ); +uint64_t x264_pixel_var_8x8_neon( uint8_t *, int ); +uint64_t x264_pixel_var_16x16_neon( uint8_t *, int ); int x264_pixel_var2_8x8_neon( uint8_t *, int, uint8_t *, int, int * ); uint64_t x264_pixel_hadamard_ac_8x8_neon( uint8_t *, int ); diff --git a/common/pixel.c b/common/pixel.c index 292cdf57..7c602371 100644 --- a/common/pixel.c +++ b/common/pixel.c @@ -142,10 +142,10 @@ int64_t x264_pixel_ssd_wxh( x264_pixel_function_t *pf, uint8_t *pix1, int i_pix1 /**************************************************************************** * pixel_var_wxh 
****************************************************************************/ -#define PIXEL_VAR_C( name, w, shift ) \ -static int name( uint8_t *pix, int i_stride ) \ +#define PIXEL_VAR_C( name, w ) \ +static uint64_t name( uint8_t *pix, int i_stride ) \ { \ - uint32_t var = 0, sum = 0, sqr = 0; \ + uint32_t sum = 0, sqr = 0; \ int x, y; \ for( y = 0; y < w; y++ ) \ { \ @@ -156,12 +156,11 @@ static int name( uint8_t *pix, int i_stride ) \ } \ pix += i_stride; \ } \ - var = sqr - (sum * sum >> shift); \ - return var; \ + return sum + ((uint64_t)sqr << 32); \ } -PIXEL_VAR_C( x264_pixel_var_16x16, 16, 8 ) -PIXEL_VAR_C( x264_pixel_var_8x8, 8, 6 ) +PIXEL_VAR_C( x264_pixel_var_16x16, 16 ) +PIXEL_VAR_C( x264_pixel_var_8x8, 8 ) /**************************************************************************** * pixel_var2_wxh diff --git a/common/pixel.h b/common/pixel.h index 53f99566..11026422 100644 --- a/common/pixel.h +++ b/common/pixel.h @@ -75,7 +75,7 @@ typedef struct x264_pixel_cmp_t sad_aligned[7]; /* Aligned SAD for mbcmp */ int (*var2_8x8)( uint8_t *, int, uint8_t *, int, int * ); - int (*var[4])( uint8_t *pix, int stride ); + uint64_t (*var[4])( uint8_t *pix, int stride ); uint64_t (*hadamard_ac[4])( uint8_t *pix, int stride ); void (*ssim_4x4x2_core)( const uint8_t *pix1, int stride1, diff --git a/common/ppc/pixel.c b/common/ppc/pixel.c index 844d7f4f..64d4c493 100644 --- a/common/ppc/pixel.c +++ b/common/ppc/pixel.c @@ -1636,7 +1636,7 @@ static int pixel_ssd_8x8_altivec ( uint8_t *pix1, int i_stride_pix1, /**************************************************************************** * variance ****************************************************************************/ -static int x264_pixel_var_16x16_altivec( uint8_t *pix, int i_stride ) +static uint64_t x264_pixel_var_16x16_altivec( uint8_t *pix, int i_stride ) { ALIGNED_16(uint32_t sum_tab[4]); ALIGNED_16(uint32_t sqr_tab[4]); @@ -1661,11 +1661,10 @@ static int x264_pixel_var_16x16_altivec( uint8_t *pix, int 
i_stride ) uint32_t sum = sum_tab[3]; uint32_t sqr = sqr_tab[3]; - uint32_t var = sqr - (sum * sum >> 8); - return var; + return sum + ((uint64_t)sqr<<32); } -static int x264_pixel_var_8x8_altivec( uint8_t *pix, int i_stride ) +static uint64_t x264_pixel_var_8x8_altivec( uint8_t *pix, int i_stride ) { ALIGNED_16(uint32_t sum_tab[4]); ALIGNED_16(uint32_t sqr_tab[4]); @@ -1700,8 +1699,7 @@ static int x264_pixel_var_8x8_altivec( uint8_t *pix, int i_stride ) uint32_t sum = sum_tab[3]; uint32_t sqr = sqr_tab[3]; - uint32_t var = sqr - (sum * sum >> 6); - return var; + return sum + ((uint64_t)sqr<<32); } diff --git a/common/x86/pixel-a.asm b/common/x86/pixel-a.asm index 0f6ed6c1..72ecad78 100644 --- a/common/x86/pixel-a.asm +++ b/common/x86/pixel-a.asm @@ -316,14 +316,15 @@ SSD 4, 8, ssse3 %endif %endmacro -%macro VAR_END 1 +%macro VAR_END 0 HADDW m5, m7 - movd r1d, m5 - imul r1d, r1d + movd eax, m5 HADDD m6, m1 - shr r1d, %1 - movd eax, m6 - sub eax, r1d ; sqr - (sum * sum >> shift) + movd edx, m6 +%ifdef ARCH_X86_64 + shl rdx, 32 + add rax, rdx +%endif RET %endmacro @@ -370,12 +371,12 @@ INIT_MMX cglobal x264_pixel_var_16x16_mmxext, 2,3 VAR_START 0 VAR_2ROW 8, 16 - VAR_END 8 + VAR_END cglobal x264_pixel_var_8x8_mmxext, 2,3 VAR_START 0 VAR_2ROW r1, 4 - VAR_END 6 + VAR_END INIT_XMM cglobal x264_pixel_var_16x16_sse2, 2,3,8 @@ -389,7 +390,7 @@ cglobal x264_pixel_var_16x16_sse2, 2,3,8 VAR_CORE dec r2d jg .loop - VAR_END 8 + VAR_END cglobal x264_pixel_var_8x8_sse2, 2,4,8 VAR_START 1 @@ -405,7 +406,7 @@ cglobal x264_pixel_var_8x8_sse2, 2,4,8 VAR_CORE dec r2d jg .loop - VAR_END 6 + VAR_END %macro VAR2_END 0 HADDW m5, m7 diff --git a/common/x86/pixel.h b/common/x86/pixel.h index b1e22cee..9bba6830 100644 --- a/common/x86/pixel.h +++ b/common/x86/pixel.h @@ -69,8 +69,8 @@ DECL_X4( sad, cache64_mmxext ); DECL_X4( sad, cache64_sse2 ); DECL_X4( sad, cache64_ssse3 ); -DECL_PIXELS( int, var, mmxext, ( uint8_t *pix, int i_stride )) -DECL_PIXELS( int, var, sse2, ( uint8_t *pix, int 
i_stride )) +DECL_PIXELS( uint64_t, var, mmxext, ( uint8_t *pix, int i_stride )) +DECL_PIXELS( uint64_t, var, sse2, ( uint8_t *pix, int i_stride )) DECL_PIXELS( uint64_t, hadamard_ac, mmxext, ( uint8_t *pix, int i_stride )) DECL_PIXELS( uint64_t, hadamard_ac, sse2, ( uint8_t *pix, int i_stride )) DECL_PIXELS( uint64_t, hadamard_ac, ssse3, ( uint8_t *pix, int i_stride )) diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c index 912ba541..7120ee57 100644 --- a/encoder/ratecontrol.c +++ b/encoder/ratecontrol.c @@ -179,6 +179,22 @@ static inline double qscale2bits(ratecontrol_entry_t *rce, double qscale) + rce->misc_bits; } +static ALWAYS_INLINE uint32_t ac_energy_plane( x264_t *h, int mb_x, int mb_y, x264_frame_t *frame, int i ) +{ + int w = i ? 8 : 16; + int shift = i ? 6 : 8; + int stride = frame->i_stride[i]; + int offset = h->mb.b_interlaced + ? w * (mb_x + (mb_y&~1) * stride) + (mb_y&1) * stride + : w * (mb_x + mb_y * stride); + int pix = i ? PIXEL_8x8 : PIXEL_16x16; + stride <<= h->mb.b_interlaced; + uint64_t res = h->pixf.var[pix]( frame->plane[i] + offset, stride ); + uint32_t sum = (uint32_t)res; + uint32_t sqr = res >> 32; + return sqr - (sum * sum >> shift); +} + // Find the total AC energy of the block in all planes. static NOINLINE uint32_t ac_energy_mb( x264_t *h, int mb_x, int mb_y, x264_frame_t *frame ) { @@ -186,18 +202,9 @@ static NOINLINE uint32_t ac_energy_mb( x264_t *h, int mb_x, int mb_y, x264_frame * and putting it after floating point ops. As a result, we put the emms at the end of the * function and make sure that its always called before the float math. Noinline makes * sure no reordering goes on. */ - uint32_t var = 0, i; - for( i = 0; i < 3; i++ ) - { - int w = i ? 8 : 16; - int stride = frame->i_stride[i]; - int offset = h->mb.b_interlaced - ? w * (mb_x + (mb_y&~1) * stride) + (mb_y&1) * stride - : w * (mb_x + mb_y * stride); - int pix = i ? 
PIXEL_8x8 : PIXEL_16x16; - stride <<= h->mb.b_interlaced; - var += h->pixf.var[pix]( frame->plane[i]+offset, stride ); - } + uint32_t var = ac_energy_plane( h, mb_x, mb_y, frame, 0 ); + var += ac_energy_plane( h, mb_x, mb_y, frame, 1 ); + var += ac_energy_plane( h, mb_x, mb_y, frame, 2 ); x264_emms(); return var; } diff --git a/encoder/slicetype.c b/encoder/slicetype.c index c12e8739..2df7dee7 100644 --- a/encoder/slicetype.c +++ b/encoder/slicetype.c @@ -56,24 +56,23 @@ static void get_h264_weight( unsigned int weight_nonh264, int offset, x264_weigh } w->i_scale = X264_MIN( w->i_scale, 127 ); } -/* due to a GCC bug on some platforms (win32), flat[16] may not actually be aligned. */ -ALIGNED_16( static uint8_t flat[17] ) = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1}; static NOINLINE void weights_plane_analyse( x264_t *h, uint8_t *plane, int width, int height, int stride, unsigned int *sum, uint64_t *var ) { int x,y; - unsigned int sad = 0; + uint64_t sad = 0; uint64_t ssd = 0; uint8_t *p = plane; for( y = 0; y < height>>4; y++, p += stride*16 ) for( x = 0; x < width; x+=16 ) { - sad += h->pixf.sad_aligned[PIXEL_16x16]( p + x, stride, flat, 0 ); - ssd += h->pixf.ssd[PIXEL_16x16]( p + x, stride, flat, 0 ); + uint64_t res = h->pixf.var[PIXEL_16x16]( p + x, stride ); + sad += (uint32_t)res; + ssd += res >> 32; } *sum = sad; - *var = ssd - (uint64_t) sad * sad / (width * height); + *var = ssd - ((uint64_t)sad * sad + width * height / 2) / (width * height); x264_emms(); } @@ -126,24 +125,19 @@ static NOINLINE unsigned int x264_weight_cost( x264_t *h, x264_frame_t *fenc, ui int i_lines = fenc->i_lines_lowres; int i_width = fenc->i_width_lowres; uint8_t *fenc_plane = fenc->lowres[0]; - ALIGNED_ARRAY_16( uint8_t, buf, [8*8] ); + ALIGNED_8( uint8_t buf[8*8] ); int pixoff = 0; int i_mb = 0; if( w ) + { for( y = 0; y < i_lines; y += 8, pixoff = y*i_stride ) for( x = 0; x < i_width; x += 8, i_mb++, pixoff += 8) { w->weightfn[8>>2]( buf, 8, &src[pixoff], i_stride, w, 8 ); cost += 
X264_MIN( h->pixf.mbcmp[PIXEL_8x8]( buf, 8, &fenc_plane[pixoff], i_stride ), fenc->i_intra_cost[i_mb] ); } - else - for( y = 0; y < i_lines; y += 8, pixoff = y*i_stride ) - for( x = 0; x < i_width; x += 8, i_mb++, pixoff += 8 ) - cost += X264_MIN( h->pixf.mbcmp[PIXEL_8x8]( &src[pixoff], i_stride, &fenc_plane[pixoff], i_stride ), fenc->i_intra_cost[i_mb] ); - - if( w ) - { + /* Add cost of weights in the slice header. */ int numslices; if( h->param.i_slice_count ) numslices = h->param.i_slice_count; @@ -151,11 +145,15 @@ static NOINLINE unsigned int x264_weight_cost( x264_t *h, x264_frame_t *fenc, ui numslices = (h->sps->i_mb_width * h->sps->i_mb_height + h->param.i_slice_max_mbs-1) / h->param.i_slice_max_mbs; else numslices = 1; - // FIXME still need to calculate for --slice-max-size - // Multiply by 2 as there will be a duplicate. 10 bits added as if there is a weighted frame, then an additional duplicate is used. - // Since using lowres frames, assume lambda = 1. + /* FIXME: find a way to account for --slice-max-size? + * Multiply by 2 as there will be a duplicate. 10 bits added as if there is a weighted frame, then an additional duplicate is used. + * Since using lowres frames, assume lambda = 1. 
*/ cost += numslices * ( 10 + 2 * ( bs_size_ue( w[0].i_denom ) + bs_size_se( w[0].i_scale ) + bs_size_se( w[0].i_offset ) ) ); } + else + for( y = 0; y < i_lines; y += 8, pixoff = y*i_stride ) + for( x = 0; x < i_width; x += 8, i_mb++, pixoff += 8 ) + cost += X264_MIN( h->pixf.mbcmp[PIXEL_8x8]( &src[pixoff], i_stride, &fenc_plane[pixoff], i_stride ), fenc->i_intra_cost[i_mb] ); x264_emms(); return cost; } @@ -171,17 +169,16 @@ void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int int i_delta_index = fenc->i_frame - ref->i_frame - 1; /* epsilon is chosen to require at least a numerator of 127 (with denominator = 128) */ const float epsilon = 1.0/128.0; - float guess_scale; int found; x264_weight_t *weights = fenc->weight[0]; weights_plane_analyse( h, fenc->plane[0], fenc->i_width[0], fenc->i_lines[0], fenc->i_stride[0], &fenc_sum, &fenc_var ); - weights_plane_analyse( h, ref->plane[0], ref->i_width[0], ref->i_lines[0], ref->i_stride[0], &ref_sum, &ref_var ); + weights_plane_analyse( h, ref->plane[0], ref->i_width[0], ref->i_lines[0], ref->i_stride[0], &ref_sum, &ref_var ); fenc_var = round( sqrt( fenc_var ) ); - ref_var = round( sqrt( ref_var ) ); + ref_var = round( sqrt( ref_var ) ); fenc_mean = (float)fenc_sum / (fenc->i_lines[0] * fenc->i_width[0]); - ref_mean = (float)ref_sum / (fenc->i_lines[0] * fenc->i_width[0]); + ref_mean = (float) ref_sum / (fenc->i_lines[0] * fenc->i_width[0]); //early termination if( fabs( ref_mean - fenc_mean ) < 0.5 && fabsf( 1 - (float)fenc_var / ref_var ) < epsilon ) @@ -220,7 +217,7 @@ void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int x264_emms(); /* FIXME: More analysis can be done here on SAD vs. SATD termination. */ - /* 0.2% termination derived experimentally to avoid weird weights in frames that are mostly intra. */ + /* 0.2% termination derived experimentally to avoid weird weights in frames that are mostly intra. 
*/ if( !found || (minscale == 1<<mindenom && minoff == 0) || (float)minscore / origscore > 0.998 ) { SET_WEIGHT( weights[0], 0, 1, 0, 0 ); diff --git a/tools/checkasm.c b/tools/checkasm.c index 868c9c2b..d82a1304 100644 --- a/tools/checkasm.c +++ b/tools/checkasm.c @@ -344,16 +344,20 @@ static int check_pixel( int cpu_ref, int cpu_new ) #define TEST_PIXEL_VAR( i ) \ if( pixel_asm.var[i] != pixel_ref.var[i] ) \ { \ - int res_c, res_asm; \ set_func_name( "%s_%s", "var", pixel_names[i] ); \ used_asm = 1; \ - res_c = call_c( pixel_c.var[i], buf1, 16 ); \ - res_asm = call_a( pixel_asm.var[i], buf1, 16 ); \ + /* abi-check wrapper can't return uint64_t, so separate it from return value check */\ + call_c1( pixel_c.var[i], buf1, 16 ); \ + call_a1( pixel_asm.var[i], buf1, 16 ); \ + uint64_t res_c = pixel_c.var[i]( buf1, 16 ); \ + uint64_t res_asm = pixel_asm.var[i]( buf1, 16 ); \ if( res_c != res_asm ) \ { \ ok = 0; \ - fprintf( stderr, "var[%d]: %d != %d [FAILED]\n", i, res_c, res_asm ); \ + fprintf( stderr, "var[%d]: %d %d != %d %d [FAILED]\n", i, (int)res_c, (int)(res_c>>32), (int)res_asm, (int)(res_asm>>32) ); \ } \ + call_c2( pixel_c.var[i], buf1, 16 ); \ + call_a2( pixel_asm.var[i], buf1, 16 ); \ } ok = 1; used_asm = 0; @@ -386,6 +390,8 @@ static int check_pixel( int cpu_ref, int cpu_new ) for( j=0; j<32; j++ ) { uint8_t *pix = (j&16 ? buf1 : buf3) + (j&15)*256; + call_c1( pixel_c.hadamard_ac[i], buf1, 16 ); + call_a1( pixel_asm.hadamard_ac[i], buf1, 16 ); uint64_t rc = pixel_c.hadamard_ac[i]( pix, 16 ); uint64_t ra = pixel_asm.hadamard_ac[i]( pix, 16 ); if( rc != ra )