From: Henrik Gramner Date: Fri, 6 Apr 2012 22:40:09 +0000 (+0200) Subject: Faster chroma weight cost calculation X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=4442eaceb4992098e1e4e30aa13e70bb35d2cae6;p=libx264 Faster chroma weight cost calculation New assembly function with SSE2, SSSE3 and XOP implementations for calculating absolute sum of differences. --- diff --git a/common/pixel.c b/common/pixel.c index 03425b56..d6e684cc 100644 --- a/common/pixel.c +++ b/common/pixel.c @@ -703,6 +703,15 @@ int x264_field_vsad( x264_t *h, int mb_x, int mb_y ) return (score_field < score_frame); } +static int pixel_asd8( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, int height ) +{ + int sum = 0; + for( int y = 0; y < height; y++, pix1 += stride1, pix2 += stride2 ) + for( int x = 0; x < 8; x++ ) + sum += pix1[x] - pix2[x]; + return abs( sum ); +} + /**************************************************************************** * successive elimination ****************************************************************************/ @@ -814,6 +823,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) pixf->ssim_4x4x2_core = ssim_4x4x2_core; pixf->ssim_end4 = ssim_end4; pixf->vsad = pixel_vsad; + pixf->asd8 = pixel_asd8; pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4; pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4; @@ -888,6 +898,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) INIT4( hadamard_ac, _sse2 ); } pixf->vsad = x264_pixel_vsad_sse2; + pixf->asd8 = x264_pixel_asd8_sse2; pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_sse2; pixf->intra_sad_x3_8x8c = x264_intra_sad_x3_8x8c_sse2; pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_sse2; @@ -915,6 +926,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) INIT4( hadamard_ac, _ssse3 ); } pixf->vsad = x264_pixel_vsad_ssse3; + pixf->asd8 = x264_pixel_asd8_ssse3; pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_ssse3; pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3; pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_ssse3; @@ -951,6 +963,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) if( cpu&X264_CPU_XOP ) { pixf->vsad = x264_pixel_vsad_xop; + pixf->asd8 = x264_pixel_asd8_xop; } #endif // HAVE_MMX #else // !HIGH_BIT_DEPTH @@ -1035,6 +1048,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_sse2; pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_sse2; pixf->vsad = x264_pixel_vsad_sse2; + pixf->asd8 = x264_pixel_asd8_sse2; } if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_SSE2_IS_SLOW) ) @@ -1126,6 +1140,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) pixf->intra_sad_x3_8x8c = x264_intra_sad_x3_8x8c_ssse3; pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_ssse3; pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_ssse3; + pixf->asd8 = x264_pixel_asd8_ssse3; if( cpu&X264_CPU_CACHELINE_64 ) { INIT2( sad, _cache64_ssse3 ); diff --git a/common/pixel.h b/common/pixel.h index 50589137..8365b9e5 100644 --- a/common/pixel.h +++ b/common/pixel.h @@ -89,6 +89,7 @@ typedef struct x264_pixel_cmp_x4_t fpelcmp_x4[7]; x264_pixel_cmp_t sad_aligned[8]; /* Aligned SAD for mbcmp */ int (*vsad)( pixel *, intptr_t, int ); + int (*asd8)( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, int height ); uint64_t (*var[4])( pixel *pix, intptr_t stride ); int (*var2[4])( pixel *pix1, intptr_t stride1, diff --git a/common/x86/pixel-a.asm b/common/x86/pixel-a.asm index 06737ab1..676b8424 100644 --- a/common/x86/pixel-a.asm +++ b/common/x86/pixel-a.asm @@ -4001,6 +4001,73 @@ SSIM INIT_XMM avx SSIM +;----------------------------------------------------------------------------- +; int pixel_asd8( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, int height ); +;----------------------------------------------------------------------------- +%macro ASD8 0 +cglobal pixel_asd8, 5,5 + pxor m0, m0 + pxor m1, m1 +.loop: +%if HIGH_BIT_DEPTH + paddw m0, [r0] + paddw m1, [r2] + paddw m0, [r0+2*r1] + paddw m1, [r2+2*r3] + lea r0, [r0+4*r1] + paddw m0, [r0] + paddw m1, [r2+4*r3] + lea r2, [r2+4*r3] + paddw m0, [r0+2*r1] + paddw m1, [r2+2*r3] + lea r0, [r0+4*r1] + lea r2, [r2+4*r3] +%else + movq m2, [r0] + movq m3, [r2] + movhps m2, [r0+r1] + movhps m3, [r2+r3] + lea r0, [r0+2*r1] + psadbw m2, m1 + psadbw m3, m1 + movq m4, [r0] + movq m5, [r2+2*r3] + lea r2, [r2+2*r3] + movhps m4, [r0+r1] + movhps m5, [r2+r3] + lea r0, [r0+2*r1] + paddw m0, m2 + psubw m0, m3 + psadbw m4, m1 + psadbw m5, m1 + lea r2, [r2+2*r3] + paddw m0, m4 + psubw m0, m5 +%endif + sub r4d, 4 + jg .loop +%if HIGH_BIT_DEPTH + psubw m0, m1 + HADDW m0, m1 + ABSD m1, m0 +%else + movhlps m1, m0 + paddw m0, m1 + ABSW m1, m0 +%endif + movd eax, m1 + RET +%endmacro + +INIT_XMM sse2 +ASD8 +INIT_XMM ssse3 +ASD8 +%if HIGH_BIT_DEPTH +INIT_XMM xop +ASD8 +%endif + ;============================================================================= ; Successive Elimination ADS ;============================================================================= diff --git a/common/x86/pixel.h b/common/x86/pixel.h index eeea9c70..f1711d2f 100644 --- a/common/x86/pixel.h +++ b/common/x86/pixel.h @@ -159,6 +159,9 @@ int x264_pixel_vsad_mmx2 ( pixel *src, intptr_t stride, int height ); int x264_pixel_vsad_sse2 ( pixel *src, intptr_t stride, int height ); int x264_pixel_vsad_ssse3( pixel *src, intptr_t stride, int height ); int x264_pixel_vsad_xop ( pixel *src, intptr_t stride, int height ); +int x264_pixel_asd8_sse2 ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, int height ); +int x264_pixel_asd8_ssse3( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, int height ); +int x264_pixel_asd8_xop ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, int height ); #define DECL_ADS( size, suffix ) \ int x264_pixel_ads##size##_##suffix( int enc_dc[size], uint16_t *sums, int delta,\ diff --git a/encoder/slicetype.c b/encoder/slicetype.c index 0b20f66a..f1c207f3 100644 --- a/encoder/slicetype.c +++ b/encoder/slicetype.c @@ -220,15 +220,12 @@ static NOINLINE unsigned int x264_weight_cost_chroma( x264_t *h, x264_frame_t *f { unsigned int cost = 0; int i_stride = fenc->i_stride[1]; - int i_offset = i_stride / 2; int i_lines = fenc->i_lines[1]; int i_width = fenc->i_width[1]; - pixel *src = ref + i_offset; + pixel *src = ref + (i_stride >> 1); ALIGNED_ARRAY_16( pixel, buf, [8*16] ); int pixoff = 0; - int chromapix = h->luma2chroma_pixel[PIXEL_16x16]; int height = 16 >> CHROMA_V_SHIFT; - ALIGNED_16( static pixel flat[8] ) = {0}; if( w ) { for( int y = 0; y < i_lines; y += height, pixoff = y*i_stride ) @@ -239,19 +236,15 @@ static NOINLINE unsigned int x264_weight_cost_chroma( x264_t *h, x264_frame_t *f * But testing shows that for chroma the DC coefficient is by far the most * important part of the coding cost. Thus a more useful chroma weight is * obtained by comparing each block's DC coefficient instead of the actual - * pixels. - * - * FIXME: add a (faster) asm sum function to replace sad. */ - cost += abs( h->pixf.sad_aligned[chromapix]( buf, 8, flat, 0 ) - - h->pixf.sad_aligned[chromapix]( &src[pixoff], i_stride, flat, 0 ) ); + * pixels. */ + cost += h->pixf.asd8( buf, 8, &src[pixoff], i_stride, height ); } cost += x264_weight_slice_header_cost( h, w, 1 ); } else for( int y = 0; y < i_lines; y += height, pixoff = y*i_stride ) for( int x = 0; x < i_width; x += 8, pixoff += 8 ) - cost += abs( h->pixf.sad_aligned[chromapix]( &ref[pixoff], i_stride, flat, 0 ) - - h->pixf.sad_aligned[chromapix]( &src[pixoff], i_stride, flat, 0 ) ); + cost += h->pixf.asd8( &ref[pixoff], i_stride, &src[pixoff], i_stride, height ); x264_emms(); return cost; } diff --git a/tools/checkasm.c b/tools/checkasm.c index afeb66cb..abf581b5 100644 --- a/tools/checkasm.c +++ b/tools/checkasm.c @@ -475,6 +475,21 @@ static int check_pixel( int cpu_ref, int cpu_new ) } report( "pixel vsad :" ); + ok = 1; used_asm = 0; + if( pixel_asm.asd8 != pixel_ref.asd8 ) + { + set_func_name( "asd8" ); + used_asm = 1; + int res_c = call_c( pixel_c.asd8, pbuf1, (intptr_t)8, pbuf2, (intptr_t)8, 16 ); + int res_a = call_a( pixel_asm.asd8, pbuf1, (intptr_t)8, pbuf2, (intptr_t)8, 16 ); + if( res_c != res_a ) + { + ok = 0; + fprintf( stderr, "asd: %d != %d\n", res_c, res_a ); + } + } + report( "pixel asd :" ); + #define TEST_INTRA_X3( name, i8x8, ... ) \ if( pixel_asm.name && pixel_asm.name != pixel_ref.name ) \ { \