From: Loren Merritt Date: Mon, 22 Sep 2008 10:17:35 +0000 (-0600) Subject: avg_weight_ssse3 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=cb173c5044fcc4792b7978720884cea7aa2e3848;p=libx264 avg_weight_ssse3 --- diff --git a/common/macroblock.c b/common/macroblock.c index ea3cab66..843dd481 100644 --- a/common/macroblock.c +++ b/common/macroblock.c @@ -1501,7 +1501,12 @@ void x264_macroblock_bipred_init( x264_t *h ) if( h->param.analyse.b_weighted_bipred && dist_scale_factor >= -64 && dist_scale_factor <= 128 ) + { h->mb.bipred_weight[i_ref0][i_ref1] = 64 - dist_scale_factor; + // ssse3 implementation of biweight doesn't support the extrema. + // if we ever generate them, we'll have to drop that optimization. + assert( dist_scale_factor >= -63 && dist_scale_factor <= 127 ); + } else h->mb.bipred_weight[i_ref0][i_ref1] = 32; } diff --git a/common/x86/mc-a.asm b/common/x86/mc-a.asm index c6c81c2c..8fb1145c 100644 --- a/common/x86/mc-a.asm +++ b/common/x86/mc-a.asm @@ -47,8 +47,11 @@ SECTION .text %define t3 r3 %define t4 r4 %define t5 r5 - %macro AVG_START 1 - cglobal %1, 6,7 + %define t6d r10d + %define t7d r11d + %macro AVG_START 0 + PROLOGUE 6,7 + .height_loop: %endmacro %else %define t0 r1 @@ -57,14 +60,17 @@ SECTION .text %define t3 r4 %define t4 r5 %define t5 r6 - %macro AVG_START 1 - cglobal %1, 0,7 + %define t6d r1d + %define t7d r2d + %macro AVG_START 0 + PROLOGUE 0,7 mov t0, r0m mov t1, r1m mov t2, r2m mov t3, r3m mov t4, r4m mov t5, r5m + .height_loop: %endmacro %endif @@ -77,9 +83,9 @@ SECTION .text %endif %endmacro -%macro BIWEIGHT 3 - movh m0, %2 - movh m1, %3 +%macro BIWEIGHT_MMX 2 + movh m0, %1 + movh m1, %2 punpcklbw m0, m7 punpcklbw m1, m7 pmullw m0, m4 @@ -87,35 +93,72 @@ SECTION .text paddw m0, m1 paddw m0, m6 psraw m0, 6 - pmaxsw m0, m7 - packuswb m0, m0 - movh %1, m0 %endmacro -%macro BIWEIGHT_START 0 +%macro BIWEIGHT_START_MMX 0 movd m4, r6m SPLATW m4, m4 ; weight_dst mova m5, [pw_64 GLOBAL] - psubw m5, m4 ; weight_src + psubw m5, m4 ; weight_src mova m6, [pw_32 GLOBAL] ; rounding pxor m7, m7 -.height_loop: %endmacro -INIT_MMX +%macro BIWEIGHT_SSSE3 2 + movh m0, %1 + movh m1, %2 + punpcklbw m0, m1 + pmaddubsw m0, m5 + paddw m0, m6 + psraw m0, 6 +%endmacro + +%macro BIWEIGHT_START_SSSE3 0 + movzx t6d, byte r6m ; FIXME x86_64 + mov t7d, 64 + sub t7d, t6d + shl t7d, 8 + add t6d, t7d + movd m5, t6d + mova m6, [pw_32 GLOBAL] + SPLATW m5, m5 ; weight_dst,src +%endmacro + +%macro BIWEIGHT_ROW 4 + BIWEIGHT [%2], [%3] +%if %4==mmsize/2 + packuswb m0, m0 + movh [%1], m0 +%else + SWAP 0, 2 + BIWEIGHT [%2+mmsize/2], [%3+mmsize/2] + packuswb m2, m0 + mova [%1], m2 +%endif +%endmacro ;----------------------------------------------------------------------------- ; int x264_pixel_avg_weight_w16_mmxext( uint8_t *dst, int, uint8_t *src1, int, uint8_t *src2, int, int i_weight ) ;----------------------------------------------------------------------------- %macro AVG_WEIGHT 2 -AVG_START x264_pixel_avg_weight_w%2_%1 +cglobal x264_pixel_avg_weight_w%2_%1, 0,0 BIWEIGHT_START + AVG_START +%if %2==8 && mmsize==16 + BIWEIGHT [t2], [t4] + SWAP 0, 2 + BIWEIGHT [t2+t3], [t4+t5] + packuswb m2, m0 + movlps [t0], m2 + movhps [t0+t1], m2 +%else %assign x 0 -%rep %2*2/mmsize - BIWEIGHT [t0+x], [t2+x], [t4+x] - BIWEIGHT [t0+x+t1], [t2+x+t3], [t4+x+t5] -%assign x x+mmsize/2 +%rep 1+%2/(mmsize*2) + BIWEIGHT_ROW t0+x, t2+x, t4+x, %2 + BIWEIGHT_ROW t0+x+t1, t2+x+t3, t4+x+t5, %2 +%assign x x+mmsize %endrep +%endif lea t0, [t0+t1*2] lea t2, [t2+t3*2] lea t4, [t4+t5*2] @@ -124,12 +167,23 @@ AVG_START x264_pixel_avg_weight_w%2_%1 REP_RET %endmacro +%define BIWEIGHT BIWEIGHT_MMX +%define BIWEIGHT_START BIWEIGHT_START_MMX +INIT_MMX AVG_WEIGHT mmxext, 4 AVG_WEIGHT mmxext, 8 AVG_WEIGHT mmxext, 16 INIT_XMM +%define x264_pixel_avg_weight_w4_sse2 x264_pixel_avg_weight_w4_mmxext AVG_WEIGHT sse2, 8 AVG_WEIGHT sse2, 16 +%define BIWEIGHT BIWEIGHT_SSSE3 +%define BIWEIGHT_START BIWEIGHT_START_SSSE3 +INIT_MMX +AVG_WEIGHT ssse3, 4 +INIT_XMM +AVG_WEIGHT ssse3, 8 +AVG_WEIGHT ssse3, 16 @@ -145,7 +199,7 @@ AVG_WEIGHT sse2, 16 cglobal x264_pixel_avg_%1x%2_%3,0,0 mov eax, %2 cmp dword r6m, 32 - jne x264_pixel_avg_weight_w%1_mmxext + jne x264_pixel_avg_weight_w%1_%3 %if mmsize == 16 && %1 == 16 test dword r4m, 15 jz x264_pixel_avg_w%1_sse2 @@ -168,38 +222,31 @@ cglobal x264_pixel_avg_%1x%2_%3,0,0 REP_RET %endmacro -INIT_MMX - -AVG_START x264_pixel_avg_w4_mmxext -.height_loop: - movd mm0, [t2] - movd mm1, [t2+t3] - pavgb mm0, [t4] - pavgb mm1, [t4+t5] - movd [t0], mm0 - movd [t0+t1], mm1 -AVG_END +%macro AVG_FUNC 3 +cglobal %1 + AVG_START + %2 m0, [t2] + %2 m1, [t2+t3] + pavgb m0, [t4] + pavgb m1, [t4+t5] + %3 [t0], m0 + %3 [t0+t1], m1 + AVG_END +%endmacro +INIT_MMX +AVG_FUNC x264_pixel_avg_w4_mmxext, movd, movd AVGH 4, 8, mmxext AVGH 4, 4, mmxext AVGH 4, 2, mmxext -AVG_START x264_pixel_avg_w8_mmxext -.height_loop: - movq mm0, [t2] - movq mm1, [t2+t3] - pavgb mm0, [t4] - pavgb mm1, [t4+t5] - movq [t0], mm0 - movq [t0+t1], mm1 -AVG_END - +AVG_FUNC x264_pixel_avg_w8_mmxext, movq, movq AVGH 8, 16, mmxext AVGH 8, 8, mmxext AVGH 8, 4, mmxext -AVG_START x264_pixel_avg_w16_mmxext -.height_loop: +cglobal x264_pixel_avg_w16_mmxext + AVG_START movq mm0, [t2 ] movq mm1, [t2+8] movq mm2, [t2+t3 ] @@ -212,27 +259,27 @@ AVG_START x264_pixel_avg_w16_mmxext movq [t0+8], mm1 movq [t0+t1 ], mm2 movq [t0+t1+8], mm3 -AVG_END + AVG_END AVGH 16, 16, mmxext AVGH 16, 8, mmxext -AVG_START x264_pixel_avg_w16_sse2 -.height_loop: - movdqu xmm0, [t2] - movdqu xmm1, [t2+t3] - pavgb xmm0, [t4] - pavgb xmm1, [t4+t5] - movdqa [t0], xmm0 - movdqa [t0+t1], xmm1 -AVG_END - INIT_XMM +AVG_FUNC x264_pixel_avg_w16_sse2, movdqu, movdqa AVGH 16, 16, sse2 -AVGH 16, 8, sse2 -AVGH 8, 16, sse2 -AVGH 8, 8, sse2 -AVGH 8, 4, sse2 +AVGH 16, 8, sse2 +AVGH 8, 16, sse2 +AVGH 8, 8, sse2 +AVGH 8, 4, sse2 +AVGH 16, 16, ssse3 +AVGH 16, 8, ssse3 +AVGH 8, 16, ssse3 +AVGH 8, 8, ssse3 +AVGH 8, 4, ssse3 +INIT_MMX +AVGH 4, 8, ssse3 +AVGH 4, 4, ssse3 +AVGH 4, 2, ssse3 diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c index ec32ad8f..4942917f 100644 --- a/common/x86/mc-c.c +++ b/common/x86/mc-c.c @@ -29,20 +29,19 @@ #include "common/common.h" #include "mc.h" -/* NASM functions */ -extern void x264_pixel_avg_16x16_sse2( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ); -extern void x264_pixel_avg_16x8_sse2( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ); -extern void x264_pixel_avg_8x16_sse2( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ); -extern void x264_pixel_avg_8x8_sse2( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ); -extern void x264_pixel_avg_8x4_sse2( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ); -extern void x264_pixel_avg_16x16_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ); -extern void x264_pixel_avg_16x8_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ); -extern void x264_pixel_avg_8x16_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ); -extern void x264_pixel_avg_8x8_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ); -extern void x264_pixel_avg_8x4_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ); -extern void x264_pixel_avg_4x8_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ); -extern void x264_pixel_avg_4x4_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ); -extern void x264_pixel_avg_4x2_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ); +#define DECL_SUF( func, args )\ + void func##_mmxext args;\ + void func##_sse2 args;\ + void func##_ssse3 args; + +DECL_SUF( x264_pixel_avg_16x16, ( uint8_t *, int, uint8_t *, int, uint8_t *, int, int )) +DECL_SUF( x264_pixel_avg_16x8, ( uint8_t *, int, uint8_t *, int, uint8_t *, int, int )) +DECL_SUF( x264_pixel_avg_8x16, ( uint8_t *, int, uint8_t *, int, uint8_t *, int, int )) +DECL_SUF( x264_pixel_avg_8x8, ( uint8_t *, int, uint8_t *, int, uint8_t *, int, int )) +DECL_SUF( x264_pixel_avg_8x4, ( uint8_t *, int, uint8_t *, int, uint8_t *, int, int )) +DECL_SUF( x264_pixel_avg_4x8, ( uint8_t *, int, uint8_t *, int, uint8_t *, int, int )) +DECL_SUF( x264_pixel_avg_4x4, ( uint8_t *, int, uint8_t *, int, uint8_t *, int, int )) +DECL_SUF( x264_pixel_avg_4x2, ( uint8_t *, int, uint8_t *, int, uint8_t *, int, int )) extern void x264_mc_copy_w4_mmx( uint8_t *, int, uint8_t *, int, int ); extern void x264_mc_copy_w8_mmx( uint8_t *, int, uint8_t *, int, int ); extern void x264_mc_copy_w16_mmx( uint8_t *, int, uint8_t *, int, int ); @@ -310,6 +309,15 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf ) if( !(cpu&X264_CPU_SSSE3) ) return; + pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_ssse3; + pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_ssse3; + pf->avg[PIXEL_8x16] = x264_pixel_avg_8x16_ssse3; + pf->avg[PIXEL_8x8] = x264_pixel_avg_8x8_ssse3; + pf->avg[PIXEL_8x4] = x264_pixel_avg_8x4_ssse3; + pf->avg[PIXEL_4x8] = x264_pixel_avg_4x8_ssse3; + pf->avg[PIXEL_4x4] = x264_pixel_avg_4x4_ssse3; + pf->avg[PIXEL_4x2] = x264_pixel_avg_4x2_ssse3; + pf->hpel_filter = x264_hpel_filter_ssse3; pf->frame_init_lowres_core = x264_frame_init_lowres_core_ssse3; pf->mc_chroma = x264_mc_chroma_ssse3; diff --git a/tools/checkasm.c b/tools/checkasm.c index bb1cdc0c..5f5004a5 100644 --- a/tools/checkasm.c +++ b/tools/checkasm.c @@ -774,15 +774,15 @@ static int check_mc( int cpu_ref, int cpu_new ) #define MC_TEST_AVG( name, weight ) \ for( i = 0, ok = 1, used_asm = 0; i < 10; i++ ) \ { \ - memcpy( buf2, buf1, 1024 ); \ - memcpy( buf4, buf3, 1024 ); \ + memcpy( buf3, buf1+320, 320 ); \ + memcpy( buf4, buf1+320, 320 ); \ if( mc_a.name[i] != mc_ref.name[i] ) \ { \ set_func_name( "%s_%s", #name, pixel_names[i] );\ used_asm = 1; \ call_c1( mc_c.name[i], buf3, 16, buf2+1, 16, buf1+18, 16, weight ); \ call_a1( mc_a.name[i], buf4, 16, buf2+1, 16, buf1+18, 16, weight ); \ - if( memcmp( buf3, buf4, 1024 ) ) \ + if( memcmp( buf3, buf4, 320 ) ) \ { \ ok = 0; \ fprintf( stderr, #name "[%d]: [FAILED]\n", i ); \ @@ -792,7 +792,7 @@ static int check_mc( int cpu_ref, int cpu_new ) } \ } ok = 1; used_asm = 0; - for( w = -64; w <= 128 && ok; w++ ) + for( w = -63; w <= 127 && ok; w++ ) MC_TEST_AVG( avg, w ); report( "mc wpredb :" );