From: Loren Merritt
Date: Thu, 20 Mar 2008 20:00:08 +0000 (-0600)
Subject: new ssd_8x*_sse2
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=14b45a81c25be808e3da6d7b3e78051f6c5b5308;p=libx264

new ssd_8x*_sse2
align ssd_16x*_sse2
unroll ssd_4x*_mmx
---

diff --git a/common/pixel.c b/common/pixel.c
index ae908458..1d5567b6 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -99,14 +99,17 @@ int64_t x264_pixel_ssd_wxh( x264_pixel_function_t *pf, uint8_t *pix1, int i_pix1
 {
     int64_t i_ssd = 0;
     int x, y;
+    int align = !(((long)pix1 | (long)pix2 | i_pix1 | i_pix2) & 15);
 
 #define SSD(size) i_ssd += pf->ssd[size]( pix1 + y*i_pix1 + x, i_pix1, \
                                           pix2 + y*i_pix2 + x, i_pix2 );
     for( y = 0; y < i_height-15; y += 16 )
     {
-        for( x = 0; x < i_width-15; x += 16 )
-            SSD(PIXEL_16x16);
-        if( x < i_width-7 )
+        x = 0;
+        if( align )
+            for( ; x < i_width-15; x += 16 )
+                SSD(PIXEL_16x16);
+        for( ; x < i_width-7; x += 8 )
             SSD(PIXEL_8x16);
     }
     if( y < i_height-7 )
@@ -610,7 +613,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
     // these are faster on both Intel and AMD
     if( cpu&X264_CPU_SSE2 )
     {
-        INIT2( ssd, _sse2 );
+        INIT5( ssd, _sse2 );
         pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_sse2;
         pixf->ssim_end4 = x264_pixel_ssim_end4_sse2;
         pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_sse2;
diff --git a/common/pixel.h b/common/pixel.h
index fb5f99ec..d533620c 100644
--- a/common/pixel.h
+++ b/common/pixel.h
@@ -24,6 +24,8 @@
 #ifndef _PIXEL_H
 #define _PIXEL_H 1
 
+// SSD assumes all args aligned
+// other cmp functions assume first arg aligned
 typedef int  (*x264_pixel_cmp_t) ( uint8_t *, int, uint8_t *, int );
 typedef void (*x264_pixel_cmp_x3_t) ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int[3] );
 typedef void (*x264_pixel_cmp_x4_t) ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int[4] );
diff --git a/common/x86/pixel-a.asm b/common/x86/pixel-a.asm
index 557aeb8a..b4d06561 100644
--- a/common/x86/pixel-a.asm
+++ b/common/x86/pixel-a.asm
@@ -83,41 +83,64 @@ SECTION .text
     paddd   mm0, mm4
 %endmacro
 
-%macro SSD_INC_1x8P 0
+%macro SSD_INC_2x16P 0
+    SSD_INC_1x16P
+    SSD_INC_1x16P
+%endmacro
+
+%macro SSD_INC_2x8P 0
     movq    mm1, [r0]
     movq    mm2, [r2]
+    movq    mm3, [r0+r1]
+    movq    mm4, [r2+r3]
 
     movq    mm5, mm2
+    movq    mm6, mm4
     psubusb mm2, mm1
+    psubusb mm4, mm3
     psubusb mm1, mm5
-    por     mm1, mm2    ; mm1 = 8bit abs diff
+    psubusb mm3, mm6
+    por     mm1, mm2
+    por     mm3, mm4
 
     movq    mm2, mm1
+    movq    mm4, mm3
     punpcklbw mm1, mm7
-    punpckhbw mm2, mm7  ; (mm1,mm2) = 16bit abs diff
+    punpcklbw mm3, mm7
+    punpckhbw mm2, mm7
+    punpckhbw mm4, mm7
     pmaddwd mm1, mm1
     pmaddwd mm2, mm2
+    pmaddwd mm3, mm3
+    pmaddwd mm4, mm4
 
-    add     r0, r1
-    add     r2, r3
+    lea     r0, [r0+2*r1]
+    lea     r2, [r2+2*r3]
     paddd   mm0, mm1
     paddd   mm0, mm2
+    paddd   mm0, mm3
+    paddd   mm0, mm4
 %endmacro
 
-%macro SSD_INC_1x4P 0
-    movd    mm1, [r0]
-    movd    mm2, [r2]
-
-    movq    mm5, mm2
-    psubusb mm2, mm1
-    psubusb mm1, mm5
-    por     mm1, mm2
-    punpcklbw mm1, mm7
-    pmaddwd mm1, mm1
-
-    add     r0, r1
-    add     r2, r3
-    paddd   mm0, mm1
+%macro SSD_INC_2x4P 0
+    movd    mm1, [r0]
+    movd    mm2, [r2]
+    movd    mm3, [r0+r1]
+    movd    mm4, [r2+r3]
+
+    punpcklbw mm1, mm7
+    punpcklbw mm2, mm7
+    punpcklbw mm3, mm7
+    punpcklbw mm4, mm7
+    psubw   mm1, mm2
+    psubw   mm3, mm4
+    pmaddwd mm1, mm1
+    pmaddwd mm3, mm3
+
+    lea     r0, [r0+2*r1]
+    lea     r2, [r2+2*r3]
+    paddd   mm0, mm1
+    paddd   mm0, mm3
 %endmacro
 
 ;-----------------------------------------------------------------------------
@@ -127,8 +150,8 @@ SECTION .text
 cglobal x264_pixel_ssd_%1x%2_mmx, 4,4
     pxor    mm7, mm7    ; zero
     pxor    mm0, mm0    ; mm0 holds the sum
-%rep %2
-    SSD_INC_1x%1P
+%rep %2/2
+    SSD_INC_2x%1P
 %endrep
     movq    mm1, mm0
     psrlq   mm1, 32
@@ -146,10 +169,10 @@ SSD_MMX 4, 8
 SSD_MMX 4, 4
 
 %macro SSD_INC_2x16P_SSE2 0
-    movdqu  xmm1, [r0]
-    movdqu  xmm2, [r2]
-    movdqu  xmm3, [r0+r1]
-    movdqu  xmm4, [r2+r3]
+    movdqa  xmm1, [r0]
+    movdqa  xmm2, [r2]
+    movdqa  xmm3, [r0+r1]
+    movdqa  xmm4, [r2+r3]
 
     movdqa  xmm5, xmm1
     movdqa  xmm6, xmm3
@@ -180,6 +203,27 @@ SSD_MMX 4, 4
     paddd   xmm0, xmm3
 %endmacro
 
+%macro SSD_INC_2x8P_SSE2 0
+    movq    xmm1, [r0]
+    movq    xmm2, [r2]
+    movq    xmm3, [r0+r1]
+    movq    xmm4, [r2+r3]
+
+    punpcklbw xmm1, xmm7
+    punpcklbw xmm2, xmm7
+    punpcklbw xmm3, xmm7
+    punpcklbw xmm4, xmm7
+    psubw   xmm1, xmm2
+    psubw   xmm3, xmm4
+    pmaddwd xmm1, xmm1
+    pmaddwd xmm3, xmm3
+
+    lea     r0, [r0+r1*2]
+    lea     r2, [r2+r3*2]
+    paddd   xmm0, xmm1
+    paddd   xmm0, xmm3
+%endmacro
+
 ;-----------------------------------------------------------------------------
 ; int x264_pixel_ssd_16x16_sse2 (uint8_t *, int, uint8_t *, int )
 ;-----------------------------------------------------------------------------
@@ -188,7 +232,7 @@ cglobal x264_pixel_ssd_%1x%2_sse2, 4,4
     pxor    xmm7, xmm7
     pxor    xmm0, xmm0
 %rep %2/2
-    SSD_INC_2x16P_SSE2
+    SSD_INC_2x%1P_SSE2
 %endrep
     HADDD   xmm0, xmm1
     movd    eax, xmm0
@@ -197,6 +241,9 @@ cglobal x264_pixel_ssd_%1x%2_sse2, 4,4
 
 SSD_SSE2 16, 16
 SSD_SSE2 16, 8
+SSD_SSE2 8, 16
+SSD_SSE2 8, 8
+SSD_SSE2 8, 4
 
diff --git a/tools/checkasm.c b/tools/checkasm.c
index 2b947b09..f8f2e352 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -48,7 +48,7 @@ static int check_pixel( int cpu_ref, int cpu_new )
     x264_predict_4x4_init( 0, predict_4x4 );
     x264_predict_8x8_filter( buf2+40, edge, ALL_NEIGHBORS, ALL_NEIGHBORS );
 
-#define TEST_PIXEL( name ) \
+#define TEST_PIXEL( name, align ) \
     for( i = 0, ok = 1, used_asm = 0; i < 7; i++ ) \
     { \
         int res_c, res_asm; \
@@ -57,8 +57,8 @@ static int check_pixel( int cpu_ref, int cpu_new )
             for( j=0; j<64; j++ ) \
             { \
                 used_asm = 1; \
-                res_c   = call_c( pixel_c.name[i], buf1, 32, buf2+j, 16 ); \
-                res_asm = call_a( pixel_asm.name[i], buf1, 32, buf2+j, 16 ); \
+                res_c   = call_c( pixel_c.name[i], buf1, 32, buf2+j*!align, 16 ); \
+                res_asm = call_a( pixel_asm.name[i], buf1, 32, buf2+j*!align, 16 ); \
                 if( res_c != res_asm ) \
                 { \
                     ok = 0; \
@@ -70,10 +70,10 @@ static int check_pixel( int cpu_ref, int cpu_new )
     } \
     report( "pixel " #name " :" );
 
-    TEST_PIXEL( sad );
-    TEST_PIXEL( ssd );
-    TEST_PIXEL( satd );
-    TEST_PIXEL( sa8d );
+    TEST_PIXEL( sad, 0 );
+    TEST_PIXEL( ssd, 1 );
+    TEST_PIXEL( satd, 0 );
+    TEST_PIXEL( sa8d, 0 );
 
 #define TEST_PIXEL_X( N ) \
     for( i = 0, ok = 1, used_asm = 0; i < 7; i++ ) \
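
Note (not part of the patch): every x264_pixel_ssd_WxH kernel touched here
computes the same thing, a plain sum of squared differences over a WxH block
of 8-bit pixels. A minimal scalar sketch of that computation, for reference
only; the function name is illustrative, not an x264 API:

#include <stdint.h>

// Reference SSD over a w x h block: the value the MMX/SSE2 kernels above
// produce, and which x264_pixel_ssd_wxh accumulates tile by tile.
static int ssd_ref( uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2,
                    int w, int h )
{
    int i_sum = 0;
    int x, y;
    for( y = 0; y < h; y++ )
        for( x = 0; x < w; x++ )
        {
            int d = pix1[y*i_pix1 + x] - pix2[y*i_pix2 + x];
            i_sum += d * d;
        }
    return i_sum;
}

The SIMD macros are just unrolled forms of this: the 2x*P variants process
two rows per iteration, stepping both pointers by two strides with lea.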
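A second note, also not part of the patch: the align test added to
x264_pixel_ssd_wxh exists because the 16-wide SSE2 kernels now load with
movdqa, which faults on operands that are not 16-byte aligned. ORing both
base pointers and both strides merges their low bits, so one mask checks all
four values at once; if each is a multiple of 16, every row of both blocks
stays aligned. A sketch of the same test, with a hypothetical helper name:

#include <stdint.h>

// 1 iff both blocks are 16-byte aligned on every row: any misaligned
// pointer or stride leaves a bit set in the low four bits of the OR.
static int ssd_args_aligned( uint8_t *pix1, int i_pix1,
                             uint8_t *pix2, int i_pix2 )
{
    return !(((intptr_t)pix1 | (intptr_t)pix2 | i_pix1 | i_pix2) & 15);
}

This is also why checkasm now passes align=1 for ssd: buf2+j*!align keeps the
second argument at buf2 instead of sweeping the 64 misaligned offsets that
the unaligned metrics are still tested against.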