From 8850b6faaf55b83ed3aa86ff9fcb5e35c439b236 Mon Sep 17 00:00:00 2001
From: Loren Merritt
Date: Tue, 12 Sep 2006 22:21:23 +0000
Subject: [PATCH] faster ESA

git-svn-id: svn://svn.videolan.org/x264/trunk@562 df754926-b1dd-0310-bc7b-ec298dee348c
---
 common/amd64/pixel-a.asm |  84 ++++++++++++++++++++++++++++++++
 common/i386/pixel-a.asm  | 100 +++++++++++++++++++++++++++++++++++++++
 common/i386/pixel.h      |   7 +++
 common/pixel.c           |  45 ++++++++++++++++++
 common/pixel.h           |   5 ++
 encoder/me.c             |  80 ++++++++++++------------------
 tools/checkasm.c         |  21 +++++++-
 7 files changed, 292 insertions(+), 50 deletions(-)

diff --git a/common/amd64/pixel-a.asm b/common/amd64/pixel-a.asm
index f954a325..705596b7 100644
--- a/common/amd64/pixel-a.asm
+++ b/common/amd64/pixel-a.asm
@@ -456,6 +456,10 @@ cglobal x264_intra_satd_x3_4x4_mmxext
 cglobal x264_intra_satd_x3_8x8c_mmxext
 cglobal x264_intra_satd_x3_16x16_mmxext
 
+cglobal x264_pixel_ads4_mmxext
+cglobal x264_pixel_ads2_mmxext
+cglobal x264_pixel_ads1_mmxext
+
 
 %macro SAD_START 0
     pxor    mm0, mm0
@@ -1110,3 +1114,83 @@ x264_intra_satd_x3_8x8c_mmxext:
     movd    [parm3q+4], mm1 ; i8x8c_h satd
     movd    [parm3q+8], mm2 ; i8x8c_v satd
     ret
+
+
+
+;-----------------------------------------------------------------------------
+; void x264_pixel_ads4_mmxext( int enc_dc[4], uint16_t *sums, int delta,
+;                              uint16_t *res, int width )
+;-----------------------------------------------------------------------------
+ALIGN 16
+x264_pixel_ads4_mmxext:
+    movq    mm6, [parm1q]
+    movq    mm4, [parm1q+8]
+    pshufw  mm7, mm6, 0
+    pshufw  mm6, mm6, 0xAA
+    pshufw  mm5, mm4, 0
+    pshufw  mm4, mm4, 0xAA
+    shl     parm3q, 1
+.loop:
+    movq    mm0, [parm2q]
+    movq    mm1, [parm2q+16]
+    psubw   mm0, mm7
+    psubw   mm1, mm6
+    MMX_ABS mm0, mm2
+    MMX_ABS mm1, mm3
+    movq    mm2, [parm2q+parm3q]
+    movq    mm3, [parm2q+parm3q+16]
+    psubw   mm2, mm5
+    psubw   mm3, mm4
+    paddw   mm0, mm1
+    MMX_ABS mm2, mm1
+    MMX_ABS mm3, mm1
+    paddw   mm0, mm2
+    paddw   mm0, mm3
+    movq    [parm4q], mm0
+    add     parm2q, 8
+    add     parm4q, 8
+    sub     parm5d, 4
+    jg      .loop
+    nop
+    ret
+
+ALIGN 16
+x264_pixel_ads2_mmxext:
+    movq    mm6, [parm1q]
+    pshufw  mm7, mm6, 0
+    pshufw  mm6, mm6, 0xAA
+    shl     parm3q, 1
+.loop:
+    movq    mm0, [parm2q]
+    movq    mm1, [parm2q+parm3q]
+    psubw   mm0, mm7
+    psubw   mm1, mm6
+    MMX_ABS mm0, mm2
+    MMX_ABS mm1, mm3
+    paddw   mm0, mm1
+    movq    [parm4q], mm0
+    add     parm2q, 8
+    add     parm4q, 8
+    sub     parm5d, 4
+    jg      .loop
+    nop
+    ret
+
+ALIGN 16
+x264_pixel_ads1_mmxext:
+    pshufw  mm7, [parm1q], 0
+.loop:
+    movq    mm0, [parm2q]
+    movq    mm1, [parm2q+8]
+    psubw   mm0, mm7
+    psubw   mm1, mm7
+    MMX_ABS mm0, mm2
+    MMX_ABS mm1, mm3
+    movq    [parm4q], mm0
+    movq    [parm4q+8], mm1
+    add     parm2q, 16
+    add     parm4q, 16
+    sub     parm5d, 8
+    jg      .loop
+    nop
+    ret
diff --git a/common/i386/pixel-a.asm b/common/i386/pixel-a.asm
index 66ee5cd0..dad09d99 100644
--- a/common/i386/pixel-a.asm
+++ b/common/i386/pixel-a.asm
@@ -492,6 +492,10 @@ cglobal x264_intra_sa8d_x3_8x8_core_mmxext
 
 cglobal x264_pixel_ssim_4x4x2_core_mmxext
 
+cglobal x264_pixel_ads4_mmxext
+cglobal x264_pixel_ads2_mmxext
+cglobal x264_pixel_ads1_mmxext
+
 
 %macro SAD_START 0
     push    ebx
@@ -1635,3 +1639,99 @@ x264_pixel_ssim_4x4x2_core_mmxext:
     pop     ebx
     emms
     ret
+
+
+
+;-----------------------------------------------------------------------------
+; void x264_pixel_ads4_mmxext( int enc_dc[4], uint16_t *sums, int delta,
+;                              uint16_t *res, int width )
+;-----------------------------------------------------------------------------
+ALIGN 16
+x264_pixel_ads4_mmxext:
+    push    ebx
+    mov     eax, [esp+8]
+    movq    mm6, [eax]
+    movq    mm4, [eax+8]
+    pshufw  mm7, mm6, 0
+    pshufw  mm6, mm6, 0xAA
+    pshufw  mm5, mm4, 0
+    pshufw  mm4, mm4, 0xAA
+    mov     eax, [esp+12]
+    mov     ebx, [esp+16]
+    mov     ecx, [esp+20]
+    mov     edx, [esp+24]
+    shl     ebx, 1
+.loop:
+    movq    mm0, [eax]
+    movq    mm1, [eax+16]
+    psubw   mm0, mm7
+    psubw   mm1, mm6
+    MMX_ABS mm0, mm2
+    MMX_ABS mm1, mm3
+    movq    mm2, [eax+ebx]
+    movq    mm3, [eax+ebx+16]
+    psubw   mm2, mm5
+    psubw   mm3, mm4
+    paddw   mm0, mm1
+    MMX_ABS mm2, mm1
+    MMX_ABS mm3, mm1
+    paddw   mm0, mm2
+    paddw   mm0, mm3
+    movq    [ecx], mm0
+    add     eax, 8
+    add     ecx, 8
+    sub     edx, 4
+    jg      .loop
+    pop     ebx
+    ret
+
+ALIGN 16
+x264_pixel_ads2_mmxext:
+    push    ebx
+    mov     eax, [esp+8]
+    movq    mm6, [eax]
+    pshufw  mm7, mm6, 0
+    pshufw  mm6, mm6, 0xAA
+    mov     eax, [esp+12]
+    mov     ebx, [esp+16]
+    mov     ecx, [esp+20]
+    mov     edx, [esp+24]
+    shl     ebx, 1
+.loop:
+    movq    mm0, [eax]
+    movq    mm1, [eax+ebx]
+    psubw   mm0, mm7
+    psubw   mm1, mm6
+    MMX_ABS mm0, mm2
+    MMX_ABS mm1, mm3
+    paddw   mm0, mm1
+    movq    [ecx], mm0
+    add     eax, 8
+    add     ecx, 8
+    sub     edx, 4
+    jg      .loop
+    pop     ebx
+    ret
+
+ALIGN 16
+x264_pixel_ads1_mmxext:
+    mov     eax, [esp+4]
+    pshufw  mm7, [eax], 0
+    mov     eax, [esp+8]
+    mov     ecx, [esp+16]
+    mov     edx, [esp+20]
+.loop:
+    movq    mm0, [eax]
+    movq    mm1, [eax+8]
+    psubw   mm0, mm7
+    psubw   mm1, mm7
+    MMX_ABS mm0, mm2
+    MMX_ABS mm1, mm3
+    movq    [ecx], mm0
+    movq    [ecx+8], mm1
+    add     eax, 16
+    add     ecx, 16
+    sub     edx, 8
+    jg      .loop
+    nop
+    ret
diff --git a/common/i386/pixel.h b/common/i386/pixel.h
index f33b22d7..fb06cccf 100644
--- a/common/i386/pixel.h
+++ b/common/i386/pixel.h
@@ -104,4 +104,11 @@ void x264_pixel_ssim_4x4x2_core_sse2( const uint8_t *pix1, int stride1,
                                       const uint8_t *pix2, int stride2, int sums[2][4] );
 float x264_pixel_ssim_end4_sse2( int sum0[5][4], int sum1[5][4], int width );
 
+void x264_pixel_ads4_mmxext( int enc_dc[4], uint16_t *sums, int delta,
+                             uint16_t *res, int width );
+void x264_pixel_ads2_mmxext( int enc_dc[2], uint16_t *sums, int delta,
+                             uint16_t *res, int width );
+void x264_pixel_ads1_mmxext( int enc_dc[1], uint16_t *sums, int delta,
+                             uint16_t *res, int width );
+
 #endif
diff --git a/common/pixel.c b/common/pixel.c
index aabc69c3..5ab6f726 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -399,6 +399,38 @@ float x264_pixel_ssim_wxh( x264_pixel_function_t *pf,
 }
 
+/****************************************************************************
+ * successive elimination
+ ****************************************************************************/
+static void pixel_ads4( int enc_dc[4], uint16_t *sums, int delta,
+                        uint16_t *res, int width )
+{
+    int i;
+    for( i=0; i<width; i++, sums++ )
+        res[i] = abs( enc_dc[0] - sums[0] )
+               + abs( enc_dc[1] - sums[8] )
+               + abs( enc_dc[2] - sums[delta] )
+               + abs( enc_dc[3] - sums[delta+8] );
+}
+
+static void pixel_ads2( int enc_dc[2], uint16_t *sums, int delta,
+                        uint16_t *res, int width )
+{
+    int i;
+    for( i=0; i<width; i++, sums++ )
+        res[i] = abs( enc_dc[0] - sums[0] )
+               + abs( enc_dc[1] - sums[delta] );
+}
+
+static void pixel_ads1( int enc_dc[1], uint16_t *sums, int delta,
+                        uint16_t *res, int width )
+{
+    int i;
+    for( i=0; i<width; i++ )
+        res[i] = abs( enc_dc[0] - sums[i] );
+}
+
+
 /****************************************************************************
  * x264_pixel_init:
  ****************************************************************************/
@@ -434,6 +466,10 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
     pixf->ssim_4x4x2_core = ssim_4x4x2_core;
     pixf->ssim_end4 = ssim_end4;
 
+    pixf->ads[PIXEL_16x16] = pixel_ads4;
+    pixf->ads[PIXEL_16x8] = pixel_ads2;
+    pixf->ads[PIXEL_8x8] = pixel_ads1;
+
 #ifdef HAVE_MMXEXT
     if( cpu&X264_CPU_MMX )
     {
@@ -445,6 +481,10 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
     pixf->sad_pde[PIXEL_16x8 ] = x264_pixel_sad_pde_16x8_mmxext;
     pixf->sad_pde[PIXEL_8x16 ] = x264_pixel_sad_pde_8x16_mmxext;
 
+    pixf->ads[PIXEL_16x16] = x264_pixel_ads4_mmxext;
+    pixf->ads[PIXEL_16x8 ] = x264_pixel_ads2_mmxext;
+    pixf->ads[PIXEL_8x8 ] = x264_pixel_ads1_mmxext;
+
 #ifdef ARCH_X86
     pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_mmxext;
     pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_mmxext;
@@ -516,5 +556,10 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
     pixf->sad_x4[PIXEL_16x8] = x264_pixel_sad_x4_16x8_vis;
     pixf->sad_x4[PIXEL_16x16] = x264_pixel_sad_x4_16x16_vis;
 #endif
+
+    pixf->ads[PIXEL_8x16] =
+    pixf->ads[PIXEL_8x4] =
+    pixf->ads[PIXEL_4x8] = pixf->ads[PIXEL_16x8];
+    pixf->ads[PIXEL_4x4] = pixf->ads[PIXEL_8x8];
 }
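
Aside (not part of the patch): the C fallback above can be exercised on its own. The harness below is a sketch under assumed conditions -- the sums-plane layout, DELTA, and the sample values are all made up -- meant only to show how pixel_ads4 indexes sums[0], sums[8], sums[delta], and sums[delta+8] as the DC sums of the four 8x8 quadrants of a 16x16 block, one candidate position per array element.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Same body as the C fallback pixel_ads4 above. */
static void pixel_ads4( int enc_dc[4], uint16_t *sums, int delta,
                        uint16_t *res, int width )
{
    int i;
    for( i=0; i<width; i++, sums++ )
        res[i] = abs( enc_dc[0] - sums[0] )
               + abs( enc_dc[1] - sums[8] )
               + abs( enc_dc[2] - sums[delta] )
               + abs( enc_dc[3] - sums[delta+8] );
}

int main( void )
{
    /* Two rows of fake 8x8 DC sums, DELTA entries apart; in x264 the
     * real buffer comes from m->integral, and for 16x16 blocks delta
     * is scaled by the stride so sums[delta] is the sum 8 rows down. */
    enum { DELTA = 32, WIDTH = 16 };
    uint16_t sums[2*DELTA];
    uint16_t res[WIDTH];
    int enc_dc[4] = { 1000, 1100, 1200, 1300 };
    int i;

    for( i = 0; i < 2*DELTA; i++ )
        sums[i] = (uint16_t)(1000 + 3*i);

    pixel_ads4( enc_dc, sums, DELTA, res, WIDTH );
    for( i = 0; i < WIDTH; i++ )
        printf( "mx=%2d ads=%u\n", i, res[i] );
    return 0;
}

The lower the returned value, the more promising the candidate; the me.c loop below only spends a full SAD on positions whose bound beats the current best cost.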
diff --git a/common/pixel.h b/common/pixel.h
index d6b014cf..c1d4fca1 100644
--- a/common/pixel.h
+++ b/common/pixel.h
@@ -85,6 +85,11 @@ typedef struct
     x264_pixel_cmp_x3_t sad_x3[7];
     x264_pixel_cmp_x4_t sad_x4[7];
 
+    /* abs-diff-sum for successive elimination.
+     * may round width up to a multiple of 8. */
+    void (*ads[7])( int enc_dc[4], uint16_t *sums, int delta,
+                    uint16_t *res, int width );
+
     /* calculate satd of V, H, and DC modes.
      * may be NULL, in which case just use pred+satd instead. */
     void (*intra_satd_x3_16x16)( uint8_t *fenc, uint8_t *fdec, int res[3] );
diff --git a/encoder/me.c b/encoder/me.c
index d113e0c7..035a58bb 100644
--- a/encoder/me.c
+++ b/encoder/me.c
@@ -462,64 +462,46 @@ me_hex2:
         /* successive elimination by comparing DC before a full SAD,
          * because sum(abs(diff)) >= abs(diff(sum)). */
         const int stride = m->i_stride[0];
-        const uint16_t *integral_base = m->integral;
         static uint8_t zero[16*16] = {0,};
+        uint16_t *sums_base = m->integral;
         int enc_dc[4];
         int sad_size = i_pixel <= PIXEL_8x8 ? PIXEL_8x8 : PIXEL_4x4;
-        int sad_w = x264_pixel_size[sad_size].w;
-        h->pixf.sad_x4[sad_size]( zero, m->p_fenc[0], m->p_fenc[0]+sad_w,
-                m->p_fenc[0]+sad_w*FENC_STRIDE, m->p_fenc[0]+sad_w+sad_w*FENC_STRIDE,
+        int delta = x264_pixel_size[sad_size].w;
+        uint16_t *ads = alloca((max_x-min_x+8) * sizeof(uint16_t));
+
+        h->pixf.sad_x4[sad_size]( zero, m->p_fenc[0], m->p_fenc[0]+delta,
+                m->p_fenc[0]+delta*FENC_STRIDE, m->p_fenc[0]+delta+delta*FENC_STRIDE,
                 FENC_STRIDE, enc_dc );
-        if( sad_w == 4 )
-            integral_base += stride * (h->fenc->i_lines[0] + 64);
-
-#define ESA(ADS) \
-    for( my = min_y; my <= max_y; my++ )\
-    {\
-        int mvs[3], i_mvs=0;\
-        bcost -= p_cost_mvy[my<<2];\
-        for( mx = min_x; mx <= max_x; mx++ )\
-        {\
-            const uint16_t *integral = &integral_base[ mx + my * stride ];\
-            if( ADS < bcost - p_cost_mvx[mx<<2] )\
-            {\
-                if( i_mvs == 3 )\
-                {\
-                    COST_MV_X4_ABS( mvs[0],my, mvs[1],my, mvs[2],my, mx,my );\
-                    i_mvs = 0;\
-                }\
-                else\
-                    mvs[i_mvs++] = mx;\
-            }\
-        }\
-        bcost += p_cost_mvy[my<<2];\
-        for( i=0; i<i_mvs; i++ )\
-            COST_MV( mvs[i], my );\
-    }
-
+        if( delta == 4 )
+            sums_base += stride * (h->fenc->i_lines[0] + 64);
+        if( i_pixel == PIXEL_16x16 || i_pixel == PIXEL_8x16 || i_pixel == PIXEL_4x8 )
+            delta *= stride;
+        if( i_pixel == PIXEL_8x16 || i_pixel == PIXEL_4x8 )
+            enc_dc[1] = enc_dc[2];
 
-        if( i_pixel == PIXEL_16x16 )
-        {
-            ESA( abs( enc_dc[0] - integral[0] )
-               + abs( enc_dc[1] - integral[8] )
-               + abs( enc_dc[2] - integral[8*stride] )
-               + abs( enc_dc[3] - integral[8*stride+8] ) );
-        }
-        else if( i_pixel == PIXEL_8x8 || i_pixel == PIXEL_4x4 )
-        {
-            ESA( abs( enc_dc[0] - integral[0] ) );
-        }
-        else
+        for( my = min_y; my <= max_y; my++ )
         {
-            int dw = i_pixel < PIXEL_8x8 ? 8 : 4;
-            if( i_pixel == PIXEL_8x16 || i_pixel == PIXEL_4x8 )
+            int mvs[3], i_mvs=0;
+            bcost -= p_cost_mvy[my<<2];
+            h->pixf.ads[i_pixel]( enc_dc, sums_base + min_x + my * stride, delta,
+                                  ads, max_x-min_x+1 );
+            for( mx = min_x; mx <= max_x; mx++ )
             {
-                dw *= stride;
-                enc_dc[1] = enc_dc[2];
+                if( ads[mx-min_x] < bcost - p_cost_mvx[mx<<2] )
+                {
+                    if( i_mvs == 3 )
+                    {
+                        COST_MV_X4_ABS( mvs[0],my, mvs[1],my, mvs[2],my, mx,my );
+                        i_mvs = 0;
+                    }
+                    else
+                        mvs[i_mvs++] = mx;
+                }
             }
-            ESA( abs( enc_dc[0] - integral[0] )
-               + abs( enc_dc[1] - integral[dw] ) );
+            bcost += p_cost_mvy[my<<2];
+            for( i=0; i<i_mvs; i++ )
+                COST_MV( mvs[i], my );
         }
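
Aside (not part of the patch): the comment in me.c states the invariant that makes this pruning safe, sum(abs(diff)) >= abs(diff(sum)), which is the triangle inequality. A minimal self-contained check with made-up block data:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* DC sum of an n-pixel block. */
static int block_dc( const uint8_t *p, int n )
{
    int i, s = 0;
    for( i = 0; i < n; i++ )
        s += p[i];
    return s;
}

/* True sum of absolute differences. */
static int block_sad( const uint8_t *a, const uint8_t *b, int n )
{
    int i, s = 0;
    for( i = 0; i < n; i++ )
        s += abs( a[i] - b[i] );
    return s;
}

int main( void )
{
    /* Toy 4x4 blocks; the values are arbitrary. */
    uint8_t enc[16] = { 10, 20, 30, 40, 50, 60, 70, 80,
                        90, 80, 70, 60, 50, 40, 30, 20 };
    uint8_t ref[16] = { 12, 18, 33, 38, 55, 54, 77, 75,
                        85, 88, 66, 64, 45, 47, 28, 25 };
    /* |dc(enc) - dc(ref)| is what the ads kernels compute per candidate;
     * by the triangle inequality it can never exceed the true SAD. */
    int bound = abs( block_dc( enc, 16 ) - block_dc( ref, 16 ) );
    int sad   = block_sad( enc, ref, 16 );
    printf( "lower bound %d <= SAD %d\n", bound, sad );
    return 0;
}

Because the bound never exceeds the true SAD, skipping a candidate whose ads value already fails the bcost test can never discard the best match, so the speedup is lossless.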