From 2bff50702978bf2af30ef2b58264bd71549bc702 Mon Sep 17 00:00:00 2001 From: Fiona Glaser Date: Wed, 3 Sep 2008 15:15:17 -0700 Subject: [PATCH] Add sad_aligned for faster subme=1 mbcmp Distinguish between unaligned and aligned uses of mbcmp SAD_aligned, for MMX SADs, uses non-cacheline SADs. --- common/pixel.c | 35 +++++++++++++++++++++-------------- common/pixel.h | 2 ++ common/x86/pixel.h | 1 + common/x86/sad-a.asm | 4 +++- encoder/analyse.c | 2 +- encoder/encoder.c | 3 ++- encoder/me.c | 4 ++-- tools/checkasm.c | 1 + 8 files changed, 33 insertions(+), 19 deletions(-) diff --git a/common/pixel.c b/common/pixel.c index 27575b5c..3e28d857 100644 --- a/common/pixel.c +++ b/common/pixel.c @@ -529,20 +529,24 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) { memset( pixf, 0, sizeof(*pixf) ); -#define INIT2( name, cpu ) \ - pixf->name[PIXEL_16x16] = x264_pixel_##name##_16x16##cpu;\ - pixf->name[PIXEL_16x8] = x264_pixel_##name##_16x8##cpu; -#define INIT4( name, cpu ) \ - INIT2( name, cpu ) \ - pixf->name[PIXEL_8x16] = x264_pixel_##name##_8x16##cpu;\ - pixf->name[PIXEL_8x8] = x264_pixel_##name##_8x8##cpu; -#define INIT5( name, cpu ) \ - INIT4( name, cpu ) \ - pixf->name[PIXEL_8x4] = x264_pixel_##name##_8x4##cpu; -#define INIT7( name, cpu ) \ - INIT5( name, cpu ) \ - pixf->name[PIXEL_4x8] = x264_pixel_##name##_4x8##cpu;\ - pixf->name[PIXEL_4x4] = x264_pixel_##name##_4x4##cpu; +#define INIT2_NAME( name1, name2, cpu ) \ + pixf->name1[PIXEL_16x16] = x264_pixel_##name2##_16x16##cpu;\ + pixf->name1[PIXEL_16x8] = x264_pixel_##name2##_16x8##cpu; +#define INIT4_NAME( name1, name2, cpu ) \ + INIT2_NAME( name1, name2, cpu ) \ + pixf->name1[PIXEL_8x16] = x264_pixel_##name2##_8x16##cpu;\ + pixf->name1[PIXEL_8x8] = x264_pixel_##name2##_8x8##cpu; +#define INIT5_NAME( name1, name2, cpu ) \ + INIT4_NAME( name1, name2, cpu ) \ + pixf->name1[PIXEL_8x4] = x264_pixel_##name2##_8x4##cpu; +#define INIT7_NAME( name1, name2, cpu ) \ + INIT5_NAME( name1, name2, cpu ) \ + pixf->name1[PIXEL_4x8] = x264_pixel_##name2##_4x8##cpu;\ + pixf->name1[PIXEL_4x4] = x264_pixel_##name2##_4x4##cpu; +#define INIT2( name, cpu ) INIT2_NAME( name, name, cpu ) +#define INIT4( name, cpu ) INIT4_NAME( name, name, cpu ) +#define INIT5( name, cpu ) INIT5_NAME( name, name, cpu ) +#define INIT7( name, cpu ) INIT7_NAME( name, name, cpu ) #define INIT_ADS( cpu ) \ pixf->ads[PIXEL_16x16] = x264_pixel_ads4##cpu;\ @@ -550,6 +554,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) pixf->ads[PIXEL_8x8] = x264_pixel_ads1##cpu; INIT7( sad, ); + INIT7_NAME( sad_aligned, sad, ); INIT7( sad_x3, ); INIT7( sad_x4, ); INIT7( ssd, ); @@ -574,6 +579,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) if( cpu&X264_CPU_MMXEXT ) { INIT7( sad, _mmxext ); + INIT7_NAME( sad_aligned, sad, _mmxext ); INIT7( sad_x3, _mmxext ); INIT7( sad_x4, _mmxext ); INIT7( satd, _mmxext ); @@ -640,6 +646,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) INIT5( satd, _sse2 ); INIT5( satd_x3, _sse2 ); INIT5( satd_x4, _sse2 ); + INIT2_NAME( sad_aligned, sad, _sse2_aligned ); pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_sse2; pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_sse2; pixf->ssim_end4 = x264_pixel_ssim_end4_sse2; diff --git a/common/pixel.h b/common/pixel.h index fd23680d..127f89dd 100644 --- a/common/pixel.h +++ b/common/pixel.h @@ -69,10 +69,12 @@ typedef struct x264_pixel_cmp_t ssim[7]; x264_pixel_cmp_t sa8d[4]; x264_pixel_cmp_t mbcmp[7]; /* either satd or sad for subpel refine and mode decision */ + x264_pixel_cmp_t mbcmp_unaligned[7]; /* unaligned mbcmp for subpel */ x264_pixel_cmp_t fpelcmp[7]; /* either satd or sad for fullpel motion search */ x264_pixel_cmp_x3_t fpelcmp_x3[7]; x264_pixel_cmp_x4_t fpelcmp_x4[7]; x264_pixel_var_t var[4]; + x264_pixel_cmp_t sad_aligned[7]; /* Aligned SAD for mbcmp */ void (*ssim_4x4x2_core)( const uint8_t *pix1, int stride1, const uint8_t *pix2, int stride2, int sums[2][4] ); diff --git a/common/x86/pixel.h b/common/x86/pixel.h index 9326a84a..cbb37dda 100644 --- a/common/x86/pixel.h +++ b/common/x86/pixel.h @@ -43,6 +43,7 @@ DECL_X1( sad, mmxext ) DECL_X1( sad, sse2 ) DECL_X1( sad, sse3 ) +DECL_X1( sad, sse2_aligned ) DECL_X4( sad, mmxext ) DECL_X4( sad, sse2 ) DECL_X4( sad, sse3 ) diff --git a/common/x86/sad-a.asm b/common/x86/sad-a.asm index a9545006..2c167221 100644 --- a/common/x86/sad-a.asm +++ b/common/x86/sad-a.asm @@ -215,7 +215,9 @@ cglobal x264_pixel_sad_16x8_%1, 4,4 SAD_W16 sse2 %define movdqu lddqu SAD_W16 sse3 -%undef movdqu +%define movdqu movdqa +SAD_W16 sse2_aligned +%define movdqu movups diff --git a/encoder/analyse.c b/encoder/analyse.c index cd7f745a..d9e77a32 100644 --- a/encoder/analyse.c +++ b/encoder/analyse.c @@ -588,7 +588,7 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_ if( flags & X264_ANALYSE_I8x8 ) { DECLARE_ALIGNED_16( uint8_t edge[33] ); - x264_pixel_cmp_t sa8d = (*h->pixf.mbcmp == *h->pixf.sad) ? h->pixf.sad[PIXEL_8x8] : h->pixf.sa8d[PIXEL_8x8]; + x264_pixel_cmp_t sa8d = (h->pixf.mbcmp[0] == h->pixf.satd[0]) ? h->pixf.sa8d[PIXEL_8x8] : h->pixf.mbcmp[PIXEL_8x8]; int i_satd_thresh = a->b_mbrd ? COST_MAX : X264_MIN( i_satd_inter, a->i_satd_i16x16 ); int i_cost = 0; b_merged_satd = h->pixf.intra_sa8d_x3_8x8 && h->pixf.mbcmp[0] == h->pixf.satd[0]; diff --git a/encoder/encoder.c b/encoder/encoder.c index d991a5e2..584aef65 100644 --- a/encoder/encoder.c +++ b/encoder/encoder.c @@ -583,7 +583,8 @@ static int x264_validate_parameters( x264_t *h ) static void mbcmp_init( x264_t *h ) { int satd = !h->mb.b_lossless && h->param.analyse.i_subpel_refine > 1; - memcpy( h->pixf.mbcmp, satd ? h->pixf.satd : h->pixf.sad, sizeof(h->pixf.mbcmp) ); + memcpy( h->pixf.mbcmp, satd ? h->pixf.satd : h->pixf.sad_aligned, sizeof(h->pixf.mbcmp) ); + memcpy( h->pixf.mbcmp_unaligned, satd ? h->pixf.satd : h->pixf.sad, sizeof(h->pixf.mbcmp_unaligned) ); satd &= h->param.analyse.i_me_method == X264_ME_TESA; memcpy( h->pixf.fpelcmp, satd ? h->pixf.satd : h->pixf.sad, sizeof(h->pixf.fpelcmp) ); memcpy( h->pixf.fpelcmp_x3, satd ? h->pixf.satd_x3 : h->pixf.sad_x3, sizeof(h->pixf.fpelcmp_x3) ); diff --git a/encoder/me.c b/encoder/me.c index 44f6f6db..63c57863 100644 --- a/encoder/me.c +++ b/encoder/me.c @@ -663,7 +663,7 @@ if( b_refine_qpel || (dir^1) != odir ) \ { \ int stride = 16; \ uint8_t *src = h->mc.get_ref( pix[0], &stride, m->p_fref, m->i_stride[0], mx, my, bw, bh ); \ - int cost = h->pixf.mbcmp[i_pixel]( m->p_fenc[0], FENC_STRIDE, src, stride ) \ + int cost = h->pixf.mbcmp_unaligned[i_pixel]( m->p_fenc[0], FENC_STRIDE, src, stride ) \ + p_cost_mvx[ mx ] + p_cost_mvy[ my ]; \ if( b_chroma_me && cost < bcost ) \ { \ @@ -904,7 +904,7 @@ int x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_weight { \ int stride = 16; \ uint8_t *src = h->mc.get_ref( pix, &stride, m->p_fref, m->i_stride[0], mx, my, bw*4, bh*4 ); \ - dst = h->pixf.mbcmp[i_pixel]( m->p_fenc[0], FENC_STRIDE, src, stride ) \ + dst = h->pixf.mbcmp_unaligned[i_pixel]( m->p_fenc[0], FENC_STRIDE, src, stride ) \ + p_cost_mvx[mx] + p_cost_mvy[my]; \ COPY1_IF_LT( bsatd, dst ); \ } diff --git a/tools/checkasm.c b/tools/checkasm.c index f13d6e05..9f892859 100644 --- a/tools/checkasm.c +++ b/tools/checkasm.c @@ -258,6 +258,7 @@ static int check_pixel( int cpu_ref, int cpu_new ) report( "pixel " #name " :" ); TEST_PIXEL( sad, 0 ); + TEST_PIXEL( sad_aligned, 1 ); TEST_PIXEL( ssd, 1 ); TEST_PIXEL( satd, 0 ); TEST_PIXEL( sa8d, 0 ); -- 2.40.0