From: Fiona Glaser Date: Sat, 24 May 2008 19:10:21 +0000 (-0600) Subject: memzero_aligned_mmx X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=9bdf19c2f114a439cc0f4d27ab8493912918584d;p=libx264 memzero_aligned_mmx --- diff --git a/common/mc.c b/common/mc.c index d8d60648..513e47ae 100644 --- a/common/mc.c +++ b/common/mc.c @@ -271,13 +271,18 @@ static void plane_copy( uint8_t *dst, int i_dst, } } -void prefetch_fenc_null( uint8_t *pix_y, int stride_y, - uint8_t *pix_uv, int stride_uv, int mb_x ) +static void prefetch_fenc_null( uint8_t *pix_y, int stride_y, + uint8_t *pix_uv, int stride_uv, int mb_x ) {} -void prefetch_ref_null( uint8_t *pix, int stride, int parity ) +static void prefetch_ref_null( uint8_t *pix, int stride, int parity ) {} +static void memzero_aligned( void * dst, int n ) +{ + memset( dst, 0, n ); +} + void x264_mc_init( int cpu, x264_mc_functions_t *pf ) { pf->mc_luma = mc_luma; @@ -316,6 +321,7 @@ void x264_mc_init( int cpu, x264_mc_functions_t *pf ) pf->prefetch_fenc = prefetch_fenc_null; pf->prefetch_ref = prefetch_ref_null; pf->memcpy_aligned = memcpy; + pf->memzero_aligned = memzero_aligned; #ifdef HAVE_MMX x264_mc_init_mmx( cpu, pf ); diff --git a/common/mc.h b/common/mc.h index 83b3e907..26f113f6 100644 --- a/common/mc.h +++ b/common/mc.h @@ -67,6 +67,7 @@ typedef struct void (*prefetch_ref)( uint8_t *pix, int stride, int parity ); void *(*memcpy_aligned)( void *dst, const void *src, size_t n ); + void (*memzero_aligned)( void *dst, int n ); } x264_mc_functions_t; diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm index b05d2944..df30d23e 100644 --- a/common/x86/mc-a2.asm +++ b/common/x86/mc-a2.asm @@ -387,6 +387,12 @@ cglobal x264_plane_copy_mmxext, 6,7 emms RET + + +; These functions are not general-use; not only do the SSE ones require aligned input, +; but they also will fail if given a non-mod16 size or a size less than 64. +; memzero SSE will fail for non-mod128. 
+ ;----------------------------------------------------------------------------- ; void *x264_memcpy_aligned_mmx( void *dst, const void *src, size_t n ); ;----------------------------------------------------------------------------- @@ -440,3 +446,25 @@ cglobal x264_memcpy_aligned_sse2, 3,3 movdqa [r0 + r2 + 48], xmm3 jg .copy64 REP_RET + +;----------------------------------------------------------------------------- +; void x264_memzero_aligned( void *dst, int n ); +;----------------------------------------------------------------------------- +%macro MEMZERO 1 +cglobal x264_memzero_aligned_%1, 2,2 + pxor m0, m0 +.loop: + sub r1d, regsize*8 +%assign i 0 +%rep 8 + mova [r0 + r1 + i], m0 +%assign i i+regsize +%endrep + jg .loop + REP_RET +%endmacro + +INIT_MMX +MEMZERO mmx +INIT_XMM +MEMZERO sse2 diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c index 1144c36f..dcb89db0 100644 --- a/common/x86/mc-c.c +++ b/common/x86/mc-c.c @@ -55,6 +55,8 @@ extern void x264_mc_chroma_mmxext( uint8_t *src, int i_src_stride, extern void x264_plane_copy_mmxext( uint8_t *, int, uint8_t *, int, int w, int h); extern void *x264_memcpy_aligned_mmx( void * dst, const void * src, size_t n ); extern void *x264_memcpy_aligned_sse2( void * dst, const void * src, size_t n ); +extern void x264_memzero_aligned_mmx( void * dst, int n ); +extern void x264_memzero_aligned_sse2( void * dst, int n ); #define PIXEL_AVG_W(width,cpu)\ extern void x264_pixel_avg2_w##width##_##cpu( uint8_t *, int, uint8_t *, int, uint8_t *, int ); @@ -230,6 +232,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf ) pf->copy[PIXEL_8x8] = x264_mc_copy_w8_mmx; pf->copy[PIXEL_4x4] = x264_mc_copy_w4_mmx; pf->memcpy_aligned = x264_memcpy_aligned_mmx; + pf->memzero_aligned = x264_memzero_aligned_mmx; if( !(cpu&X264_CPU_MMXEXT) ) return; @@ -278,6 +281,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf ) return; pf->memcpy_aligned = x264_memcpy_aligned_sse2; + pf->memzero_aligned = 
x264_memzero_aligned_sse2; pf->hpel_filter = x264_hpel_filter_sse2_amd; // disable on AMD processors since it is slower diff --git a/encoder/me.c b/encoder/me.c index 0dd63782..81f2f000 100644 --- a/encoder/me.c +++ b/encoder/me.c @@ -837,7 +837,7 @@ int x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_weight int bcost = COST_MAX; int pass = 0; uint8_t visited[8][8][8][8]; - memset( visited, 0, sizeof(visited) ); + h->mc.memzero_aligned( visited, sizeof(visited) ); BIME_CACHE( 0, 0 ); CHECK_BIDIR( 0, 0, 0, 0 );