From 547a6573af56afe8d551201245775c6ba179e781 Mon Sep 17 00:00:00 2001
From: Henrik Gramner
Date: Tue, 16 Apr 2013 23:27:29 +0200
Subject: [PATCH] x86: AVX memzero_aligned

---
 common/common.h      |  4 ++--
 common/x86/mc-a2.asm | 14 +++++++-------
 common/x86/mc-c.c    |  2 ++
 encoder/me.c         |  2 +-
 4 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/common/common.h b/common/common.h
index 53a6ff03..1732d59b 100644
--- a/common/common.h
+++ b/common/common.h
@@ -770,8 +770,8 @@ struct x264_t
         ALIGNED_16( dctcoef fenc_dct4[16][16] );
 
         /* Psy RD SATD/SA8D scores cache */
-        ALIGNED_16( uint64_t fenc_hadamard_cache[9] );
-        ALIGNED_16( uint32_t fenc_satd_cache[32] );
+        ALIGNED_N( uint64_t fenc_hadamard_cache[9] );
+        ALIGNED_N( uint32_t fenc_satd_cache[32] );
 
         /* pointer over mb of the frame to be compressed */
         pixel *p_fenc[3]; /* y,u,v */
diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm
index caa93969..27e66b86 100644
--- a/common/x86/mc-a2.asm
+++ b/common/x86/mc-a2.asm
@@ -1223,7 +1223,7 @@ MEMCPY
 ;-----------------------------------------------------------------------------
 ; void *memzero_aligned( void *dst, size_t n );
 ;-----------------------------------------------------------------------------
-%macro MEMZERO 0
+%macro MEMZERO 1
 cglobal memzero_aligned, 2,2
     add r0, r1
     neg r1
@@ -1234,21 +1234,21 @@ cglobal memzero_aligned, 2,2
 %endif
 .loop:
 %assign i 0
-%rep 8
+%rep %1
     mova [r0 + r1 + i], m0
 %assign i i+mmsize
 %endrep
-    add r1, mmsize*8
+    add r1, mmsize*%1
     jl .loop
     RET
 %endmacro
 
 INIT_MMX mmx
-MEMZERO
+MEMZERO 8
 INIT_XMM sse
-MEMZERO
-
-
+MEMZERO 8
+INIT_YMM avx
+MEMZERO 4
 
 %if HIGH_BIT_DEPTH == 0
 ;-----------------------------------------------------------------------------
diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c
index 3bb38838..198d7e40 100644
--- a/common/x86/mc-c.c
+++ b/common/x86/mc-c.c
@@ -129,6 +129,7 @@ void *x264_memcpy_aligned_mmx( void *dst, const void *src, size_t n );
 void *x264_memcpy_aligned_sse( void *dst, const void *src, size_t n );
 void x264_memzero_aligned_mmx( void *dst, size_t n );
 void x264_memzero_aligned_sse( void *dst, size_t n );
+void x264_memzero_aligned_avx( void *dst, size_t n );
 void x264_integral_init4h_sse4( uint16_t *sum, uint8_t *pix, intptr_t stride );
 void x264_integral_init4h_avx2( uint16_t *sum, uint8_t *pix, intptr_t stride );
 void x264_integral_init8h_sse4( uint16_t *sum, uint8_t *pix, intptr_t stride );
@@ -798,6 +799,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
 
     if( !(cpu&X264_CPU_AVX) )
         return;
+    pf->memzero_aligned = x264_memzero_aligned_avx;
     pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx;
 
     if( cpu&X264_CPU_FMA4 )
diff --git a/encoder/me.c b/encoder/me.c
index 55896025..8238b96b 100644
--- a/encoder/me.c
+++ b/encoder/me.c
@@ -1058,7 +1058,7 @@ static void ALWAYS_INLINE x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_m
     uint64_t bcostrd = COST_MAX64;
     uint16_t amvd;
     /* each byte of visited represents 8 possible m1y positions, so a 4D array isn't needed */
-    ALIGNED_ARRAY_16( uint8_t, visited,[8],[8][8] );
+    ALIGNED_ARRAY_N( uint8_t, visited,[8],[8][8] );
     /* all permutations of an offset in up to 2 of the dimensions */
     ALIGNED_4( static const int8_t dia4d[33][4] ) =
     {
-- 
2.40.0
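
Note (not part of the patch): the AVX variant expands the store block only 4 times because each ymm store clears 32 bytes, so one loop iteration still zeroes 4*32 = 128 bytes, the same as the SSE version's 8*16. The ALIGNED_16/ALIGNED_ARRAY_16 -> ALIGNED_N/ALIGNED_ARRAY_N changes widen the buffers that get zeroed this way from 16-byte to the native SIMD alignment, so the wider aligned stores stay legal. For readers who don't follow x264asm, here is a rough C-intrinsics sketch of the zeroing loop; the function name is hypothetical and it assumes dst is 32-byte aligned and n is a multiple of 128:

#include <immintrin.h>
#include <stddef.h>

/* Sketch only, not the x264 implementation. Assumes 32-byte-aligned dst and
 * n a multiple of 128; the asm version likewise stores 128 bytes per iteration. */
static void memzero_aligned_avx_sketch( void *dst, size_t n )
{
    __m256 zero = _mm256_setzero_ps(); /* zero a 256-bit register (the macro zeroes m0 similarly) */
    unsigned char *p   = (unsigned char*)dst;
    unsigned char *end = p + n;
    while( p < end )
    {
        /* four aligned 32-byte stores per iteration, mirroring %rep %1 with %1 == 4 */
        _mm256_store_ps( (float*)(p +  0), zero );
        _mm256_store_ps( (float*)(p + 32), zero );
        _mm256_store_ps( (float*)(p + 64), zero );
        _mm256_store_ps( (float*)(p + 96), zero );
        p += 128;
    }
}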