From fa3549b5f2478f39cbcbd14d2e956e59f70d18eb Mon Sep 17 00:00:00 2001 From: Henrik Gramner Date: Fri, 28 Nov 2014 23:24:56 +0100 Subject: [PATCH] x86: SSE and AVX implementations of plane_copy Also remove the MMX2 implementation and fix src overread for height == 1. --- common/frame.c | 2 +- common/x86/mc-a2.asm | 83 ++++++++++++++++++++++---------------------- common/x86/mc-c.c | 48 ++++++++++++++++--------- 3 files changed, 73 insertions(+), 60 deletions(-) diff --git a/common/frame.c b/common/frame.c index a8451810..db1d659b 100644 --- a/common/frame.c +++ b/common/frame.c @@ -77,7 +77,7 @@ static x264_frame_t *x264_frame_new( x264_t *h, int b_fdec ) #if ARCH_X86 || ARCH_X86_64 if( h->param.cpu&X264_CPU_CACHELINE_64 ) align = 64; - else if( h->param.cpu&X264_CPU_CACHELINE_32 || h->param.cpu&X264_CPU_AVX2 ) + else if( h->param.cpu&X264_CPU_CACHELINE_32 || h->param.cpu&X264_CPU_AVX ) align = 32; #endif #if ARCH_PPC diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm index 9e1746c3..58812a06 100644 --- a/common/x86/mc-a2.asm +++ b/common/x86/mc-a2.asm @@ -913,64 +913,63 @@ HPEL %undef sfence %endif ; !HIGH_BIT_DEPTH +%macro PREFETCHNT_ITER 2 ; src, bytes/iteration + %assign %%i 4*(%2) ; prefetch 4 iterations ahead. is this optimal? 
+ %rep (%2+63) / 64 ; assume 64 byte cache lines + prefetchnta [%1+%%i] + %assign %%i %%i + 64 + %endrep +%endmacro + ;----------------------------------------------------------------------------- ; void plane_copy_core( pixel *dst, intptr_t i_dst, ; pixel *src, intptr_t i_src, int w, int h ) ;----------------------------------------------------------------------------- -; assumes i_dst and w are multiples of 16, and i_dst>w -INIT_MMX -cglobal plane_copy_core_mmx2, 6,7 +; assumes i_dst and w are multiples of mmsize, w >= 4*mmsize, and i_dst>w +%macro PLANE_COPY_CORE 0 +cglobal plane_copy_core, 6,7 FIX_STRIDES r1, r3, r4d %if HIGH_BIT_DEPTH == 0 movsxdifnidn r4, r4d %endif - sub r1, r4 - sub r3, r4 + add r0, r4 + add r2, r4 + neg r4 .loopy: - lea r6d, [r4-63] + lea r6, [r4+4*mmsize] .loopx: - prefetchnta [r2+256] - movq m0, [r2 ] - movq m1, [r2+ 8] - movntq [r0 ], m0 - movntq [r0+ 8], m1 - movq m2, [r2+16] - movq m3, [r2+24] - movntq [r0+16], m2 - movntq [r0+24], m3 - movq m4, [r2+32] - movq m5, [r2+40] - movntq [r0+32], m4 - movntq [r0+40], m5 - movq m6, [r2+48] - movq m7, [r2+56] - movntq [r0+48], m6 - movntq [r0+56], m7 - add r2, 64 - add r0, 64 - sub r6d, 64 - jg .loopx - prefetchnta [r2+256] - add r6d, 63 - jle .end16 -.loop16: - movq m0, [r2 ] - movq m1, [r2+8] - movntq [r0 ], m0 - movntq [r0+8], m1 - add r2, 16 - add r0, 16 - sub r6d, 16 - jg .loop16 -.end16: + PREFETCHNT_ITER r2+r6, 4*mmsize + movu m0, [r2+r6-4*mmsize] + movu m1, [r2+r6-3*mmsize] + movu m2, [r2+r6-2*mmsize] + movu m3, [r2+r6-1*mmsize] + movnta [r0+r6-4*mmsize], m0 + movnta [r0+r6-3*mmsize], m1 + movnta [r0+r6-2*mmsize], m2 + movnta [r0+r6-1*mmsize], m3 + add r6, 4*mmsize + jle .loopx + PREFETCHNT_ITER r2+r6, 4*mmsize + sub r6, 4*mmsize + jz .end +.loop_end: + movu m0, [r2+r6] + movnta [r0+r6], m0 + add r6, mmsize + jl .loop_end +.end: add r0, r1 add r2, r3 - dec r5d + dec r5d jg .loopy sfence - emms RET +%endmacro +INIT_XMM sse +PLANE_COPY_CORE +INIT_YMM avx +PLANE_COPY_CORE %macro INTERLEAVE 4-5 ; dst, 
srcu, srcv, is_aligned, nt_hint %if HIGH_BIT_DEPTH diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c index 9101997f..fd56130d 100644 --- a/common/x86/mc-c.c +++ b/common/x86/mc-c.c @@ -90,7 +90,8 @@ void x264_mc_copy_w16_aligned_avx( uint16_t *, intptr_t, uint16_t *, intptr_t, i void x264_prefetch_fenc_420_mmx2( pixel *, intptr_t, pixel *, intptr_t, int ); void x264_prefetch_fenc_422_mmx2( pixel *, intptr_t, pixel *, intptr_t, int ); void x264_prefetch_ref_mmx2( pixel *, intptr_t, int ); -void x264_plane_copy_core_mmx2( pixel *, intptr_t, pixel *, intptr_t, int w, int h ); +void x264_plane_copy_core_sse( pixel *, intptr_t, pixel *, intptr_t, int w, int h ); +void x264_plane_copy_core_avx( pixel *, intptr_t, pixel *, intptr_t, int w, int h ); void x264_plane_copy_c( pixel *, intptr_t, pixel *, intptr_t, int w, int h ); void x264_plane_copy_interleave_core_mmx2( pixel *dst, intptr_t i_dst, pixel *srcu, intptr_t i_srcu, @@ -489,23 +490,35 @@ HPEL(32, avx2, avx2, avx2, avx2) #endif #endif // HIGH_BIT_DEPTH -static void x264_plane_copy_mmx2( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int w, int h ) -{ - int c_w = 16/sizeof(pixel) - 1; - if( w < 256 ) { // tiny resolutions don't want non-temporal hints. dunno the exact threshold. 
- x264_plane_copy_c( dst, i_dst, src, i_src, w, h ); - } else if( !(w&c_w) ) { - x264_plane_copy_core_mmx2( dst, i_dst, src, i_src, w, h ); - } else if( i_src > 0 ) { - // have to use plain memcpy on the last line (in memory order) to avoid overreading src - x264_plane_copy_core_mmx2( dst, i_dst, src, i_src, (w+c_w)&~c_w, h-1 ); - memcpy( dst+i_dst*(h-1), src+i_src*(h-1), w*sizeof(pixel) ); - } else { - memcpy( dst, src, w*sizeof(pixel) ); - x264_plane_copy_core_mmx2( dst+i_dst, i_dst, src+i_src, i_src, (w+c_w)&~c_w, h-1 ); - } +#define PLANE_COPY(align, cpu)\ +static void x264_plane_copy_##cpu( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int w, int h )\ +{\ + int c_w = (align) / sizeof(pixel) - 1;\ + if( w < 256 ) /* tiny resolutions don't want non-temporal hints. dunno the exact threshold. */\ + x264_plane_copy_c( dst, i_dst, src, i_src, w, h );\ + else if( !(w&c_w) )\ + x264_plane_copy_core_##cpu( dst, i_dst, src, i_src, w, h );\ + else\ + {\ + if( --h > 0 )\ + {\ + if( i_src > 0 )\ + {\ + x264_plane_copy_core_##cpu( dst, i_dst, src, i_src, (w+c_w)&~c_w, h );\ + dst += i_dst * h;\ + src += i_src * h;\ + }\ + else\ + x264_plane_copy_core_##cpu( dst+i_dst, i_dst, src+i_src, i_src, (w+c_w)&~c_w, h );\ + }\ + /* use plain memcpy on the last line (in memory order) to avoid overreading src. 
*/\ + memcpy( dst, src, w*sizeof(pixel) );\ + }\ } +PLANE_COPY(16, sse) +PLANE_COPY(32, avx) + #define PLANE_INTERLEAVE(cpu) \ static void x264_plane_copy_interleave_##cpu( pixel *dst, intptr_t i_dst,\ pixel *srcu, intptr_t i_srcu,\ @@ -663,7 +676,6 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf ) pf->prefetch_fenc_422 = x264_prefetch_fenc_422_mmx2; pf->prefetch_ref = x264_prefetch_ref_mmx2; - pf->plane_copy = x264_plane_copy_mmx2; pf->plane_copy_interleave = x264_plane_copy_interleave_mmx2; pf->store_interleave_chroma = x264_store_interleave_chroma_mmx2; @@ -692,6 +704,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf ) { pf->memcpy_aligned = x264_memcpy_aligned_sse; pf->memzero_aligned = x264_memzero_aligned_sse; + pf->plane_copy = x264_plane_copy_sse; } #if HIGH_BIT_DEPTH @@ -929,6 +942,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf ) if( !(cpu&X264_CPU_AVX) ) return; pf->memzero_aligned = x264_memzero_aligned_avx; + pf->plane_copy = x264_plane_copy_avx; pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx; pf->mbtree_propagate_list = x264_mbtree_propagate_list_avx; -- 2.40.0