From: Fiona Glaser Date: Wed, 9 Apr 2008 22:30:34 +0000 (-0600) Subject: cacheline split workaround for mc_luma X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=6eb5483505f40bb319ce0afa052ee41543993fc1;p=libx264 cacheline split workaround for mc_luma --- diff --git a/common/frame.c b/common/frame.c index 4fcf6489..a4c33f03 100644 --- a/common/frame.c +++ b/common/frame.c @@ -31,6 +31,7 @@ x264_frame_t *x264_frame_new( x264_t *h ) int i_mb_count = h->mb.i_mb_count; int i_stride, i_width, i_lines; int i_padv = PADV << h->param.b_interlaced; + int luma_plane_size; if( !frame ) return NULL; @@ -55,20 +56,20 @@ x264_frame_t *x264_frame_new( x264_t *h ) frame->i_stride[i] = i_stride >> !!i; frame->i_width[i] = i_width >> !!i; frame->i_lines[i] = i_lines >> !!i; - CHECKED_MALLOC( frame->buffer[i], - frame->i_stride[i] * (i_lines + 2*i_padv) >> !!i ); - frame->plane[i] = ((uint8_t*)frame->buffer[i]) + - ((frame->i_stride[i] * i_padv + PADH) >> !!i); } - frame->filtered[0] = frame->plane[0]; - for( i = 0; i < 3; i++ ) + luma_plane_size = (frame->i_stride[0] * ( frame->i_lines[0] + 2*i_padv )); + for( i = 1; i < 3; i++ ) { - CHECKED_MALLOC( frame->buffer[4+i], - frame->i_stride[0] * ( frame->i_lines[0] + 2*i_padv ) ); - frame->filtered[i+1] = ((uint8_t*)frame->buffer[4+i]) + - frame->i_stride[0] * i_padv + PADH; + CHECKED_MALLOC( frame->buffer[i], luma_plane_size/4 ); + frame->plane[i] = frame->buffer[i] + (frame->i_stride[i] * i_padv + PADH)/2; } + /* all 4 luma planes allocated together, since the cacheline split code + * requires them to be in-phase wrt cacheline alignment. */ + CHECKED_MALLOC( frame->buffer[0], 4*luma_plane_size); + for( i = 0; i < 4; i++ ) + frame->filtered[i] = frame->buffer[0] + i*luma_plane_size + frame->i_stride[0] * i_padv + PADH; + frame->plane[0] = frame->filtered[0]; if( h->frames.b_have_lowres ) { @@ -86,9 +87,9 @@ x264_frame_t *x264_frame_new( x264_t *h ) if( h->param.analyse.i_me_method >= X264_ME_ESA ) { - CHECKED_MALLOC( frame->buffer[7], + CHECKED_MALLOC( frame->buffer[3], 2 * frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv) * sizeof(uint16_t) ); - frame->integral = (uint16_t*)frame->buffer[7] + frame->i_stride[0] * i_padv + PADH; + frame->integral = (uint16_t*)frame->buffer[3] + frame->i_stride[0] * i_padv + PADH; } frame->i_poc = -1; @@ -132,7 +133,7 @@ fail: void x264_frame_delete( x264_frame_t *frame ) { int i, j; - for( i = 0; i < 8; i++ ) + for( i = 0; i < 4; i++ ) x264_free( frame->buffer[i] ); for( i = 0; i < 4; i++ ) x264_free( frame->buffer_lowres[i] ); diff --git a/common/frame.h b/common/frame.h index 6240c846..6c8f991d 100644 --- a/common/frame.h +++ b/common/frame.h @@ -56,7 +56,7 @@ typedef struct /* for unrestricted mv we allocate more data than needed * allocated data are stored in buffer */ - void *buffer[8]; + void *buffer[4]; void *buffer_lowres[4]; /* motion data */ diff --git a/common/x86/mc-a.asm b/common/x86/mc-a.asm index b262e391..ed1e3326 100644 --- a/common/x86/mc-a.asm +++ b/common/x86/mc-a.asm @@ -30,6 +30,7 @@ pw_4: times 4 dw 4 pw_8: times 4 dw 8 pw_32: times 4 dw 32 pw_64: times 4 dw 64 +sw_64: dd 64 SECTION .text @@ -229,7 +230,8 @@ cglobal x264_pixel_avg2_w20_mmxext, 6,7 jg .height_loop REP_RET -cglobal x264_pixel_avg2_w16_sse2, 6,7 +%macro PIXEL_AVG_SSE 1 +cglobal x264_pixel_avg2_w16_%1, 6,7 sub r4, r2 lea r6, [r4+r3] .height_loop: @@ -247,7 +249,7 @@ cglobal x264_pixel_avg2_w16_sse2, 6,7 jg .height_loop REP_RET -cglobal x264_pixel_avg2_w20_sse2, 6,7 +cglobal x264_pixel_avg2_w20_%1, 6,7 sub r4, r2 lea r6, 
[r4+r3] .height_loop: @@ -270,8 +272,123 @@ cglobal x264_pixel_avg2_w20_sse2, 6,7 sub r5d, 2 jg .height_loop REP_RET +%endmacro + +PIXEL_AVG_SSE sse2 +%ifdef HAVE_SSE3 +%define movdqu lddqu +PIXEL_AVG_SSE sse3 +%undef movdqu +%endif + +; Cacheline split code for processors with high latencies for loads +; split over cache lines. See sad-a.asm for a more detailed explanation. +; This particular instance is complicated by the fact that src1 and src2 +; can have different alignments. For simplicity and code size, only the +; MMX cacheline workaround is used. As a result, in the case of SSE2 +; pixel_avg, the cacheline check functions calls the SSE2 version if there +; is no cacheline split, and the MMX workaround if there is. + +%macro INIT_SHIFT 2 + and eax, 7 + shl eax, 3 +%ifdef PIC32 + ; both versions work, but picgetgot is slower than gpr->mmx is slower than mem->mmx + mov r2, 64 + sub r2, eax + movd %2, eax + movd %1, r2 +%else + movd %1, [sw_64 GLOBAL] + movd %2, eax + psubw %1, %2 +%endif +%endmacro + +%macro AVG_CACHELINE_CHECK 3 ; width, cacheline, instruction set +cglobal x264_pixel_avg2_w%1_cache%2_%3, 0,0 + mov eax, r2m + and eax, 0x1f|(%2>>1) + cmp eax, (32-%1)|(%2>>1) + jle x264_pixel_avg2_w%1_%3 +;w12 isn't needed because w16 is just as fast if there's no cacheline split +%if %1 == 12 + jmp x264_pixel_avg2_w16_cache_mmxext +%else + jmp x264_pixel_avg2_w%1_cache_mmxext +%endif +%endmacro + +%macro AVG_CACHELINE_START 0 + %assign stack_offset 0 + INIT_SHIFT mm6, mm7 + mov eax, r4m + INIT_SHIFT mm4, mm5 + PROLOGUE 6,6,0 + and r2, ~7 + and r4, ~7 + sub r4, r2 +.height_loop: +%endmacro +%macro AVG_CACHELINE_LOOP 2 + movq mm0, [r2+8+%1] + movq mm1, [r2+%1] + movq mm2, [r2+r4+8+%1] + movq mm3, [r2+r4+%1] + psllq mm0, mm6 + psrlq mm1, mm7 + psllq mm2, mm4 + psrlq mm3, mm5 + por mm0, mm1 + por mm2, mm3 + pavgb mm0, mm2 + %2 [r0+%1], mm0 +%endmacro +x264_pixel_avg2_w8_cache_mmxext: + AVG_CACHELINE_START + AVG_CACHELINE_LOOP 0, movq + add r2, r3 + add r0, r1 + dec r5d + jg .height_loop + RET + +x264_pixel_avg2_w16_cache_mmxext: + AVG_CACHELINE_START + AVG_CACHELINE_LOOP 0, movq + AVG_CACHELINE_LOOP 8, movq + add r2, r3 + add r0, r1 + dec r5d + jg .height_loop + RET + +x264_pixel_avg2_w20_cache_mmxext: + AVG_CACHELINE_START + AVG_CACHELINE_LOOP 0, movq + AVG_CACHELINE_LOOP 8, movq + AVG_CACHELINE_LOOP 16, movd + add r2, r3 + add r0, r1 + dec r5d + jg .height_loop + RET + +%ifndef ARCH_X86_64 +AVG_CACHELINE_CHECK 8, 32, mmxext +AVG_CACHELINE_CHECK 12, 32, mmxext +AVG_CACHELINE_CHECK 16, 32, mmxext +AVG_CACHELINE_CHECK 20, 32, mmxext +AVG_CACHELINE_CHECK 16, 64, mmxext +AVG_CACHELINE_CHECK 20, 64, mmxext +%endif + +AVG_CACHELINE_CHECK 8, 64, mmxext +AVG_CACHELINE_CHECK 12, 64, mmxext +AVG_CACHELINE_CHECK 16, 64, sse2 +AVG_CACHELINE_CHECK 20, 64, sse2 ;============================================================================= ; pixel copy @@ -362,6 +479,11 @@ cglobal %1, 5,7 %endmacro COPY_W16_SSE2 x264_mc_copy_w16_sse2, movdqu +; cacheline split with mmx has too much overhead; the speed benefit is near-zero. +; but with SSE3 the overhead is zero, so there's no reason not to include it. 
+%ifdef HAVE_SSE3 +COPY_W16_SSE2 x264_mc_copy_w16_sse3, lddqu +%endif COPY_W16_SSE2 x264_mc_copy_w16_aligned_sse2, movdqa diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c index c2c2904d..fd202da4 100644 --- a/common/x86/mc-c.c +++ b/common/x86/mc-c.c @@ -38,17 +38,11 @@ extern void x264_pixel_avg_8x4_mmxext( uint8_t *, int, uint8_t *, int ); extern void x264_pixel_avg_4x8_mmxext( uint8_t *, int, uint8_t *, int ); extern void x264_pixel_avg_4x4_mmxext( uint8_t *, int, uint8_t *, int ); extern void x264_pixel_avg_4x2_mmxext( uint8_t *, int, uint8_t *, int ); -extern void x264_pixel_avg2_w4_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int ); -extern void x264_pixel_avg2_w8_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int ); -extern void x264_pixel_avg2_w12_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int ); -extern void x264_pixel_avg2_w16_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int ); -extern void x264_pixel_avg2_w20_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int ); -extern void x264_pixel_avg2_w16_sse2( uint8_t *, int, uint8_t *, int, uint8_t *, int ); -extern void x264_pixel_avg2_w20_sse2( uint8_t *, int, uint8_t *, int, uint8_t *, int ); extern void x264_mc_copy_w4_mmx( uint8_t *, int, uint8_t *, int, int ); extern void x264_mc_copy_w8_mmx( uint8_t *, int, uint8_t *, int, int ); extern void x264_mc_copy_w16_mmx( uint8_t *, int, uint8_t *, int, int ); extern void x264_mc_copy_w16_sse2( uint8_t *, int, uint8_t *, int, int ); +extern void x264_mc_copy_w16_sse3( uint8_t *, int, uint8_t *, int, int ); extern void x264_mc_copy_w16_aligned_sse2( uint8_t *, int, uint8_t *, int, int ); extern void x264_pixel_avg_weight_4x4_mmxext( uint8_t *, int, uint8_t *, int, int ); extern void x264_pixel_avg_weight_w8_mmxext( uint8_t *, int, uint8_t *, int, int, int ); @@ -62,6 +56,19 @@ extern void x264_plane_copy_mmxext( uint8_t *, int, uint8_t *, int, int w, int h extern void *x264_memcpy_aligned_mmx( void * dst, const void * src, size_t n ); extern void *x264_memcpy_aligned_sse2( void * dst, const void * src, size_t n ); +#define PIXEL_AVG_W(width,cpu)\ +extern void x264_pixel_avg2_w##width##_##cpu( uint8_t *, int, uint8_t *, int, uint8_t *, int ); +/* This declares some functions that don't exist, but that isn't a problem. 
*/ +#define PIXEL_AVG_WALL(cpu)\ +PIXEL_AVG_W(4,cpu); PIXEL_AVG_W(8,cpu); PIXEL_AVG_W(12,cpu); PIXEL_AVG_W(16,cpu); PIXEL_AVG_W(20,cpu); + +PIXEL_AVG_WALL(mmxext) +PIXEL_AVG_WALL(cache32_mmxext) +PIXEL_AVG_WALL(cache64_mmxext) +PIXEL_AVG_WALL(cache64_sse2) +PIXEL_AVG_WALL(sse2) +PIXEL_AVG_WALL(sse3) + #define AVG_WEIGHT(W,H) \ void x264_pixel_avg_weight_ ## W ## x ## H ## _mmxext( uint8_t *dst, int i_dst, uint8_t *src, int i_src, int i_weight_dst ) \ { \ @@ -73,40 +80,48 @@ AVG_WEIGHT(8,16) AVG_WEIGHT(8,8) AVG_WEIGHT(8,4) -static void (* const x264_pixel_avg_wtab_mmxext[6])( uint8_t *, int, uint8_t *, int, uint8_t *, int ) = -{ - NULL, - x264_pixel_avg2_w4_mmxext, - x264_pixel_avg2_w8_mmxext, - x264_pixel_avg2_w12_mmxext, - x264_pixel_avg2_w16_mmxext, - x264_pixel_avg2_w20_mmxext, -}; -static void (* const x264_mc_copy_wtab_mmx[5])( uint8_t *, int, uint8_t *, int, int ) = -{ - NULL, - x264_mc_copy_w4_mmx, - x264_mc_copy_w8_mmx, - NULL, - x264_mc_copy_w16_mmx -}; -static void (* const x264_pixel_avg_wtab_sse2[6])( uint8_t *, int, uint8_t *, int, uint8_t *, int ) = -{ - NULL, - x264_pixel_avg2_w4_mmxext, - x264_pixel_avg2_w8_mmxext, - x264_pixel_avg2_w12_mmxext, - x264_pixel_avg2_w16_sse2, - x264_pixel_avg2_w20_sse2, +#define PIXEL_AVG_WTAB(instr, name1, name2, name3, name4, name5)\ +static void (* const x264_pixel_avg_wtab_##instr[6])( uint8_t *, int, uint8_t *, int, uint8_t *, int ) =\ +{\ + NULL,\ + x264_pixel_avg2_w4_##name1,\ + x264_pixel_avg2_w8_##name2,\ + x264_pixel_avg2_w12_##name3,\ + x264_pixel_avg2_w16_##name4,\ + x264_pixel_avg2_w20_##name5,\ }; -static void (* const x264_mc_copy_wtab_sse2[5])( uint8_t *, int, uint8_t *, int, int ) = -{ - NULL, - x264_mc_copy_w4_mmx, - x264_mc_copy_w8_mmx, - NULL, - x264_mc_copy_w16_sse2, + +/* w16 sse2 is faster than w12 mmx as long as the cacheline issue is resolved */ +#define x264_pixel_avg2_w12_cache64_sse2 x264_pixel_avg2_w16_cache64_sse2 +#define x264_pixel_avg2_w12_sse3 x264_pixel_avg2_w16_sse3 + +PIXEL_AVG_WTAB(mmxext, mmxext, mmxext, mmxext, mmxext, mmxext) +#ifdef ARCH_X86 +PIXEL_AVG_WTAB(cache32_mmxext, mmxext, cache32_mmxext, cache32_mmxext, cache32_mmxext, cache32_mmxext) +#endif +PIXEL_AVG_WTAB(cache64_mmxext, mmxext, cache64_mmxext, cache64_mmxext, cache64_mmxext, cache64_mmxext) +PIXEL_AVG_WTAB(sse2, mmxext, mmxext, mmxext, sse2, sse2) +PIXEL_AVG_WTAB(cache64_sse2, mmxext, cache64_mmxext, cache64_sse2, cache64_sse2, cache64_sse2) +#ifdef HAVE_SSE3 +PIXEL_AVG_WTAB(cache64_sse3, mmxext, cache64_mmxext, sse3, sse3, sse3) +#endif + +#define MC_COPY_WTAB(instr, name1, name2, name3)\ +static void (* const x264_mc_copy_wtab_##instr[5])( uint8_t *, int, uint8_t *, int, int ) =\ +{\ + NULL,\ + x264_mc_copy_w4_##name1,\ + x264_mc_copy_w8_##name2,\ + NULL,\ + x264_mc_copy_w16_##name3,\ }; + +MC_COPY_WTAB(mmx,mmx,mmx,mmx) +MC_COPY_WTAB(sse2,mmx,mmx,sse2) +#ifdef HAVE_SSE3 +MC_COPY_WTAB(sse3,mmx,mmx,sse3) +#endif + static const int hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1}; static const int hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2}; @@ -134,7 +149,15 @@ void mc_luma_##name( uint8_t *dst, int i_dst_stride,\ } MC_LUMA(mmxext,mmxext,mmx) +#ifdef ARCH_X86 +MC_LUMA(cache32_mmxext,cache32_mmxext,mmx) +MC_LUMA(cache64_mmxext,cache64_mmxext,mmx) +#endif MC_LUMA(sse2,sse2,sse2) +MC_LUMA(cache64_sse2,cache64_sse2,sse2) +#ifdef HAVE_SSE3 +MC_LUMA(cache64_sse3,cache64_sse3,sse3) +#endif #define GET_REF(name)\ uint8_t *get_ref_##name( uint8_t *dst, int *i_dst_stride,\ @@ -161,7 +184,15 @@ uint8_t *get_ref_##name( uint8_t *dst, int 
*i_dst_stride,\ } GET_REF(mmxext) +#ifdef ARCH_X86 +GET_REF(cache32_mmxext) +GET_REF(cache64_mmxext) +#endif GET_REF(sse2) +GET_REF(cache64_sse2) +#ifdef HAVE_SSE3 +GET_REF(cache64_sse3) +#endif #define HPEL(align, cpu, cpuv, cpuc, cpuh)\ void x264_hpel_filter_v_##cpuv( uint8_t *dst, uint8_t *src, int16_t *buf, int stride, int width);\ @@ -240,6 +271,19 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf ) pf->prefetch_fenc = x264_prefetch_fenc_mmxext; pf->prefetch_ref = x264_prefetch_ref_mmxext; +#ifdef ARCH_X86 // all x86_64 cpus with cacheline split issues use sse2 instead + if( cpu&X264_CPU_CACHELINE_32 ) + { + pf->mc_luma = mc_luma_cache32_mmxext; + pf->get_ref = get_ref_cache32_mmxext; + } + else if( cpu&X264_CPU_CACHELINE_SPLIT ) + { + pf->mc_luma = mc_luma_cache64_mmxext; + pf->get_ref = get_ref_cache64_mmxext; + } +#endif + if( !(cpu&X264_CPU_SSE2) ) return; @@ -257,6 +301,20 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf ) pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_sse2; pf->hpel_filter = x264_hpel_filter_sse2; + if( cpu&X264_CPU_CACHELINE_SPLIT ) + { + pf->mc_luma = mc_luma_cache64_sse2; + pf->get_ref = get_ref_cache64_sse2; +#ifdef HAVE_SSE3 + /* lddqu doesn't work on Core2 */ + if( (cpu&X264_CPU_SSE3) && !(cpu&X264_CPU_SSSE3) ) + { + pf->mc_luma = mc_luma_cache64_sse3; + pf->get_ref = get_ref_cache64_sse3; + } +#endif + } + if( !(cpu&X264_CPU_SSSE3) ) return; diff --git a/common/x86/sad-a.asm b/common/x86/sad-a.asm index c178e8e1..6e31921c 100644 --- a/common/x86/sad-a.asm +++ b/common/x86/sad-a.asm @@ -670,7 +670,7 @@ sad_w16_align%1_sse2: lea r2, [r2+2*r3] dec r4 jg sad_w16_align%1_sse2 - rep ret + ret %endmacro ; computed jump assumes this loop is exactly 64 bytes @@ -689,7 +689,7 @@ sad_w16_align%1_ssse3: lea r2, [r2+2*r3] dec r4 jg sad_w16_align%1_ssse3 - rep ret + ret %endmacro %macro SAD16_CACHELINE_FUNC 2 ; cpu, height diff --git a/tools/checkasm.c b/tools/checkasm.c index d301a8ba..05c25bdb 100644 --- a/tools/checkasm.c +++ b/tools/checkasm.c @@ -476,8 +476,9 @@ static int check_mc( int cpu_ref, int cpu_new ) } ok = 1; used_asm = 0; for( dy = -8; dy < 8; dy++ ) - for( dx = -8; dx < 8; dx++ ) + for( dx = -128; dx < 128; dx++ ) { + if( rand()&15 ) continue; // running all of them is too slow MC_TEST_LUMA( 20, 18 ); MC_TEST_LUMA( 16, 16 ); MC_TEST_LUMA( 16, 8 );
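
For readers not fluent in the asm above, here is a rough C model of what the MMX cacheline workaround in mc-a.asm does. This is only an illustrative sketch: the function names are hypothetical, little-endian byte order is assumed, and in the real code the split/no-split decision is made once per call by AVG_CACHELINE_CHECK, which tail-jumps either to the plain MMX/SSE2 routine or to the shift-based _cache_mmxext loop.

#include <stdint.h>

/* Hypothetical helper mirroring AVG_CACHELINE_CHECK: the fast path is taken
 * when a width-byte load starting at src stays inside one cacheline. */
static int crosses_cacheline( const uint8_t *src, int width, int cacheline )
{
    return ((uintptr_t)src & (cacheline - 1)) > (uintptr_t)(cacheline - width);
}

/* Scalar analogue of INIT_SHIFT + AVG_CACHELINE_LOOP: synthesize an unaligned
 * 8-byte value from two aligned 8-byte loads, so no single load ever crosses
 * a cacheline boundary. */
static uint64_t load8_split( const uint8_t *src )
{
    const uint64_t *p = (const uint64_t*)((uintptr_t)src & ~(uintptr_t)7);
    int shift = ((uintptr_t)src & 7) * 8;     /* INIT_SHIFT computes this and 64-this */
    if( !shift )
        return p[0];                          /* already aligned, nothing to stitch */
    return (p[0] >> shift) | (p[1] << (64 - shift));  /* psrlq / psllq / por */
}

The shift-and-or pair is what the psrlq/psllq/por sequence does per qword in AVG_CACHELINE_LOOP; pixel_avg2 then simply pavgb's the two recombined sources together.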
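The frame.c half of the patch exists to make that workaround usable for mc_luma: the full-pel plane and the three half-pel filtered planes are now carved out of a single allocation, so the offsets between corresponding pixels in the four planes are multiples of luma_plane_size and the planes stay in-phase with respect to cacheline alignment, as the new comment in frame.c states. A minimal sketch of the layout, with hypothetical names and illustrative padding constants rather than x264's real PADH/PADV:

#include <stdint.h>
#include <stdlib.h>

#define PADH 32   /* illustrative padding values only */
#define PADV 32

typedef struct
{
    uint8_t *buffer;       /* one allocation backing all four luma planes */
    uint8_t *filtered[4];  /* full-pel + h/v/hv half-pel planes */
} luma_planes_t;

static int luma_planes_alloc( luma_planes_t *f, int stride, int lines )
{
    int luma_plane_size = stride * (lines + 2*PADV);
    int i;
    f->buffer = malloc( 4 * luma_plane_size );
    if( !f->buffer )
        return -1;
    for( i = 0; i < 4; i++ )
        f->filtered[i] = f->buffer + i*luma_plane_size + stride*PADV + PADH;
    /* in the patch, frame->plane[0] simply aliases frame->filtered[0] */
    return 0;
}

With the previous layout each filtered plane had its own malloc, so a single alignment check on one source pointer could not be relied on to describe the other planes that pixel_avg2 mixes together.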