From 8d1ebe2eeb30a204b588502d69d361ee85187821 Mon Sep 17 00:00:00 2001 From: Loren Merritt Date: Tue, 10 Oct 2006 21:26:31 +0000 Subject: [PATCH] prefetch pixels for motion compensation and deblocking. git-svn-id: svn://svn.videolan.org/x264/trunk@590 df754926-b1dd-0310-bc7b-ec298dee348c --- common/amd64/mc-a.asm | 49 +++++++++++++++++++++++++++++++++++ common/frame.c | 2 ++ common/i386/mc-a.asm | 59 +++++++++++++++++++++++++++++++++++++++++++ common/i386/mc-c.c | 5 ++++ common/macroblock.c | 13 ++++++++++ common/macroblock.h | 2 ++ common/mc.c | 10 ++++++++ common/mc.h | 7 +++++ encoder/analyse.c | 4 +++ 9 files changed, 151 insertions(+) diff --git a/common/amd64/mc-a.asm b/common/amd64/mc-a.asm index ad3e73fb..8ae1416b 100644 --- a/common/amd64/mc-a.asm +++ b/common/amd64/mc-a.asm @@ -72,6 +72,9 @@ cglobal x264_mc_copy_w16_sse2 cglobal x264_mc_chroma_mmxext +cglobal x264_prefetch_fenc_mmxext +cglobal x264_prefetch_ref_mmxext + ;============================================================================= ; pixel avg ;============================================================================= @@ -549,3 +552,49 @@ ALIGN 4 dec r11d jnz .height_loop1_w8 rep ret + + + +;----------------------------------------------------------------------------- +; void x264_prefetch_fenc_mmxext( uint8_t *pix_y, int stride_y, +; uint8_t *pix_uv, int stride_uv, int mb_x ) +;----------------------------------------------------------------------------- +ALIGN 16 +x264_prefetch_fenc_mmxext: + mov eax, parm5d ; eax = mb_x + and eax, 3 ; mb_x&3 selects which group of 4 luma rows this call covers + imul eax, parm2d ; times stride_y + lea parm1q, [parm1q+rax*4+64] ; skip (mb_x&3)*4 rows and 64 bytes from pix_y + prefetcht0 [parm1q] ; first of 4 consecutive luma rows + prefetcht0 [parm1q+parm2q] ; NOTE(review): parm2q and parm4q read the full 64-bit registers that carry the int strides; presumably callers leave the upper halves clean, confirm + lea parm1q, [parm1q+parm2q*2] ; step down 2 rows + prefetcht0 [parm1q] + prefetcht0 [parm1q+parm2q] + + mov eax, parm5d ; eax = mb_x again + and eax, 6 ; mb_x&6 selects the chroma row pair: 0, 2, 4 or 6 + imul eax, parm4d ; times stride_uv + lea parm3q, [parm3q+rax+64] ; skip (mb_x&6) rows and 64 bytes from pix_uv + prefetcht0 [parm3q] ; 2 consecutive chroma rows + prefetcht0 [parm3q+parm4q] + ret + +;----------------------------------------------------------------------------- +; void x264_prefetch_ref_mmxext( uint8_t *pix, int stride, int 
parity ) +;----------------------------------------------------------------------------- +ALIGN 16 +x264_prefetch_ref_mmxext: + dec parm3d ; parity-1 is all ones for parity 0 and zero for parity 1 + and parm3d, parm2d ; so this is stride for parity 0 and 0 for parity 1 + lea parm1q, [parm1q+parm3q*8+64] ; start 64 bytes past pix, 8 rows down when parity is 0 + lea rax, [parm2q*3] ; rax = 3*stride + prefetcht0 [parm1q] ; rows 0..3 of the 8-row window + prefetcht0 [parm1q+parm2q] + prefetcht0 [parm1q+parm2q*2] + prefetcht0 [parm1q+rax] + lea parm1q, [parm1q+parm2q*4] ; advance 4 rows + prefetcht0 [parm1q] ; rows 4..7 of the 8-row window + prefetcht0 [parm1q+parm2q] + prefetcht0 [parm1q+parm2q*2] + prefetcht0 [parm1q+rax] + ret diff --git a/common/frame.c b/common/frame.c index 96ba0d7a..77209479 100644 --- a/common/frame.c +++ b/common/frame.c @@ -536,6 +536,8 @@ void x264_frame_deblocking_filter( x264_t *h, int i_slice_type ) i_pix_y[2] -= 7*h->fdec->i_stride[2]; } + x264_prefetch_fenc( h, h->fdec, mb_x, mb_y ); /* prefetch fdec pixels ahead of the macroblock being deblocked */ + /* cavlc + 8x8 transform stores nnz per 16 coeffs for the purpose of * entropy coding, but per 64 coeffs for the purpose of deblocking */ if( !h->param.b_cabac && b_8x8_transform ) diff --git a/common/i386/mc-a.asm b/common/i386/mc-a.asm index d48ccfd9..6f233c4c 100644 --- a/common/i386/mc-a.asm +++ b/common/i386/mc-a.asm @@ -72,6 +72,9 @@ cglobal x264_mc_copy_w16_sse2 cglobal x264_mc_chroma_mmxext +cglobal x264_prefetch_fenc_mmxext +cglobal x264_prefetch_ref_mmxext + ;============================================================================= ; pixel avg ;============================================================================= @@ -595,3 +598,59 @@ ALIGN 4 pop edi picpop ebx ret + + + +; prefetches tuned for 64 byte cachelines (K7/K8/Core2) +; TODO add 32 and 128 byte versions for P3/P4 + +;----------------------------------------------------------------------------- +; void x264_prefetch_fenc_mmxext( uint8_t *pix_y, int stride_y, +; uint8_t *pix_uv, int stride_uv, int mb_x ) +;----------------------------------------------------------------------------- +ALIGN 16 +x264_prefetch_fenc_mmxext: + mov eax, [esp+20] ; eax = mb_x (5th stack argument) + mov ecx, [esp+8] ; ecx = stride_y + mov edx, [esp+4] ; edx = pix_y + and eax, 3 ; mb_x&3 selects which group of 4 luma rows this call covers + imul eax, ecx ; times stride_y + lea edx, [edx+eax*4+64] ; skip (mb_x&3)*4 rows and 64 bytes from pix_y + 
prefetcht0 [edx] ; first of 4 consecutive luma rows + prefetcht0 [edx+ecx] + lea edx, [edx+ecx*2] ; step down 2 rows + prefetcht0 [edx] + prefetcht0 [edx+ecx] + + mov eax, [esp+20] ; eax = mb_x again + mov ecx, [esp+16] ; ecx = stride_uv + mov edx, [esp+12] ; edx = pix_uv + and eax, 6 ; mb_x&6 selects the chroma row pair: 0, 2, 4 or 6 + imul eax, ecx ; times stride_uv + lea edx, [edx+eax+64] ; skip (mb_x&6) rows and 64 bytes from pix_uv + prefetcht0 [edx] ; 2 consecutive chroma rows + prefetcht0 [edx+ecx] + ret + +;----------------------------------------------------------------------------- +; void x264_prefetch_ref_mmxext( uint8_t *pix, int stride, int parity ) +;----------------------------------------------------------------------------- +ALIGN 16 +x264_prefetch_ref_mmxext: + mov eax, [esp+12] ; eax = parity + mov ecx, [esp+8] ; ecx = stride + mov edx, [esp+4] ; edx = pix + sub eax, 1 ; parity-1 is all ones for parity 0 and zero for parity 1 + and eax, ecx ; so this is stride for parity 0 and 0 for parity 1 + lea edx, [edx+eax*8+64] ; start 64 bytes past pix, 8 rows down when parity is 0 + lea eax, [ecx*3] ; eax = 3*stride + prefetcht0 [edx] ; rows 0..3 of the 8-row window + prefetcht0 [edx+ecx] + prefetcht0 [edx+ecx*2] + prefetcht0 [edx+eax] + lea edx, [edx+ecx*4] ; advance 4 rows + prefetcht0 [edx] ; rows 4..7 of the 8-row window + prefetcht0 [edx+ecx] + prefetcht0 [edx+ecx*2] + prefetcht0 [edx+eax] + ret diff --git a/common/i386/mc-c.c b/common/i386/mc-c.c index fa15fcd7..2f89b105 100644 --- a/common/i386/mc-c.c +++ b/common/i386/mc-c.c @@ -40,6 +40,8 @@ extern void x264_mc_copy_w8_mmx( uint8_t *, int, uint8_t *, int, int ); extern void x264_mc_copy_w16_mmx( uint8_t *, int, uint8_t *, int, int ); extern void x264_mc_copy_w16_sse2( uint8_t *, int, uint8_t *, int, int ); extern void x264_plane_copy_mmxext( uint8_t *, int, uint8_t *, int, int w, int h); +extern void x264_prefetch_fenc_mmxext( uint8_t *, int, uint8_t *, int, int ); /* pix_y, stride_y, pix_uv, stride_uv, mb_x */ +extern void x264_prefetch_ref_mmxext( uint8_t *, int, int ); /* pix, stride, parity */ #define AVG(W,H) \ static void x264_pixel_avg_ ## W ## x ## H ## _mmxext( uint8_t *dst, int i_dst, uint8_t *src, int i_src ) \ @@ -161,6 +163,9 @@ void x264_mc_mmxext_init( x264_mc_functions_t *pf ) pf->copy[PIXEL_4x4] = x264_mc_copy_w4_mmx; pf->plane_copy = x264_plane_copy_mmxext; + + pf->prefetch_fenc = x264_prefetch_fenc_mmxext; /* install the asm prefetchers in the function table */ + pf->prefetch_ref = x264_prefetch_ref_mmxext; } void x264_mc_sse2_init( x264_mc_functions_t *pf ) { diff --git a/common/macroblock.c b/common/macroblock.c index 41b4adaf..f9744e8b 100644 
--- a/common/macroblock.c +++ b/common/macroblock.c @@ -927,6 +927,15 @@ void x264_macroblock_slice_init( x264_t *h ) memset( h->mb.cache.skip, 0, X264_SCAN8_SIZE * sizeof( int8_t ) ); } +void x264_prefetch_fenc( x264_t *h, x264_frame_t *fenc, int i_mb_x, int i_mb_y ) /* issue prefetches relative to macroblock (i_mb_x, i_mb_y) of the given frame; despite the parameter name, callers pass h->fenc or h->fdec */ +{ + int stride_y = fenc->i_stride[0]; + int stride_uv = fenc->i_stride[1]; /* plane 1 stride is used for whichever chroma plane is selected below */ + int off_y = 16 * (i_mb_x + i_mb_y * stride_y); /* byte offset of the macroblock in the luma plane */ + int off_uv = 8 * (i_mb_x + i_mb_y * stride_uv); /* byte offset of the macroblock in a chroma plane */ + h->mc.prefetch_fenc( fenc->plane[0]+off_y, stride_y, + fenc->plane[1+(i_mb_x&1)]+off_uv, stride_uv, i_mb_x ); /* chroma plane alternates with i_mb_x: plane[1] when even, plane[2] when odd */ +} void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y ) { @@ -1143,6 +1152,8 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y ) h->mb.pic.p_integral[1][i] = &h->fref1[i]->integral[ 16 * ( i_mb_x + i_mb_y * h->fdec->i_stride[0] )]; } + x264_prefetch_fenc( h, h->fenc, i_mb_x, i_mb_y ); /* prefetch source-frame (fenc) pixels while loading this macroblock */ + /* load ref/mv/mvd */ if( h->sh.i_type != SLICE_TYPE_I ) { @@ -1359,6 +1370,8 @@ void x264_macroblock_cache_save( x264_t *h ) h->mb.pic.p_fdec[i], FDEC_STRIDE, w ); } + x264_prefetch_fenc( h, h->fdec, h->mb.i_mb_x, h->mb.i_mb_y ); /* same helper applied to the reconstructed frame (fdec) */ + h->mb.type[i_mb_xy] = i_mb_type; if( h->mb.i_type != I_16x16 && h->mb.i_cbp_luma == 0 && h->mb.i_cbp_chroma == 0 ) diff --git a/common/macroblock.h b/common/macroblock.h index ff9dd826..6a9e733d 100644 --- a/common/macroblock.h +++ b/common/macroblock.h @@ -248,6 +248,8 @@ void x264_macroblock_cache_end( x264_t *h ); void x264_macroblock_bipred_init( x264_t *h ); +void x264_prefetch_fenc( x264_t *h, x264_frame_t *fenc, int i_mb_x, int i_mb_y ); /* fenc may be either h->fenc or h->fdec */ + /* x264_mb_predict_mv_16x16: * set mvp with predicted mv for D_16x16 block * h->mb. 
need only valid values from other blocks */ diff --git a/common/mc.c b/common/mc.c index 43292808..7721b63a 100644 --- a/common/mc.c +++ b/common/mc.c @@ -327,6 +327,13 @@ static void plane_copy( uint8_t *dst, int i_dst, } } +static void prefetch_fenc_null( uint8_t *pix_y, int stride_y, /* file-local: only x264_mc_init below refers to it */ + uint8_t *pix_uv, int stride_uv, int mb_x ) +{} /* portable fallback: deliberately does nothing */ + +static void prefetch_ref_null( uint8_t *pix, int stride, int parity ) /* file-local: only x264_mc_init below refers to it */ +{} /* portable fallback: deliberately does nothing */ + void x264_mc_init( int cpu, x264_mc_functions_t *pf ) { pf->mc_luma = mc_luma; @@ -361,6 +368,9 @@ void x264_mc_init( int cpu, x264_mc_functions_t *pf ) pf->plane_copy = plane_copy; + pf->prefetch_fenc = prefetch_fenc_null; /* defaults; x264_mc_mmxext_init below may replace them */ + pf->prefetch_ref = prefetch_ref_null; + #ifdef HAVE_MMXEXT if( cpu&X264_CPU_MMXEXT ) { x264_mc_mmxext_init( pf ); diff --git a/common/mc.h b/common/mc.h index 52f5b8e9..9c9fe517 100644 --- a/common/mc.h +++ b/common/mc.h @@ -55,6 +55,13 @@ typedef struct void (*plane_copy)( uint8_t *dst, int i_dst, uint8_t *src, int i_src, int w, int h); + + /* prefetch the next few macroblocks of fenc or fdec */ + void (*prefetch_fenc)( uint8_t *pix_y, int stride_y, + uint8_t *pix_uv, int stride_uv, int mb_x ); + /* prefetch the next few macroblocks of a hpel reference frame */ + void (*prefetch_ref)( uint8_t *pix, int stride, int parity ); + } x264_mc_functions_t; void x264_mc_init( int cpu, x264_mc_functions_t *pf ); diff --git a/encoder/analyse.c b/encoder/analyse.c index 064932e2..c3d3c261 100644 --- a/encoder/analyse.c +++ b/encoder/analyse.c @@ -1996,6 +1996,8 @@ void x264_macroblock_analyse( x264_t *h ) int b_skip = 0; int i_intra_cost, i_intra_type; + h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 0 ); /* first half of the reference prefetch (parity 0), issued before the P_SKIP probe */ + /* Fast P_SKIP detection */ analysis.b_try_pskip = 0; if( h->param.analyse.b_fast_pskip ) @@ -2009,6 +2011,8 @@ void x264_macroblock_analyse( x264_t *h ) b_skip = x264_macroblock_probe_pskip( h ); } + h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 1 ); /* second half (parity 1), issued after the probe */ + if( b_skip ) { h->mb.i_type = 
P_SKIP; -- 2.40.0