cglobal x264_mc_chroma_mmxext
+cglobal x264_prefetch_fenc_mmxext
+cglobal x264_prefetch_ref_mmxext
+
;=============================================================================
; pixel avg
;=============================================================================
dec r11d
jnz .height_loop1_w8
rep ret
+
+
+
+;-----------------------------------------------------------------------------
+; void x264_prefetch_fenc_mmxext( uint8_t *pix_y, int stride_y,
+;                                 uint8_t *pix_uv, int stride_uv, int mb_x )
+;-----------------------------------------------------------------------------
+ALIGN 16
+x264_prefetch_fenc_mmxext:
+    ; luma: 4 rows, 64 bytes (4 MBs) ahead; (mb_x&3)*4 rotates through row
+    ; groups so 4 consecutive calls cover a full 16-row macroblock
+    mov    eax, parm5d
+    and    eax, 3
+    imul   eax, parm2d
+    lea    parm1q, [parm1q+rax*4+64]
+    prefetcht0  [parm1q]
+    prefetcht0  [parm1q+parm2q]
+    lea    parm1q, [parm1q+parm2q*2]
+    prefetcht0  [parm1q]
+    prefetcht0  [parm1q+parm2q]
+
+    ; chroma: 2 rows, 64 bytes ahead; (mb_x&6) rotates through row pairs
+    ; so 4 consecutive calls cover all 8 chroma rows
+    mov    eax, parm5d
+    and    eax, 6
+    imul   eax, parm4d
+    lea    parm3q, [parm3q+rax+64]
+    prefetcht0  [parm3q]
+    prefetcht0  [parm3q+parm4q]
+    ret
+
+;-----------------------------------------------------------------------------
+; void x264_prefetch_ref_mmxext( uint8_t *pix, int stride, int parity )
+;-----------------------------------------------------------------------------
+ALIGN 16
+x264_prefetch_ref_mmxext:
+    ; (parity-1)&stride = stride if parity==0, else 0: the two calls from
+    ; the analyser prefetch rows 8-15 and rows 0-7 respectively, 64B ahead
+    dec    parm3d
+    and    parm3d, parm2d
+    lea    parm1q, [parm1q+parm3q*8+64]
+    lea    rax, [parm2q*3]
+    prefetcht0  [parm1q]
+    prefetcht0  [parm1q+parm2q]
+    prefetcht0  [parm1q+parm2q*2]
+    prefetcht0  [parm1q+rax]
+    lea    parm1q, [parm1q+parm2q*4]
+    prefetcht0  [parm1q]
+    prefetcht0  [parm1q+parm2q]
+    prefetcht0  [parm1q+parm2q*2]
+    prefetcht0  [parm1q+rax]
+    ret
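For reference, the address arithmetic above in rough C form (an illustrative
sketch, not part of the patch: the helper names are hypothetical and
prefetcht0 is approximated with GCC's __builtin_prefetch):

    #include <stdint.h>

    /* x264_prefetch_fenc_mmxext: 4 luma rows and 2 chroma rows, 64 bytes
     * (4 luma MBs) ahead; the (mb_x&3) / (mb_x&6) terms rotate through row
     * groups so consecutive calls cover whole macroblocks */
    static void prefetch_fenc_sketch( uint8_t *pix_y, int stride_y,
                                      uint8_t *pix_uv, int stride_uv, int mb_x )
    {
        uint8_t *p = pix_y + (mb_x & 3) * 4 * stride_y + 64;
        int i;
        for( i = 0; i < 4; i++ )
            __builtin_prefetch( p + i * stride_y );
        p = pix_uv + (mb_x & 6) * stride_uv + 64;
        __builtin_prefetch( p );
        __builtin_prefetch( p + stride_uv );
    }

    /* x264_prefetch_ref_mmxext: (parity-1)&stride is stride when parity==0
     * and 0 when parity==1, so the two calls cover rows 8-15 and rows 0-7
     * of a 16-row region, 64 bytes ahead */
    static void prefetch_ref_sketch( uint8_t *pix, int stride, int parity )
    {
        uint8_t *p = pix + ((parity - 1) & stride) * 8 + 64;
        int i;
        for( i = 0; i < 8; i++ )
            __builtin_prefetch( p + i * stride );
    }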
i_pix_y[2] -= 7*h->fdec->i_stride[2];
}
+    /* prefetch the next few MBs of reconstructed (fdec) pixels */
+    x264_prefetch_fenc( h, h->fdec, mb_x, mb_y );
+
/* cavlc + 8x8 transform stores nnz per 16 coeffs for the purpose of
* entropy coding, but per 64 coeffs for the purpose of deblocking */
if( !h->param.b_cabac && b_8x8_transform )
cglobal x264_mc_chroma_mmxext
+cglobal x264_prefetch_fenc_mmxext
+cglobal x264_prefetch_ref_mmxext
+
;=============================================================================
; pixel avg
;=============================================================================
pop edi
picpop ebx
ret
+
+
+
+; prefetches tuned for 64-byte cache lines (K7/K8/Core2)
+; TODO: add 32- and 128-byte versions for P3/P4
+
+;-----------------------------------------------------------------------------
+; void x264_prefetch_fenc_mmxext( uint8_t *pix_y, int stride_y,
+;                                 uint8_t *pix_uv, int stride_uv, int mb_x )
+;-----------------------------------------------------------------------------
+ALIGN 16
+x264_prefetch_fenc_mmxext:
+    ; cdecl stack: [esp+4]=pix_y  [esp+8]=stride_y  [esp+12]=pix_uv
+    ;              [esp+16]=stride_uv  [esp+20]=mb_x
+    mov    eax, [esp+20]
+    mov    ecx, [esp+8]
+    mov    edx, [esp+4]
+    and    eax, 3
+    imul   eax, ecx
+    lea    edx, [edx+eax*4+64]    ; pix_y + (mb_x&3)*4*stride_y + 64
+    prefetcht0  [edx]
+    prefetcht0  [edx+ecx]
+    lea    edx, [edx+ecx*2]
+    prefetcht0  [edx]
+    prefetcht0  [edx+ecx]
+
+    mov    eax, [esp+20]
+    mov    ecx, [esp+16]
+    mov    edx, [esp+12]
+    and    eax, 6
+    imul   eax, ecx
+    lea    edx, [edx+eax+64]      ; pix_uv + (mb_x&6)*stride_uv + 64
+    prefetcht0  [edx]
+    prefetcht0  [edx+ecx]
+    ret
+
+;-----------------------------------------------------------------------------
+; void x264_prefetch_ref_mmxext( uint8_t *pix, int stride, int parity )
+;-----------------------------------------------------------------------------
+ALIGN 16
+x264_prefetch_ref_mmxext:
+    mov    eax, [esp+12]          ; parity
+    mov    ecx, [esp+8]           ; stride
+    mov    edx, [esp+4]           ; pix
+    sub    eax, 1
+    and    eax, ecx               ; stride if parity==0, else 0
+    lea    edx, [edx+eax*8+64]    ; rows 8-15 or rows 0-7, 64 bytes ahead
+    lea    eax, [ecx*3]
+    prefetcht0  [edx]
+    prefetcht0  [edx+ecx]
+    prefetcht0  [edx+ecx*2]
+    prefetcht0  [edx+eax]
+    lea    edx, [edx+ecx*4]
+    prefetcht0  [edx]
+    prefetcht0  [edx+ecx]
+    prefetcht0  [edx+ecx*2]
+    prefetcht0  [edx+eax]
+    ret
extern void x264_mc_copy_w16_mmx( uint8_t *, int, uint8_t *, int, int );
extern void x264_mc_copy_w16_sse2( uint8_t *, int, uint8_t *, int, int );
extern void x264_plane_copy_mmxext( uint8_t *, int, uint8_t *, int, int w, int h);
+extern void x264_prefetch_fenc_mmxext( uint8_t *, int, uint8_t *, int, int );
+extern void x264_prefetch_ref_mmxext( uint8_t *, int, int );
#define AVG(W,H) \
static void x264_pixel_avg_ ## W ## x ## H ## _mmxext( uint8_t *dst, int i_dst, uint8_t *src, int i_src ) \
pf->copy[PIXEL_4x4] = x264_mc_copy_w4_mmx;
pf->plane_copy = x264_plane_copy_mmxext;
+
+    pf->prefetch_fenc = x264_prefetch_fenc_mmxext;
+    pf->prefetch_ref  = x264_prefetch_ref_mmxext;
}
void x264_mc_sse2_init( x264_mc_functions_t *pf )
{
memset( h->mb.cache.skip, 0, X264_SCAN8_SIZE * sizeof( int8_t ) );
}
+void x264_prefetch_fenc( x264_t *h, x264_frame_t *fenc, int i_mb_x, int i_mb_y )
+{
+    int stride_y  = fenc->i_stride[0];
+    int stride_uv = fenc->i_stride[1];
+    int off_y  = 16 * (i_mb_x + i_mb_y * stride_y);
+    int off_uv =  8 * (i_mb_x + i_mb_y * stride_uv);
+    /* alternate between the U and V plane on successive MBs */
+    h->mc.prefetch_fenc( fenc->plane[0]+off_y, stride_y,
+                         fenc->plane[1+(i_mb_x&1)]+off_uv, stride_uv, i_mb_x );
+}
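To make the offset math concrete (numbers assumed purely for illustration):
with stride_y = 1024, stride_uv = 512 and macroblock (i_mb_x, i_mb_y) = (5, 2),
off_y = 16*(5 + 2*1024) = 32848 and off_uv = 8*(5 + 2*512) = 8232, i.e. the
byte offsets of the MB's top-left luma and chroma samples within their planes.
The +64 applied inside the asm routines then moves the prefetch four luma MBs
(eight 8-byte-wide chroma MBs) to the right of that point.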
void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
{
h->mb.pic.p_integral[1][i] = &h->fref1[i]->integral[ 16 * ( i_mb_x + i_mb_y * h->fdec->i_stride[0] )];
}
+    /* prefetch source (fenc) pixels of upcoming MBs */
+    x264_prefetch_fenc( h, h->fenc, i_mb_x, i_mb_y );
+
/* load ref/mv/mvd */
if( h->sh.i_type != SLICE_TYPE_I )
{
h->mb.pic.p_fdec[i], FDEC_STRIDE, w );
}
+    /* prefetch reconstructed (fdec) pixels; fenc was prefetched in cache_load */
+    x264_prefetch_fenc( h, h->fdec, h->mb.i_mb_x, h->mb.i_mb_y );
+
h->mb.type[i_mb_xy] = i_mb_type;
if( h->mb.i_type != I_16x16 && h->mb.i_cbp_luma == 0 && h->mb.i_cbp_chroma == 0 )
void x264_macroblock_bipred_init( x264_t *h );
+void x264_prefetch_fenc( x264_t *h, x264_frame_t *fenc, int i_mb_x, int i_mb_y );
+
/* x264_mb_predict_mv_16x16:
* set mvp with predicted mv for D_16x16 block
* h->mb. need only valid values from other blocks */
}
}
+void prefetch_fenc_null( uint8_t *pix_y, int stride_y,
+                         uint8_t *pix_uv, int stride_uv, int mb_x )
+{}
+
+void prefetch_ref_null( uint8_t *pix, int stride, int parity )
+{}
+
void x264_mc_init( int cpu, x264_mc_functions_t *pf )
{
pf->mc_luma = mc_luma;
pf->plane_copy = plane_copy;
+    pf->prefetch_fenc = prefetch_fenc_null;
+    pf->prefetch_ref  = prefetch_ref_null;
+
#ifdef HAVE_MMXEXT
if( cpu&X264_CPU_MMXEXT ) {
x264_mc_mmxext_init( pf );
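The empty stubs are deliberate: x264_mc_init installs them as defaults, so the
function pointers are always callable and no call site needs a guard such as
the following (hypothetical alternative, shown only for contrast):

    if( h->mc.prefetch_fenc != NULL )
        h->mc.prefetch_fenc( pix_y, stride_y, pix_uv, stride_uv, mb_x );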
void (*plane_copy)( uint8_t *dst, int i_dst,
uint8_t *src, int i_src, int w, int h);
+
+    /* prefetch the next few macroblocks of fenc or fdec */
+    void (*prefetch_fenc)( uint8_t *pix_y, int stride_y,
+                           uint8_t *pix_uv, int stride_uv, int mb_x );
+    /* prefetch the next few macroblocks of a hpel reference frame */
+    void (*prefetch_ref)( uint8_t *pix, int stride, int parity );
+
} x264_mc_functions_t;
void x264_mc_init( int cpu, x264_mc_functions_t *pf );
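A minimal sketch of how the new hooks are driven (hypothetical caller code;
cpu_flags and the pixel pointers are assumptions, not from the patch):

    x264_mc_functions_t mc;
    x264_mc_init( cpu_flags, &mc );  /* mmxext versions, or the null stubs */
    mc.prefetch_fenc( fenc_y, stride_y, fenc_uv, stride_uv, mb_x );
    mc.prefetch_ref( ref_pix, ref_stride, 0 );  /* half before the pskip probe */
    mc.prefetch_ref( ref_pix, ref_stride, 1 );  /* the other half, after it */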
int b_skip = 0;
int i_intra_cost, i_intra_type;
+    /* prefetch one half of the likely reference region before the P_SKIP probe */
+    h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 0 );
+
/* Fast P_SKIP detection */
analysis.b_try_pskip = 0;
if( h->param.analyse.b_fast_pskip )
b_skip = x264_macroblock_probe_pskip( h );
}
+    /* ...and the other half after it */
+    h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 1 );
+
if( b_skip )
{
h->mb.i_type = P_SKIP;