From 8d1ebe2eeb30a204b588502d69d361ee85187821 Mon Sep 17 00:00:00 2001
From: Loren Merritt <pengvado@videolan.org>
Date: Tue, 10 Oct 2006 21:26:31 +0000
Subject: [PATCH] prefetch pixels for motion compensation and deblocking.

git-svn-id: svn://svn.videolan.org/x264/trunk@590 df754926-b1dd-0310-bc7b-ec298dee348c
---
 common/amd64/mc-a.asm | 49 +++++++++++++++++++++++++++++++++++
 common/frame.c        |  2 ++
 common/i386/mc-a.asm  | 59 +++++++++++++++++++++++++++++++++++++++++++
 common/i386/mc-c.c    |  5 ++++
 common/macroblock.c   | 13 ++++++++++
 common/macroblock.h   |  2 ++
 common/mc.c           | 10 ++++++++
 common/mc.h           |  7 +++++
 encoder/analyse.c     |  4 +++
 9 files changed, 151 insertions(+)

diff --git a/common/amd64/mc-a.asm b/common/amd64/mc-a.asm
index ad3e73fb..8ae1416b 100644
--- a/common/amd64/mc-a.asm
+++ b/common/amd64/mc-a.asm
@@ -72,6 +72,9 @@ cglobal x264_mc_copy_w16_sse2
 
 cglobal x264_mc_chroma_mmxext
 
+cglobal x264_prefetch_fenc_mmxext
+cglobal x264_prefetch_ref_mmxext
+
 ;=============================================================================
 ; pixel avg
 ;=============================================================================
@@ -549,3 +552,53 @@ ALIGN 4
     dec      r11d
     jnz .height_loop1_w8
     rep ret
+
+
+
+;-----------------------------------------------------------------------------
+; void x264_prefetch_fenc_mmxext( uint8_t *pix_y, int stride_y,
+;                                 uint8_t *pix_uv, int stride_uv, int mb_x )
+;-----------------------------------------------------------------------------
+ALIGN 16
+x264_prefetch_fenc_mmxext:
+    movsxd  parm2q, parm2d              ; stride_y is a 32-bit int: widen before using it as a 64-bit index
+    mov     eax, parm5d
+    and     eax, 3                      ; rax = mb_x & 3 (32-bit write clears the upper half)
+    imul    rax, parm2q                 ; rax = (mb_x&3) * stride_y, signed 64-bit
+    lea  parm1q, [parm1q+rax*4+64]      ; luma: row 4*(mb_x&3), 64 bytes past pix_y
+    prefetcht0   [parm1q]
+    prefetcht0   [parm1q+parm2q]
+    lea  parm1q, [parm1q+parm2q*2]
+    prefetcht0   [parm1q]
+    prefetcht0   [parm1q+parm2q]
+
+    movsxd  parm4q, parm4d              ; same widening for stride_uv
+    mov     eax, parm5d
+    and     eax, 6                      ; rax = mb_x & 6
+    imul    rax, parm4q                 ; rax = (mb_x&6) * stride_uv, signed 64-bit
+    lea  parm3q, [parm3q+rax+64]        ; chroma: row (mb_x&6), 64 bytes past pix_uv
+    prefetcht0   [parm3q]
+    prefetcht0   [parm3q+parm4q]
+    ret
+
+;-----------------------------------------------------------------------------
+; void x264_prefetch_ref_mmxext( uint8_t *pix, int stride, int parity )
+;-----------------------------------------------------------------------------
+ALIGN 16
+x264_prefetch_ref_mmxext:
+    movsxd  parm2q, parm2d              ; stride and parity are 32-bit ints: widen both
+    movsxd  parm3q, parm3d
+    dec     parm3q                      ; parity 0 -> all ones, parity 1 -> 0
+    and     parm3q, parm2q              ; parity 0 -> stride,   parity 1 -> 0
+    lea  parm1q, [parm1q+parm3q*8+64]   ; start 8 rows down when parity is 0; always 64 bytes right
+    lea     rax, [parm2q*3]             ; rax = 3*stride, shared by both groups of four rows
+    prefetcht0   [parm1q]
+    prefetcht0   [parm1q+parm2q]
+    prefetcht0   [parm1q+parm2q*2]
+    prefetcht0   [parm1q+rax]
+    lea  parm1q, [parm1q+parm2q*4]
+    prefetcht0   [parm1q]
+    prefetcht0   [parm1q+parm2q]
+    prefetcht0   [parm1q+parm2q*2]
+    prefetcht0   [parm1q+rax]
+    ret
diff --git a/common/frame.c b/common/frame.c
index 96ba0d7a..77209479 100644
--- a/common/frame.c
+++ b/common/frame.c
@@ -536,6 +536,8 @@ void x264_frame_deblocking_filter( x264_t *h, int i_slice_type )
             i_pix_y[2] -=  7*h->fdec->i_stride[2];
         }
 
+        x264_prefetch_fenc( h, h->fdec, mb_x, mb_y );
+
         /* cavlc + 8x8 transform stores nnz per 16 coeffs for the purpose of
          * entropy coding, but per 64 coeffs for the purpose of deblocking */
         if( !h->param.b_cabac && b_8x8_transform )
diff --git a/common/i386/mc-a.asm b/common/i386/mc-a.asm
index d48ccfd9..6f233c4c 100644
--- a/common/i386/mc-a.asm
+++ b/common/i386/mc-a.asm
@@ -72,6 +72,9 @@ cglobal x264_mc_copy_w16_sse2
 
 cglobal x264_mc_chroma_mmxext
 
+cglobal x264_prefetch_fenc_mmxext
+cglobal x264_prefetch_ref_mmxext
+
 ;=============================================================================
 ; pixel avg
 ;=============================================================================
@@ -595,3 +598,59 @@ ALIGN 4
     pop     edi
     picpop  ebx
     ret
+
+
+
+; prefetches tuned for 64 byte cachelines (K7/K8/Core2)
+; TODO add 32 and 128 byte versions for P3/P4
+
+;-----------------------------------------------------------------------------
+; void x264_prefetch_fenc_mmxext( uint8_t *pix_y, int stride_y,
+;                                 uint8_t *pix_uv, int stride_uv, int mb_x )
+;-----------------------------------------------------------------------------
+ALIGN 16
+x264_prefetch_fenc_mmxext:
+    mov   ecx, [esp+20]             ; ecx = mb_x
+    mov   eax, [esp+8]              ; eax = stride_y
+    mov   edx, [esp+4]              ; edx = pix_y
+    and   ecx, 3
+    imul  ecx, eax                  ; ecx = (mb_x&3) * stride_y
+    lea   edx, [edx+ecx*4+64]       ; luma: row 4*(mb_x&3), 64 bytes past pix_y
+    prefetcht0 [edx]
+    prefetcht0 [edx+eax]
+    lea   edx, [edx+eax*2]
+    prefetcht0 [edx]
+    prefetcht0 [edx+eax]
+
+    mov   ecx, [esp+20]             ; ecx = mb_x again (clobbered above)
+    mov   eax, [esp+16]             ; eax = stride_uv
+    mov   edx, [esp+12]             ; edx = pix_uv
+    and   ecx, 6
+    imul  ecx, eax                  ; ecx = (mb_x&6) * stride_uv
+    lea   edx, [edx+ecx+64]         ; chroma: row (mb_x&6), 64 bytes past pix_uv
+    prefetcht0 [edx]
+    prefetcht0 [edx+eax]
+    ret
+
+;-----------------------------------------------------------------------------
+; void x264_prefetch_ref_mmxext( uint8_t *pix, int stride, int parity )
+;-----------------------------------------------------------------------------
+ALIGN 16
+x264_prefetch_ref_mmxext:
+    mov   ecx, [esp+12]             ; ecx = parity
+    mov   eax, [esp+8]              ; eax = stride
+    mov   edx, [esp+4]              ; edx = pix
+    dec   ecx                       ; parity 0 -> all ones, parity 1 -> 0
+    and   ecx, eax                  ; parity 0 -> stride,   parity 1 -> 0
+    lea   edx, [edx+ecx*8+64]       ; start 8 rows down when parity is 0; always 64 bytes right
+    lea   ecx, [eax*3]              ; ecx = 3*stride, shared by both groups of four rows
+    prefetcht0 [edx]
+    prefetcht0 [edx+eax]
+    prefetcht0 [edx+eax*2]
+    prefetcht0 [edx+ecx]
+    lea   edx, [edx+eax*4]
+    prefetcht0 [edx]
+    prefetcht0 [edx+eax]
+    prefetcht0 [edx+eax*2]
+    prefetcht0 [edx+ecx]
+    ret
diff --git a/common/i386/mc-c.c b/common/i386/mc-c.c
index fa15fcd7..2f89b105 100644
--- a/common/i386/mc-c.c
+++ b/common/i386/mc-c.c
@@ -40,6 +40,8 @@ extern void x264_mc_copy_w8_mmx( uint8_t *, int, uint8_t *, int, int );
 extern void x264_mc_copy_w16_mmx( uint8_t *, int, uint8_t *, int, int );
 extern void x264_mc_copy_w16_sse2( uint8_t *, int, uint8_t *, int, int );
 extern void x264_plane_copy_mmxext( uint8_t *, int, uint8_t *, int, int w, int h);
+extern void x264_prefetch_fenc_mmxext( uint8_t *, int, uint8_t *, int, int );
+extern void x264_prefetch_ref_mmxext( uint8_t *, int, int );
 
 #define AVG(W,H) \
 static void x264_pixel_avg_ ## W ## x ## H ## _mmxext( uint8_t *dst, int i_dst, uint8_t *src, int i_src ) \
@@ -161,6 +163,9 @@ void x264_mc_mmxext_init( x264_mc_functions_t *pf )
     pf->copy[PIXEL_4x4]   = x264_mc_copy_w4_mmx;
 
     pf->plane_copy = x264_plane_copy_mmxext;
+
+    pf->prefetch_fenc = x264_prefetch_fenc_mmxext;
+    pf->prefetch_ref  = x264_prefetch_ref_mmxext;
 }
 void x264_mc_sse2_init( x264_mc_functions_t *pf )
 {
diff --git a/common/macroblock.c b/common/macroblock.c
index 41b4adaf..f9744e8b 100644
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -927,6 +927,15 @@ void x264_macroblock_slice_init( x264_t *h )
         memset( h->mb.cache.skip, 0, X264_SCAN8_SIZE * sizeof( int8_t ) );
 }
 
+void x264_prefetch_fenc( x264_t *h, x264_frame_t *fenc, int i_mb_x, int i_mb_y )
+{
+    /* Luma is plane 0; the chroma plane alternates between 1 and 2 with the low bit of i_mb_x. */
+    const int i_luma         = fenc->i_stride[0];
+    const int i_chroma       = fenc->i_stride[1];
+    const int i_chroma_plane = 1 + (i_mb_x & 1);
+    h->mc.prefetch_fenc( fenc->plane[0] + 16 * (i_mb_x + i_mb_y * i_luma), i_luma,
+                         fenc->plane[i_chroma_plane] + 8 * (i_mb_x + i_mb_y * i_chroma), i_chroma, i_mb_x );
+}
 
 void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
 {
@@ -1143,6 +1152,8 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
             h->mb.pic.p_integral[1][i] = &h->fref1[i]->integral[ 16 * ( i_mb_x + i_mb_y * h->fdec->i_stride[0] )];
     }
 
+    x264_prefetch_fenc( h, h->fenc, i_mb_x, i_mb_y );
+
     /* load ref/mv/mvd */
     if( h->sh.i_type != SLICE_TYPE_I )
     {
@@ -1359,6 +1370,8 @@ void x264_macroblock_cache_save( x264_t *h )
             h->mb.pic.p_fdec[i], FDEC_STRIDE, w );
     }
 
+    x264_prefetch_fenc( h, h->fdec, h->mb.i_mb_x, h->mb.i_mb_y );
+
     h->mb.type[i_mb_xy] = i_mb_type;
 
     if( h->mb.i_type != I_16x16 && h->mb.i_cbp_luma == 0 && h->mb.i_cbp_chroma == 0 )
diff --git a/common/macroblock.h b/common/macroblock.h
index ff9dd826..6a9e733d 100644
--- a/common/macroblock.h
+++ b/common/macroblock.h
@@ -248,6 +248,8 @@ void x264_macroblock_cache_end( x264_t *h );
 
 void x264_macroblock_bipred_init( x264_t *h );
 
+void x264_prefetch_fenc( x264_t *h, x264_frame_t *fenc, int i_mb_x, int i_mb_y );
+
 /* x264_mb_predict_mv_16x16:
  *      set mvp with predicted mv for D_16x16 block
  *      h->mb. need only valid values from other blocks */
diff --git a/common/mc.c b/common/mc.c
index 43292808..7721b63a 100644
--- a/common/mc.c
+++ b/common/mc.c
@@ -327,6 +327,13 @@ static void plane_copy( uint8_t *dst, int i_dst,
     }
 }
 
+static void prefetch_fenc_null( uint8_t *pix_y, int stride_y,
+                                uint8_t *pix_uv, int stride_uv, int mb_x )
+{} /* no-op default for pf->prefetch_fenc; file-local, reached only through the function table */
+
+static void prefetch_ref_null( uint8_t *pix, int stride, int parity )
+{} /* no-op default for pf->prefetch_ref; file-local, reached only through the function table */
+
 void x264_mc_init( int cpu, x264_mc_functions_t *pf )
 {
     pf->mc_luma   = mc_luma;
@@ -361,6 +368,9 @@ void x264_mc_init( int cpu, x264_mc_functions_t *pf )
 
     pf->plane_copy = plane_copy;
 
+    pf->prefetch_fenc = prefetch_fenc_null;
+    pf->prefetch_ref  = prefetch_ref_null;
+
 #ifdef HAVE_MMXEXT
     if( cpu&X264_CPU_MMXEXT ) {
         x264_mc_mmxext_init( pf );
diff --git a/common/mc.h b/common/mc.h
index 52f5b8e9..9c9fe517 100644
--- a/common/mc.h
+++ b/common/mc.h
@@ -55,6 +55,13 @@ typedef struct
 
     void (*plane_copy)( uint8_t *dst, int i_dst,
                         uint8_t *src, int i_src, int w, int h);
+
+    /* prefetch the next few macroblocks of fenc or fdec */
+    void (*prefetch_fenc)( uint8_t *pix_y, int stride_y,
+                           uint8_t *pix_uv, int stride_uv, int mb_x );
+    /* prefetch the next few macroblocks of a hpel reference frame */
+    void (*prefetch_ref)( uint8_t *pix, int stride, int parity );
+
 } x264_mc_functions_t;
 
 void x264_mc_init( int cpu, x264_mc_functions_t *pf );
diff --git a/encoder/analyse.c b/encoder/analyse.c
index 064932e2..c3d3c261 100644
--- a/encoder/analyse.c
+++ b/encoder/analyse.c
@@ -1996,6 +1996,8 @@ void x264_macroblock_analyse( x264_t *h )
         int b_skip = 0;
         int i_intra_cost, i_intra_type;
 
+        h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 0 );
+
         /* Fast P_SKIP detection */
         analysis.b_try_pskip = 0;
         if( h->param.analyse.b_fast_pskip )
@@ -2009,6 +2011,8 @@ void x264_macroblock_analyse( x264_t *h )
                 b_skip = x264_macroblock_probe_pskip( h );
         }
 
+        h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 1 );
+
         if( b_skip )
         {
             h->mb.i_type = P_SKIP;
-- 
2.40.0