prefetch pixels for motion compensation and deblocking.

author Loren Merritt <pengvado@videolan.org>

Tue, 10 Oct 2006 21:26:31 +0000 (21:26 +0000)

committer Loren Merritt <pengvado@videolan.org>

Tue, 10 Oct 2006 21:26:31 +0000 (21:26 +0000)
author Loren Merritt <pengvado@videolan.org>
Tue, 10 Oct 2006 21:26:31 +0000 (21:26 +0000)
committer Loren Merritt <pengvado@videolan.org>
Tue, 10 Oct 2006 21:26:31 +0000 (21:26 +0000)
diff --git a/common/amd64/mc-a.asm b/common/amd64/mc-a.asm

index ad3e73fb150c77ae3b8810485b371b1ca78cf0f3..8ae1416bcf30c6735885efbfd7938fb592460a63 100644 (file)
--- a/common/amd64/mc-a.asm
+++ b/common/amd64/mc-a.asm
@@ -72,6 +72,9 @@ cglobal x264_mc_copy_w16_sse2
  
  cglobal x264_mc_chroma_mmxext
  
+cglobal x264_prefetch_fenc_mmxext
+cglobal x264_prefetch_ref_mmxext
+
  ;=============================================================================
  ; pixel avg
  ;=============================================================================
@@ -549,3 +552,49 @@ ALIGN 4
      dec      r11d
      jnz .height_loop1_w8
      rep ret
+
+
+
+;-----------------------------------------------------------------------------
+; void x264_prefetch_fenc_mmxext( uint8_t *pix_y, int stride_y, 
+;                                 uint8_t *pix_uv, int stride_uv, int mb_x )
+;-----------------------------------------------------------------------------
+ALIGN 16
+x264_prefetch_fenc_mmxext:          ; issues prefetcht0 hints for 4 luma rows and 2 chroma rows; loads/stores no pixel data
+    mov     eax, parm5d             ; eax = 5th argument (mb_x per the prototype above)
+    and     eax, 3                  ; eax = mb_x & 3
+    imul    eax, parm2d             ; eax = (mb_x & 3) * stride_y
+    lea  parm1q, [parm1q+rax*4+64]  ; pix_y += 4 * (mb_x & 3) * stride_y + 64 bytes
+    prefetcht0   [parm1q]           ; first luma row of the group
+    prefetcht0   [parm1q+parm2q]    ; next row; NOTE(review): stride_y is an int but parm2q is used as a 64-bit index - confirm the upper bits are valid
+    lea  parm1q, [parm1q+parm2q*2]  ; step down two rows
+    prefetcht0   [parm1q]           ; third luma row
+    prefetcht0   [parm1q+parm2q]    ; fourth luma row
+
+    mov     eax, parm5d             ; reload mb_x
+    and     eax, 6                  ; eax = mb_x & 6 (bit 0 dropped)
+    imul    eax, parm4d             ; eax = (mb_x & 6) * stride_uv
+    lea  parm3q, [parm3q+rax+64]    ; pix_uv += (mb_x & 6) * stride_uv + 64 bytes
+    prefetcht0   [parm3q]           ; first chroma row
+    prefetcht0   [parm3q+parm4q]    ; second chroma row; NOTE(review): same int-vs-64-bit question for stride_uv / parm4q
+    ret
+
+;-----------------------------------------------------------------------------
+; void x264_prefetch_ref_mmxext( uint8_t *pix, int stride, int parity )
+;-----------------------------------------------------------------------------
+ALIGN 16
+x264_prefetch_ref_mmxext:           ; issues prefetcht0 hints for 8 consecutive rows starting 64 bytes past pix
+    dec  parm3d                     ; parity - 1: 0 becomes all ones, 1 becomes 0
+    and  parm3d, parm2d             ; = stride when parity == 0, 0 when parity == 1
+    lea  parm1q, [parm1q+parm3q*8+64] ; pix += 64, plus 8 rows when parity == 0
+    lea     rax, [parm2q*3]         ; rax = 3 * stride; NOTE(review): stride is an int but parm2q is read as 64-bit - confirm the upper bits are valid
+    prefetcht0   [parm1q]           ; row 0
+    prefetcht0   [parm1q+parm2q]    ; row 1
+    prefetcht0   [parm1q+parm2q*2]  ; row 2
+    prefetcht0   [parm1q+rax]       ; row 3
+    lea  parm1q, [parm1q+parm2q*4]  ; step down four rows
+    prefetcht0   [parm1q]           ; row 4
+    prefetcht0   [parm1q+parm2q]    ; row 5
+    prefetcht0   [parm1q+parm2q*2]  ; row 6
+    prefetcht0   [parm1q+rax]       ; row 7
+    ret
diff --git a/common/frame.c b/common/frame.c

index 96ba0d7a5789b30f1dd86ad32ea895b85650c4ab..7720947984d5a71ac7c1ef72d92be76320bc229f 100644 (file)
--- a/common/frame.c
+++ b/common/frame.c
@@ -536,6 +536,8 @@ void x264_frame_deblocking_filter( x264_t *h, int i_slice_type )
              i_pix_y[2] -=  7*h->fdec->i_stride[2];
          }
  
+        x264_prefetch_fenc( h, h->fdec, mb_x, mb_y );
+
          /* cavlc + 8x8 transform stores nnz per 16 coeffs for the purpose of
           * entropy coding, but per 64 coeffs for the purpose of deblocking */
          if( !h->param.b_cabac && b_8x8_transform )
diff --git a/common/i386/mc-a.asm b/common/i386/mc-a.asm

index d48ccfd974eaf6ef2ec4e5420a4763aacd171404..6f233c4c4a40628a322f3b7bcda23efff9c6b9c7 100644 (file)
--- a/common/i386/mc-a.asm
+++ b/common/i386/mc-a.asm
@@ -72,6 +72,9 @@ cglobal x264_mc_copy_w16_sse2
  
  cglobal x264_mc_chroma_mmxext
  
+cglobal x264_prefetch_fenc_mmxext
+cglobal x264_prefetch_ref_mmxext
+
  ;=============================================================================
  ; pixel avg
  ;=============================================================================
@@ -595,3 +598,59 @@ ALIGN 4
      pop     edi
      picpop  ebx
      ret
+
+
+
+; These prefetches are tuned for 64-byte cache lines (K7/K8/Core2).
+; TODO: add 32-byte and 128-byte cache line versions for P3/P4.
+
+;-----------------------------------------------------------------------------
+; void x264_prefetch_fenc_mmxext( uint8_t *pix_y, int stride_y, 
+;                                 uint8_t *pix_uv, int stride_uv, int mb_x )
+;-----------------------------------------------------------------------------
+ALIGN 16
+x264_prefetch_fenc_mmxext:          ; issues prefetcht0 hints for 4 luma rows and 2 chroma rows; only eax/ecx/edx are written
+    mov   eax, [esp+20]             ; 5th stack argument (mb_x per the prototype above)
+    mov   ecx, [esp+8]              ; 2nd stack argument (stride_y)
+    mov   edx, [esp+4]              ; 1st stack argument (pix_y)
+    and   eax, 3                    ; eax = mb_x & 3
+    imul  eax, ecx                  ; eax = (mb_x & 3) * stride_y
+    lea   edx, [edx+eax*4+64]       ; edx = pix_y + 4 * (mb_x & 3) * stride_y + 64 bytes
+    prefetcht0 [edx]                ; first luma row of the group
+    prefetcht0 [edx+ecx]            ; second luma row
+    lea   edx, [edx+ecx*2]          ; step down two rows
+    prefetcht0 [edx]                ; third luma row
+    prefetcht0 [edx+ecx]            ; fourth luma row
+
+    mov   eax, [esp+20]             ; reload mb_x
+    mov   ecx, [esp+16]             ; 4th stack argument (stride_uv)
+    mov   edx, [esp+12]             ; 3rd stack argument (pix_uv)
+    and   eax, 6                    ; eax = mb_x & 6 (bit 0 dropped)
+    imul  eax, ecx                  ; eax = (mb_x & 6) * stride_uv
+    lea   edx, [edx+eax+64]         ; edx = pix_uv + (mb_x & 6) * stride_uv + 64 bytes
+    prefetcht0 [edx]                ; first chroma row
+    prefetcht0 [edx+ecx]            ; second chroma row
+    ret
+
+;-----------------------------------------------------------------------------
+; void x264_prefetch_ref_mmxext( uint8_t *pix, int stride, int parity )
+;-----------------------------------------------------------------------------
+ALIGN 16
+x264_prefetch_ref_mmxext:           ; issues prefetcht0 hints for 8 consecutive rows starting 64 bytes past pix
+    mov   eax, [esp+12]             ; 3rd stack argument (parity)
+    mov   ecx, [esp+8]              ; 2nd stack argument (stride)
+    mov   edx, [esp+4]              ; 1st stack argument (pix)
+    sub   eax, 1                    ; parity - 1: 0 becomes all ones, 1 becomes 0
+    and   eax, ecx                  ; = stride when parity == 0, 0 when parity == 1
+    lea   edx, [edx+eax*8+64]       ; edx = pix + 64, plus 8 rows when parity == 0
+    lea   eax, [ecx*3]              ; eax = 3 * stride
+    prefetcht0 [edx]                ; row 0
+    prefetcht0 [edx+ecx]            ; row 1
+    prefetcht0 [edx+ecx*2]          ; row 2
+    prefetcht0 [edx+eax]            ; row 3
+    lea   edx, [edx+ecx*4]          ; step down four rows
+    prefetcht0 [edx]                ; row 4
+    prefetcht0 [edx+ecx]            ; row 5
+    prefetcht0 [edx+ecx*2]          ; row 6
+    prefetcht0 [edx+eax]            ; row 7
+    ret
diff --git a/common/i386/mc-c.c b/common/i386/mc-c.c

index fa15fcd7cb26609aba903fbf50a440fe6159a03a..2f89b105199800322ebe8605d58a2e5cc26e8577 100644 (file)
--- a/common/i386/mc-c.c
+++ b/common/i386/mc-c.c
@@ -40,6 +40,8 @@ extern void x264_mc_copy_w8_mmx( uint8_t *, int, uint8_t *, int, int );
  extern void x264_mc_copy_w16_mmx( uint8_t *, int, uint8_t *, int, int );
  extern void x264_mc_copy_w16_sse2( uint8_t *, int, uint8_t *, int, int );
  extern void x264_plane_copy_mmxext( uint8_t *, int, uint8_t *, int, int w, int h);
+extern void x264_prefetch_fenc_mmxext( uint8_t *, int, uint8_t *, int, int );
+extern void x264_prefetch_ref_mmxext( uint8_t *, int, int );
  
  #define AVG(W,H) \
  static void x264_pixel_avg_ ## W ## x ## H ## _mmxext( uint8_t *dst, int i_dst, uint8_t *src, int i_src ) \
@@ -161,6 +163,9 @@ void x264_mc_mmxext_init( x264_mc_functions_t *pf )
      pf->copy[PIXEL_4x4]   = x264_mc_copy_w4_mmx;
  
      pf->plane_copy = x264_plane_copy_mmxext;
+
+    pf->prefetch_fenc = x264_prefetch_fenc_mmxext;
+    pf->prefetch_ref  = x264_prefetch_ref_mmxext;
  }
  void x264_mc_sse2_init( x264_mc_functions_t *pf )
  {
diff --git a/common/macroblock.c b/common/macroblock.c

index 41b4adaf0de3f630d56a301d37820f8be1d9ee8e..f9744e8b3660cf686cce251f9e3679da67a8594f 100644 (file)
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -927,6 +927,15 @@ void x264_macroblock_slice_init( x264_t *h )
          memset( h->mb.cache.skip, 0, X264_SCAN8_SIZE * sizeof( int8_t ) );
  }
  
+void x264_prefetch_fenc( x264_t *h, x264_frame_t *fenc, int i_mb_x, int i_mb_y ) /* computes plane addresses for macroblock (i_mb_x, i_mb_y) of fenc and passes them to h->mc.prefetch_fenc */
+{
+    int stride_y  = fenc->i_stride[0]; /* stride of plane 0 */
+    int stride_uv = fenc->i_stride[1]; /* stride of plane 1; also applied to plane 2 below */
+    int off_y = 16 * (i_mb_x + i_mb_y * stride_y); /* 16 bytes per macroblock column, 16 rows per macroblock row */
+    int off_uv = 8 * (i_mb_x + i_mb_y * stride_uv); /* 8 bytes per macroblock column, 8 rows per macroblock row */
+    h->mc.prefetch_fenc( fenc->plane[0]+off_y, stride_y,
+                         fenc->plane[1+(i_mb_x&1)]+off_uv, stride_uv, i_mb_x ); /* plane 1 for even i_mb_x, plane 2 for odd i_mb_x */
+}
  
  void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
  {
@@ -1143,6 +1152,8 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
              h->mb.pic.p_integral[1][i] = &h->fref1[i]->integral[ 16 * ( i_mb_x + i_mb_y * h->fdec->i_stride[0] )];
      }
  
+    x264_prefetch_fenc( h, h->fenc, i_mb_x, i_mb_y );
+
      /* load ref/mv/mvd */
      if( h->sh.i_type != SLICE_TYPE_I )
      {
@@ -1359,6 +1370,8 @@ void x264_macroblock_cache_save( x264_t *h )
              h->mb.pic.p_fdec[i], FDEC_STRIDE, w );
      }
  
+    x264_prefetch_fenc( h, h->fdec, h->mb.i_mb_x, h->mb.i_mb_y );
+
      h->mb.type[i_mb_xy] = i_mb_type;
  
      if( h->mb.i_type != I_16x16 && h->mb.i_cbp_luma == 0 && h->mb.i_cbp_chroma == 0 )
diff --git a/common/macroblock.h b/common/macroblock.h

index ff9dd826b77057935801fe947da802941676dabe..6a9e733dd998ad83c12d2c03b54653aa4e495836 100644 (file)
--- a/common/macroblock.h
+++ b/common/macroblock.h
@@ -248,6 +248,8 @@ void x264_macroblock_cache_end( x264_t *h );
  
  void x264_macroblock_bipred_init( x264_t *h );
  
+void x264_prefetch_fenc( x264_t *h, x264_frame_t *fenc, int i_mb_x, int i_mb_y );
+
  /* x264_mb_predict_mv_16x16:
   *      set mvp with predicted mv for D_16x16 block
   *      h->mb. need only valid values from other blocks */
diff --git a/common/mc.c b/common/mc.c

index 432928089c148fbe224d42901d532df96017a658..7721b63a34461291259472c4b31bf3fd0d2009a6 100644 (file)
--- a/common/mc.c
+++ b/common/mc.c
@@ -327,6 +327,13 @@ static void plane_copy( uint8_t *dst, int i_dst,
      }
  }
  
+void prefetch_fenc_null( uint8_t *pix_y, int stride_y, /* no-op default that x264_mc_init stores in pf->prefetch_fenc */
+                         uint8_t *pix_uv, int stride_uv, int mb_x ) /* all arguments are ignored */
+{} /* NOTE(review): has external linkage, unlike the static plane_copy above - consider making it static */
+
+void prefetch_ref_null( uint8_t *pix, int stride, int parity ) /* no-op default that x264_mc_init stores in pf->prefetch_ref; all arguments are ignored */
+{} /* NOTE(review): has external linkage, unlike the static plane_copy above - consider making it static */
+
  void x264_mc_init( int cpu, x264_mc_functions_t *pf )
  {
      pf->mc_luma   = mc_luma;
@@ -361,6 +368,9 @@ void x264_mc_init( int cpu, x264_mc_functions_t *pf )
  
      pf->plane_copy = plane_copy;
  
+    pf->prefetch_fenc = prefetch_fenc_null;
+    pf->prefetch_ref  = prefetch_ref_null;
+
  #ifdef HAVE_MMXEXT
      if( cpu&X264_CPU_MMXEXT ) {
          x264_mc_mmxext_init( pf );
diff --git a/common/mc.h b/common/mc.h

index 52f5b8e9eaa73f1210b91e01a3980b5a18ba0327..9c9fe517ca39e0cb28e3905d76486280ca47dc45 100644 (file)
--- a/common/mc.h
+++ b/common/mc.h
@@ -55,6 +55,13 @@ typedef struct
  
      void (*plane_copy)( uint8_t *dst, int i_dst,
                          uint8_t *src, int i_src, int w, int h);
+
+    /* prefetch the next few macroblocks of fenc or fdec */
+    void (*prefetch_fenc)( uint8_t *pix_y, int stride_y,
+                           uint8_t *pix_uv, int stride_uv, int mb_x );
+    /* prefetch the next few macroblocks of a hpel reference frame */
+    void (*prefetch_ref)( uint8_t *pix, int stride, int parity );
+
  } x264_mc_functions_t;
  
  void x264_mc_init( int cpu, x264_mc_functions_t *pf );
diff --git a/encoder/analyse.c b/encoder/analyse.c

index 064932e27507b06b2a386bd47c8ff5fed7651b09..c3d3c26160e91c829870dcf10922667937440ce9 100644 (file)
--- a/encoder/analyse.c
+++ b/encoder/analyse.c
@@ -1996,6 +1996,8 @@ void x264_macroblock_analyse( x264_t *h )
          int b_skip = 0;
          int i_intra_cost, i_intra_type;
  
+        h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 0 );
+
          /* Fast P_SKIP detection */
          analysis.b_try_pskip = 0;
          if( h->param.analyse.b_fast_pskip )
@@ -2009,6 +2011,8 @@ void x264_macroblock_analyse( x264_t *h )
                  b_skip = x264_macroblock_probe_pskip( h );
          }
  
+        h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 1 );
+
          if( b_skip )
          {
              h->mb.i_type = P_SKIP;
author	Loren Merritt <pengvado@videolan.org>
	Tue, 10 Oct 2006 21:26:31 +0000 (21:26 +0000)
committer	Loren Merritt <pengvado@videolan.org>
	Tue, 10 Oct 2006 21:26:31 +0000 (21:26 +0000)
common/amd64/mc-a.asm		patch \| blob \| history
common/frame.c		patch \| blob \| history
common/i386/mc-a.asm		patch \| blob \| history
common/i386/mc-c.c		patch \| blob \| history
common/macroblock.c		patch \| blob \| history
common/macroblock.h		patch \| blob \| history
common/mc.c		patch \| blob \| history
common/mc.h		patch \| blob \| history
encoder/analyse.c		patch \| blob \| history