]> granicus.if.org Git - libx264/commitdiff
prefetch pixels for motion compensation and deblocking.
author Loren Merritt <pengvado@videolan.org>
Tue, 10 Oct 2006 21:26:31 +0000 (21:26 +0000)
committer Loren Merritt <pengvado@videolan.org>
Tue, 10 Oct 2006 21:26:31 +0000 (21:26 +0000)
git-svn-id: svn://svn.videolan.org/x264/trunk@590 df754926-b1dd-0310-bc7b-ec298dee348c

common/amd64/mc-a.asm
common/frame.c
common/i386/mc-a.asm
common/i386/mc-c.c
common/macroblock.c
common/macroblock.h
common/mc.c
common/mc.h
encoder/analyse.c

index ad3e73fb150c77ae3b8810485b371b1ca78cf0f3..8ae1416bcf30c6735885efbfd7938fb592460a63 100644 (file)
@@ -72,6 +72,9 @@ cglobal x264_mc_copy_w16_sse2
 
 cglobal x264_mc_chroma_mmxext
 
+cglobal x264_prefetch_fenc_mmxext
+cglobal x264_prefetch_ref_mmxext
+
 ;=============================================================================
 ; pixel avg
 ;=============================================================================
@@ -549,3 +552,49 @@ ALIGN 4
     dec      r11d
     jnz .height_loop1_w8
     rep ret
+
+
+
+;-----------------------------------------------------------------------------
+; void x264_prefetch_fenc_mmxext( uint8_t *pix_y, int stride_y,
+;                                 uint8_t *pix_uv, int stride_uv, int mb_x )
+;-----------------------------------------------------------------------------
+ALIGN 16                            ; NOTE(review): parmNd/parmNq are defined outside this diff; presumably the Nth integer argument register - confirm
+x264_prefetch_fenc_mmxext:          ; issues prefetcht0 hints for 4 luma rows and 2 chroma rows; mb_x selects which rows
+    mov     eax, parm5d             ; eax = mb_x (5th argument in the prototype above)
+    and     eax, 3                  ; keep the low two bits: 0..3
+    imul    eax, parm2d             ; eax = (mb_x&3) * stride_y
+    lea  parm1q, [parm1q+rax*4+64]  ; pix_y += 4*(mb_x&3) rows + 64 bytes
+    prefetcht0   [parm1q]           ; luma row 0 of the selected group of four
+    prefetcht0   [parm1q+parm2q]    ; luma row 1; NOTE(review): stride_y is an int but is used as a full 64-bit register here - confirm callers leave the upper half valid
+    lea  parm1q, [parm1q+parm2q*2]  ; step down two rows
+    prefetcht0   [parm1q]           ; luma row 2
+    prefetcht0   [parm1q+parm2q]    ; luma row 3
+
+    mov     eax, parm5d             ; reload mb_x; eax was overwritten by the luma offset
+    and     eax, 6                  ; mb_x & 6: one of 0, 2, 4, 6
+    imul    eax, parm4d             ; eax = (mb_x&6) * stride_uv
+    lea  parm3q, [parm3q+rax+64]    ; pix_uv += (mb_x&6) rows + 64 bytes
+    prefetcht0   [parm3q]           ; chroma row 0 of the selected pair
+    prefetcht0   [parm3q+parm4q]    ; chroma row 1
+    ret
+
+;-----------------------------------------------------------------------------
+; void x264_prefetch_ref_mmxext( uint8_t *pix, int stride, int parity )
+;-----------------------------------------------------------------------------
+ALIGN 16                                ; NOTE(review): parmNd/parmNq are defined outside this diff; presumably the Nth integer argument register - confirm
+x264_prefetch_ref_mmxext:               ; issues prefetcht0 hints for 8 consecutive rows, starting 64 bytes past pix
+    dec  parm3d                         ; parity-1: all ones when parity==0, zero when parity==1
+    and  parm3d, parm2d                 ; mask the stride: stride when parity==0, 0 when parity==1
+    lea  parm1q, [parm1q+parm3q*8+64]   ; pix += 64 bytes, plus 8 rows when parity==0
+    lea     rax, [parm2q*3]             ; rax = 3*stride, reused by both groups of four below
+    prefetcht0   [parm1q]               ; row 0
+    prefetcht0   [parm1q+parm2q]        ; row 1
+    prefetcht0   [parm1q+parm2q*2]      ; row 2
+    prefetcht0   [parm1q+rax]           ; row 3
+    lea  parm1q, [parm1q+parm2q*4]      ; advance four rows
+    prefetcht0   [parm1q]               ; row 4
+    prefetcht0   [parm1q+parm2q]        ; row 5
+    prefetcht0   [parm1q+parm2q*2]      ; row 6
+    prefetcht0   [parm1q+rax]           ; row 7
+    ret
index 96ba0d7a5789b30f1dd86ad32ea895b85650c4ab..7720947984d5a71ac7c1ef72d92be76320bc229f 100644 (file)
@@ -536,6 +536,8 @@ void x264_frame_deblocking_filter( x264_t *h, int i_slice_type )
             i_pix_y[2] -=  7*h->fdec->i_stride[2];
         }
 
+        x264_prefetch_fenc( h, h->fdec, mb_x, mb_y );
+
         /* cavlc + 8x8 transform stores nnz per 16 coeffs for the purpose of
          * entropy coding, but per 64 coeffs for the purpose of deblocking */
         if( !h->param.b_cabac && b_8x8_transform )
index d48ccfd974eaf6ef2ec4e5420a4763aacd171404..6f233c4c4a40628a322f3b7bcda23efff9c6b9c7 100644 (file)
@@ -72,6 +72,9 @@ cglobal x264_mc_copy_w16_sse2
 
 cglobal x264_mc_chroma_mmxext
 
+cglobal x264_prefetch_fenc_mmxext
+cglobal x264_prefetch_ref_mmxext
+
 ;=============================================================================
 ; pixel avg
 ;=============================================================================
@@ -595,3 +598,59 @@ ALIGN 4
     pop     edi
     picpop  ebx
     ret
+
+
+
+; prefetches tuned for 64 byte cachelines (K7/K8/Core2)
+; TODO add 32 and 128 byte versions for P3/P4
+
+;-----------------------------------------------------------------------------
+; void x264_prefetch_fenc_mmxext( uint8_t *pix_y, int stride_y,
+;                                 uint8_t *pix_uv, int stride_uv, int mb_x )
+;-----------------------------------------------------------------------------
+ALIGN 16
+x264_prefetch_fenc_mmxext:      ; issues prefetcht0 hints for 4 luma rows and 2 chroma rows; mb_x selects which rows
+    mov   eax, [esp+20]         ; eax = mb_x (stack arguments, in the prototype's order, start at esp+4)
+    mov   ecx, [esp+8]          ; ecx = stride_y
+    mov   edx, [esp+4]          ; edx = pix_y
+    and   eax, 3                ; keep the low two bits of mb_x: 0..3
+    imul  eax, ecx              ; eax = (mb_x&3) * stride_y
+    lea   edx, [edx+eax*4+64]   ; pix_y += 4*(mb_x&3) rows + 64 bytes
+    prefetcht0 [edx]            ; luma row 0 of the selected group of four
+    prefetcht0 [edx+ecx]        ; luma row 1
+    lea   edx, [edx+ecx*2]      ; step down two rows
+    prefetcht0 [edx]            ; luma row 2
+    prefetcht0 [edx+ecx]        ; luma row 3
+
+    mov   eax, [esp+20]         ; reload mb_x; eax was overwritten by the luma offset
+    mov   ecx, [esp+16]         ; ecx = stride_uv
+    mov   edx, [esp+12]         ; edx = pix_uv
+    and   eax, 6                ; mb_x & 6: one of 0, 2, 4, 6
+    imul  eax, ecx              ; eax = (mb_x&6) * stride_uv
+    lea   edx, [edx+eax+64]     ; pix_uv += (mb_x&6) rows + 64 bytes
+    prefetcht0 [edx]            ; chroma row 0 of the selected pair
+    prefetcht0 [edx+ecx]        ; chroma row 1
+    ret
+
+;-----------------------------------------------------------------------------
+; void x264_prefetch_ref_mmxext( uint8_t *pix, int stride, int parity )
+;-----------------------------------------------------------------------------
+ALIGN 16
+x264_prefetch_ref_mmxext:       ; issues prefetcht0 hints for 8 consecutive rows, starting 64 bytes past pix
+    mov   eax, [esp+12]         ; eax = parity (stack arguments, in the prototype's order, start at esp+4)
+    mov   ecx, [esp+8]          ; ecx = stride
+    mov   edx, [esp+4]          ; edx = pix
+    sub   eax, 1                ; parity-1: all ones when parity==0, zero when parity==1
+    and   eax, ecx              ; mask the stride: stride when parity==0, 0 when parity==1
+    lea   edx, [edx+eax*8+64]   ; pix += 64 bytes, plus 8 rows when parity==0
+    lea   eax, [ecx*3]          ; eax = 3*stride, reused by both groups of four below
+    prefetcht0 [edx]            ; row 0
+    prefetcht0 [edx+ecx]        ; row 1
+    prefetcht0 [edx+ecx*2]      ; row 2
+    prefetcht0 [edx+eax]        ; row 3
+    lea   edx, [edx+ecx*4]      ; advance four rows
+    prefetcht0 [edx]            ; row 4
+    prefetcht0 [edx+ecx]        ; row 5
+    prefetcht0 [edx+ecx*2]      ; row 6
+    prefetcht0 [edx+eax]        ; row 7
+    ret
index fa15fcd7cb26609aba903fbf50a440fe6159a03a..2f89b105199800322ebe8605d58a2e5cc26e8577 100644 (file)
@@ -40,6 +40,8 @@ extern void x264_mc_copy_w8_mmx( uint8_t *, int, uint8_t *, int, int );
 extern void x264_mc_copy_w16_mmx( uint8_t *, int, uint8_t *, int, int );
 extern void x264_mc_copy_w16_sse2( uint8_t *, int, uint8_t *, int, int );
 extern void x264_plane_copy_mmxext( uint8_t *, int, uint8_t *, int, int w, int h);
+extern void x264_prefetch_fenc_mmxext( uint8_t *, int, uint8_t *, int, int );
+extern void x264_prefetch_ref_mmxext( uint8_t *, int, int );
 
 #define AVG(W,H) \
 static void x264_pixel_avg_ ## W ## x ## H ## _mmxext( uint8_t *dst, int i_dst, uint8_t *src, int i_src ) \
@@ -161,6 +163,9 @@ void x264_mc_mmxext_init( x264_mc_functions_t *pf )
     pf->copy[PIXEL_4x4]   = x264_mc_copy_w4_mmx;
 
     pf->plane_copy = x264_plane_copy_mmxext;
+
+    pf->prefetch_fenc = x264_prefetch_fenc_mmxext;
+    pf->prefetch_ref  = x264_prefetch_ref_mmxext;
 }
 void x264_mc_sse2_init( x264_mc_functions_t *pf )
 {
index 41b4adaf0de3f630d56a301d37820f8be1d9ee8e..f9744e8b3660cf686cce251f9e3679da67a8594f 100644 (file)
@@ -927,6 +927,15 @@ void x264_macroblock_slice_init( x264_t *h )
         memset( h->mb.cache.skip, 0, X264_SCAN8_SIZE * sizeof( int8_t ) );
 }
 
+void x264_prefetch_fenc( x264_t *h, x264_frame_t *fenc, int i_mb_x, int i_mb_y )
+{   /* Forwards a prefetch request for macroblock (i_mb_x, i_mb_y) of `fenc` to h->mc.prefetch_fenc; callers in this change pass both h->fenc and h->fdec. */
+    int stride_y  = fenc->i_stride[0];  /* stride of plane 0 */
+    int stride_uv = fenc->i_stride[1];  /* stride of plane 1; also used when plane 2 is selected below */
+    int off_y = 16 * (i_mb_x + i_mb_y * stride_y);   /* 16*i_mb_x bytes across, 16*i_mb_y rows down in plane 0 */
+    int off_uv = 8 * (i_mb_x + i_mb_y * stride_uv);  /* 8*i_mb_x bytes across, 8*i_mb_y rows down in the chroma plane */
+    h->mc.prefetch_fenc( fenc->plane[0]+off_y, stride_y,
+                         fenc->plane[1+(i_mb_x&1)]+off_uv, stride_uv, i_mb_x ); /* plane 1 for even i_mb_x, plane 2 for odd */
+}
 
 void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
 {
@@ -1143,6 +1152,8 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
             h->mb.pic.p_integral[1][i] = &h->fref1[i]->integral[ 16 * ( i_mb_x + i_mb_y * h->fdec->i_stride[0] )];
     }
 
+    x264_prefetch_fenc( h, h->fenc, i_mb_x, i_mb_y );
+
     /* load ref/mv/mvd */
     if( h->sh.i_type != SLICE_TYPE_I )
     {
@@ -1359,6 +1370,8 @@ void x264_macroblock_cache_save( x264_t *h )
             h->mb.pic.p_fdec[i], FDEC_STRIDE, w );
     }
 
+    x264_prefetch_fenc( h, h->fdec, h->mb.i_mb_x, h->mb.i_mb_y );
+
     h->mb.type[i_mb_xy] = i_mb_type;
 
     if( h->mb.i_type != I_16x16 && h->mb.i_cbp_luma == 0 && h->mb.i_cbp_chroma == 0 )
index ff9dd826b77057935801fe947da802941676dabe..6a9e733dd998ad83c12d2c03b54653aa4e495836 100644 (file)
@@ -248,6 +248,8 @@ void x264_macroblock_cache_end( x264_t *h );
 
 void x264_macroblock_bipred_init( x264_t *h );
 
+void x264_prefetch_fenc( x264_t *h, x264_frame_t *fenc, int i_mb_x, int i_mb_y );
+
 /* x264_mb_predict_mv_16x16:
  *      set mvp with predicted mv for D_16x16 block
  *      h->mb. need only valid values from other blocks */
index 432928089c148fbe224d42901d532df96017a658..7721b63a34461291259472c4b31bf3fd0d2009a6 100644 (file)
@@ -327,6 +327,13 @@ static void plane_copy( uint8_t *dst, int i_dst,
     }
 }
 
+void prefetch_fenc_null( uint8_t *pix_y, int stride_y,   /* no-op prefetch_fenc: every argument is ignored */
+                         uint8_t *pix_uv, int stride_uv, int mb_x )
+{} /* x264_mc_init installs this as pf->prefetch_fenc before any CPU-specific init runs. NOTE(review): not static, unlike plane_copy in this file; confirm nothing else links against it */
+
+void prefetch_ref_null( uint8_t *pix, int stride, int parity )   /* no-op prefetch_ref: every argument is ignored */
+{} /* x264_mc_init installs this as pf->prefetch_ref before any CPU-specific init runs. NOTE(review): not static, unlike plane_copy in this file; confirm nothing else links against it */
+
 void x264_mc_init( int cpu, x264_mc_functions_t *pf )
 {
     pf->mc_luma   = mc_luma;
@@ -361,6 +368,9 @@ void x264_mc_init( int cpu, x264_mc_functions_t *pf )
 
     pf->plane_copy = plane_copy;
 
+    pf->prefetch_fenc = prefetch_fenc_null;
+    pf->prefetch_ref  = prefetch_ref_null;
+
 #ifdef HAVE_MMXEXT
     if( cpu&X264_CPU_MMXEXT ) {
         x264_mc_mmxext_init( pf );
index 52f5b8e9eaa73f1210b91e01a3980b5a18ba0327..9c9fe517ca39e0cb28e3905d76486280ca47dc45 100644 (file)
@@ -55,6 +55,13 @@ typedef struct
 
     void (*plane_copy)( uint8_t *dst, int i_dst,
                         uint8_t *src, int i_src, int w, int h);
+
+    /* prefetch the next few macroblocks of fenc or fdec */
+    void (*prefetch_fenc)( uint8_t *pix_y, int stride_y,
+                           uint8_t *pix_uv, int stride_uv, int mb_x );
+    /* prefetch the next few macroblocks of a hpel reference frame */
+    void (*prefetch_ref)( uint8_t *pix, int stride, int parity );
+
 } x264_mc_functions_t;
 
 void x264_mc_init( int cpu, x264_mc_functions_t *pf );
index 064932e27507b06b2a386bd47c8ff5fed7651b09..c3d3c26160e91c829870dcf10922667937440ce9 100644 (file)
@@ -1996,6 +1996,8 @@ void x264_macroblock_analyse( x264_t *h )
         int b_skip = 0;
         int i_intra_cost, i_intra_type;
 
+        h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 0 );
+
         /* Fast P_SKIP detection */
         analysis.b_try_pskip = 0;
         if( h->param.analyse.b_fast_pskip )
@@ -2009,6 +2011,8 @@ void x264_macroblock_analyse( x264_t *h )
                 b_skip = x264_macroblock_probe_pskip( h );
         }
 
+        h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 1 );
+
         if( b_skip )
         {
             h->mb.i_type = P_SKIP;