From: Fiona Glaser <fiona@x264.com>
Date: Tue, 16 Dec 2008 07:02:49 +0000 (-0800)
Subject: More macroblock_cache optimizations
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=8e5d63a544efb6eb0f6677f718033f049c1ccd56;p=libx264

More macroblock_cache optimizations
Patch partially by Loren Merritt
---

diff --git a/common/macroblock.c b/common/macroblock.c
index bf2fe496..d4ed3784 100644
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -923,15 +923,16 @@ void x264_prefetch_fenc( x264_t *h, x264_frame_t *fenc, int i_mb_x, int i_mb_y )
 
 static NOINLINE void copy_column8( uint8_t *dst, uint8_t *src )
 {
+    // callers pass dst/src already advanced by 4 rows, so i runs -4..3 over the 8 rows; this is faster (smaller instruction size on x86)
     int i;
-    for(i=0; i<8; i++)
+    for( i = -4; i < 4; i++ )
         dst[i*FDEC_STRIDE] = src[i*FDEC_STRIDE];
 }
 
 static void ALWAYS_INLINE x264_macroblock_load_pic_pointers( x264_t *h, int i_mb_x, int i_mb_y, int i)
 {
     const int w = (i == 0 ? 16 : 8);
-    const int i_stride = h->fdec->i_stride[i];
+    const int i_stride = h->fdec->i_stride[!!i];
     const int i_stride2 = i_stride << h->mb.b_interlaced;
     const int i_pix_offset = h->mb.b_interlaced
                            ? w * (i_mb_x + (i_mb_y&~1) * i_stride) + (i_mb_y&1) * i_stride
@@ -985,9 +986,6 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
 
     int i;
 
-    assert( h->mb.i_b8_stride == 2*h->mb.i_mb_stride );
-    assert( h->mb.i_b4_stride == 4*h->mb.i_mb_stride );
-
     /* init index */
     h->mb.i_mb_x = i_mb_x;
     h->mb.i_mb_y = i_mb_y;
@@ -1114,10 +1112,10 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
 
     if( !h->mb.b_interlaced )
     {
-        copy_column8( h->mb.pic.p_fdec[0]-1, h->mb.pic.p_fdec[0]+15 );
-        copy_column8( h->mb.pic.p_fdec[0]-1+8*FDEC_STRIDE, h->mb.pic.p_fdec[0]+15+8*FDEC_STRIDE );
-        copy_column8( h->mb.pic.p_fdec[1]-1, h->mb.pic.p_fdec[1]+7 );
-        copy_column8( h->mb.pic.p_fdec[2]-1, h->mb.pic.p_fdec[2]+7 );
+        copy_column8( h->mb.pic.p_fdec[0]-1+ 4*FDEC_STRIDE, h->mb.pic.p_fdec[0]+15+ 4*FDEC_STRIDE );
+        copy_column8( h->mb.pic.p_fdec[0]-1+12*FDEC_STRIDE, h->mb.pic.p_fdec[0]+15+12*FDEC_STRIDE );
+        copy_column8( h->mb.pic.p_fdec[1]-1+ 4*FDEC_STRIDE, h->mb.pic.p_fdec[1]+ 7+ 4*FDEC_STRIDE );
+        copy_column8( h->mb.pic.p_fdec[2]-1+ 4*FDEC_STRIDE, h->mb.pic.p_fdec[2]+ 7+ 4*FDEC_STRIDE );
     }
 
     /* load picture pointers */
@@ -1264,19 +1262,14 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
         /* load skip */
         if( h->sh.i_type == SLICE_TYPE_B && h->param.b_cabac )
         {
-            memset( h->mb.cache.skip, 0, X264_SCAN8_SIZE * sizeof( int8_t ) );
-            if( i_left_type >= 0 )
-            {
-                uint8_t skipbp = h->mb.skipbp[i_left_xy];
-                h->mb.cache.skip[x264_scan8[0] - 1] = skipbp & 0x2;
-                h->mb.cache.skip[x264_scan8[8] - 1] = skipbp & 0x8;
-            }
-            if( i_top_type >= 0 )
-            {
-                uint8_t skipbp = h->mb.skipbp[i_top_xy];
-                h->mb.cache.skip[x264_scan8[0] - 8] = skipbp & 0x4;
-                h->mb.cache.skip[x264_scan8[4] - 8] = skipbp & 0x8;
-            }
+            uint8_t skipbp;
+            x264_macroblock_cache_skip( h, 0, 0, 4, 4, 0 );
+            skipbp = i_left_type >= 0 ? h->mb.skipbp[i_left_xy] : 0;
+            h->mb.cache.skip[x264_scan8[0] - 1] = skipbp & 0x2;
+            h->mb.cache.skip[x264_scan8[8] - 1] = skipbp & 0x8;
+            skipbp = i_top_type >= 0 ? h->mb.skipbp[i_top_xy] : 0;
+            h->mb.cache.skip[x264_scan8[0] - 8] = skipbp & 0x4;
+            h->mb.cache.skip[x264_scan8[4] - 8] = skipbp & 0x8;
         }
 
         if( h->sh.i_type == SLICE_TYPE_P )
@@ -1300,7 +1293,7 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
 static void ALWAYS_INLINE x264_macroblock_store_pic( x264_t *h, int i)
 {
     int w = i ? 8 : 16;
-    int i_stride = h->fdec->i_stride[i];
+    int i_stride = h->fdec->i_stride[!!i];
     int i_stride2 = i_stride << h->mb.b_interlaced;
     int i_pix_offset = h->mb.b_interlaced
                      ? w * (h->mb.i_mb_x + (h->mb.i_mb_y&~1) * i_stride) + (h->mb.i_mb_y&1) * i_stride