From: Fiona Glaser
Date: Wed, 14 Apr 2010 21:43:25 +0000 (-0700)
Subject: Prefetch MB data in cache_load
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=9df61bcc12b3c28e4cd743a2a789ef2f197fc1aa;p=libx264

Prefetch MB data in cache_load

Dramatically reduces L1 cache misses.
~10% faster cache_load.
---

diff --git a/common/macroblock.c b/common/macroblock.c
index 2b0ee647..56bbe090 100644
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -941,6 +941,7 @@ static void ALWAYS_INLINE x264_macroblock_load_pic_pointers( x264_t *h, int mb_x
 static void inline x264_macroblock_cache_load_neighbours( x264_t *h, int mb_x, int mb_y )
 {
     int top = (mb_y - (1 << h->mb.b_interlaced)) * h->mb.i_mb_stride + mb_x;
+
     h->mb.i_mb_x = mb_x;
     h->mb.i_mb_y = mb_y;
     h->mb.i_mb_xy = mb_y * h->mb.i_mb_stride + mb_x;
@@ -986,6 +987,16 @@ static void inline x264_macroblock_cache_load_neighbours( x264_t *h, int mb_x, i
 
         if( !h->param.b_constrained_intra || IS_INTRA( h->mb.i_mb_type_top ) )
             h->mb.i_neighbour_intra |= MB_TOP;
+
+        /* We only need to prefetch the top blocks because the left was just written
+         * to as part of the previous cache_save. Since most target CPUs use write-allocate
+         * caches, left blocks are nearly guaranteed to be in L1 cache. Top--not so much. */
+        x264_prefetch( &h->mb.cbp[top] );
+        x264_prefetch( h->mb.intra4x4_pred_mode[top] );
+        x264_prefetch( &h->mb.non_zero_count[top][12] );
+        /* These aren't always allocated, but prefetching an invalid address can't hurt. */
+        x264_prefetch( &h->mb.mb_transform_size[top] );
+        x264_prefetch( &h->mb.skipbp[top] );
     }
 }
 
@@ -1025,6 +1036,9 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y )
 
     int left = h->mb.i_mb_left_xy;
     int top  = h->mb.i_mb_top_xy;
+    int top_y = mb_y - (1 << h->mb.b_interlaced);
+    int top_8x8 = (2*top_y+1) * h->mb.i_b8_stride + 2*mb_x;
+    int top_4x4 = (4*top_y+3) * h->mb.i_b4_stride + 4*mb_x;
 
     /* GCC pessimizes direct loads from heap-allocated arrays due to aliasing. */
     /* By only dereferencing them once, we avoid this issue. */
@@ -1079,6 +1093,18 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y )
 
         h->mb.cache.non_zero_count[x264_scan8[16+4+0] - 1] = nnz[left][16+4+1];
         h->mb.cache.non_zero_count[x264_scan8[16+4+2] - 1] = nnz[left][16+4+3];
+
+        /* Finish the prefetching */
+        if( h->sh.i_type != SLICE_TYPE_I )
+            for( int l = 0; l < (h->sh.i_type == SLICE_TYPE_B) + 1; l++ )
+            {
+                x264_prefetch( &h->mb.mv[l][top_4x4-1] );
+                /* Top right not being in the same cacheline as top left will happen
+                 * once every 4 MBs, so one extra prefetch is worthwhile */
+                x264_prefetch( &h->mb.mv[l][top_4x4+4] );
+                x264_prefetch( &h->mb.ref[l][top_8x8-1] );
+                x264_prefetch( &h->mb.mvd[l][top] );
+            }
     }
     else
     {
@@ -1143,11 +1169,8 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y )
     /* load ref/mv/mvd */
     if( h->sh.i_type != SLICE_TYPE_I )
     {
-        const int s8x8 = h->mb.i_b8_stride;
-        const int s4x4 = h->mb.i_b4_stride;
-        const int top_y = mb_y - (1 << h->mb.b_interlaced);
-        const int top_8x8 = (2*top_y+1) * h->mb.i_b8_stride + 2*mb_x;
-        const int top_4x4 = (4*top_y+3) * h->mb.i_b4_stride + 4*mb_x;
+        int s8x8 = h->mb.i_b8_stride;
+        int s4x4 = h->mb.i_b4_stride;
 
         for( int l = 0; l < (h->sh.i_type == SLICE_TYPE_B) + 1; l++ )
         {
diff --git a/common/osdep.h b/common/osdep.h
index f97547f7..4f49d308 100644
--- a/common/osdep.h
+++ b/common/osdep.h
@@ -251,6 +251,22 @@ static int ALWAYS_INLINE x264_ctz( uint32_t x )
 }
 #endif
 
+#if defined(__GNUC__) && defined(HAVE_MMX)
+/* Don't use __builtin_prefetch; even as recent as 4.3.4, GCC seems incapable of
+ * using complex address modes properly unless we use inline asm. */
+static ALWAYS_INLINE void x264_prefetch( void *p )
+{
+    asm volatile( "prefetcht0 %0"::"m"(*(uint8_t*)p) );
+}
+/* We require that prefetch not fault on invalid reads, so we only enable it on
+ * known architectures. */
+#elif defined(__GNUC__) && (__GNUC__ > 3 || __GNUC__ == 3 && __GNUC_MINOR__ > 1) &&\
+      (defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_ARM) || defined(ARCH_PPC))
+#define x264_prefetch(x) __builtin_prefetch(x)
+#else
+#define x264_prefetch(x)
+#endif
+
 #ifdef USE_REAL_PTHREAD
 #ifdef SYS_MINGW
 #define x264_lower_thread_priority(p)\
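
Editor's note: the pattern this patch applies is generic enough to show outside x264. The
patch prefetches only the row-above neighbour data early in cache_load, because the
left neighbour was just written by the previous macroblock's cache_save and, on
write-allocate caches, is already in L1. Below is a minimal standalone sketch of that
idea, assuming GCC or Clang; the names PREFETCH, MB_STRIDE, mb_state_t, and
load_top_neighbour() are hypothetical illustrations, not x264 API.

    #include <stdint.h>

    #if defined(__GNUC__)
    #define PREFETCH(p) __builtin_prefetch(p)
    #else
    #define PREFETCH(p) ((void)0)   /* no-op where no known-safe prefetch exists */
    #endif

    #define MB_STRIDE 120           /* macroblocks per row; hypothetical value */

    typedef struct
    {
        uint8_t cbp[MB_STRIDE * 68];
        uint8_t nnz[MB_STRIDE * 68][24];
    } mb_state_t;

    static uint8_t load_top_neighbour( mb_state_t *s, int xy )
    {
        int top = xy - MB_STRIDE;
        /* Issue prefetches as early as possible: the row-above entries were last
         * touched a full macroblock row ago, so they are almost certainly not in
         * L1. The left neighbour (xy-1) was just written by the previous
         * iteration; with a write-allocate cache it is already resident. */
        PREFETCH( &s->cbp[top] );
        PREFETCH( &s->nnz[top] );
        /* ... unrelated setup work runs here, hiding the memory latency ... */
        return s->cbp[top];         /* by now this load should hit L1 */
    }

The design point, per the commit's own comments, is the asymmetry: only the top row
is worth prefetching, and issuing the prefetches at the start of cache_load gives the
lines time to arrive before the real loads consume them.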