From: Fiona Glaser
Date: Wed, 14 Apr 2010 21:43:25 +0000 (-0700)
Subject: Prefetch MB data in cache_load
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=9df61bcc12b3c28e4cd743a2a789ef2f197fc1aa;p=libx264

Prefetch MB data in cache_load

Dramatically reduces L1 cache misses.
~10% faster cache_load.
---

diff --git a/common/macroblock.c b/common/macroblock.c
index 2b0ee647..56bbe090 100644
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -941,6 +941,7 @@ static void ALWAYS_INLINE x264_macroblock_load_pic_pointers( x264_t *h, int mb_x
 static void inline x264_macroblock_cache_load_neighbours( x264_t *h, int mb_x, int mb_y )
 {
     int top = (mb_y - (1 << h->mb.b_interlaced)) * h->mb.i_mb_stride + mb_x;
+
     h->mb.i_mb_x = mb_x;
     h->mb.i_mb_y = mb_y;
     h->mb.i_mb_xy = mb_y * h->mb.i_mb_stride + mb_x;
@@ -986,6 +987,16 @@ static void inline x264_macroblock_cache_load_neighbours( x264_t *h, int mb_x, i
 
         if( !h->param.b_constrained_intra || IS_INTRA( h->mb.i_mb_type_top ) )
             h->mb.i_neighbour_intra |= MB_TOP;
+
+        /* We only need to prefetch the top blocks because the left was just written
+         * to as part of the previous cache_save. Since most target CPUs use write-allocate
+         * caches, left blocks are nearly guaranteed to be in L1 cache. Top--not so much. */
+        x264_prefetch( &h->mb.cbp[top] );
+        x264_prefetch( h->mb.intra4x4_pred_mode[top] );
+        x264_prefetch( &h->mb.non_zero_count[top][12] );
+        /* These aren't always allocated, but prefetching an invalid address can't hurt. */
+        x264_prefetch( &h->mb.mb_transform_size[top] );
+        x264_prefetch( &h->mb.skipbp[top] );
     }
 }
 
@@ -1025,6 +1036,9 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y )
 
     int left = h->mb.i_mb_left_xy;
     int top  = h->mb.i_mb_top_xy;
+    int top_y = mb_y - (1 << h->mb.b_interlaced);
+    int top_8x8 = (2*top_y+1) * h->mb.i_b8_stride + 2*mb_x;
+    int top_4x4 = (4*top_y+3) * h->mb.i_b4_stride + 4*mb_x;
 
     /* GCC pessimizes direct loads from heap-allocated arrays due to aliasing. */
     /* By only dereferencing them once, we avoid this issue. */
@@ -1079,6 +1093,18 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y )
 
         h->mb.cache.non_zero_count[x264_scan8[16+4+0] - 1] = nnz[left][16+4+1];
         h->mb.cache.non_zero_count[x264_scan8[16+4+2] - 1] = nnz[left][16+4+3];
+
+        /* Finish the prefetching */
+        if( h->sh.i_type != SLICE_TYPE_I )
+            for( int l = 0; l < (h->sh.i_type == SLICE_TYPE_B) + 1; l++ )
+            {
+                x264_prefetch( &h->mb.mv[l][top_4x4-1] );
+                /* Top right not being in the same cacheline as top left will happen
+                 * once every 4 MBs, so one extra prefetch is worthwhile */
+                x264_prefetch( &h->mb.mv[l][top_4x4+4] );
+                x264_prefetch( &h->mb.ref[l][top_8x8-1] );
+                x264_prefetch( &h->mb.mvd[l][top] );
+            }
     }
     else
     {
@@ -1143,11 +1169,8 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y )
     /* load ref/mv/mvd */
     if( h->sh.i_type != SLICE_TYPE_I )
     {
-        const int s8x8 = h->mb.i_b8_stride;
-        const int s4x4 = h->mb.i_b4_stride;
-        const int top_y = mb_y - (1 << h->mb.b_interlaced);
-        const int top_8x8 = (2*top_y+1) * h->mb.i_b8_stride + 2*mb_x;
-        const int top_4x4 = (4*top_y+3) * h->mb.i_b4_stride + 4*mb_x;
+        int s8x8 = h->mb.i_b8_stride;
+        int s4x4 = h->mb.i_b4_stride;
 
         for( int l = 0; l < (h->sh.i_type == SLICE_TYPE_B) + 1; l++ )
         {
diff --git a/common/osdep.h b/common/osdep.h
index f97547f7..4f49d308 100644
--- a/common/osdep.h
+++ b/common/osdep.h
@@ -251,6 +251,22 @@ static int ALWAYS_INLINE x264_ctz( uint32_t x )
 }
 #endif
 
+#if defined(__GNUC__) && defined(HAVE_MMX)
+/* Don't use __builtin_prefetch; even as recent as 4.3.4, GCC seems incapable of
+ * using complex address modes properly unless we use inline asm. */
+static ALWAYS_INLINE void x264_prefetch( void *p )
+{
+    asm volatile( "prefetcht0 %0"::"m"(*(uint8_t*)p) );
+}
+/* We require that prefetch not fault on invalid reads, so we only enable it on
+ * known architectures. */
+#elif defined(__GNUC__) && (__GNUC__ > 3 || __GNUC__ == 3 && __GNUC_MINOR__ > 1) &&\
+      (defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_ARM) || defined(ARCH_PPC))
+#define x264_prefetch(x) __builtin_prefetch(x)
+#else
+#define x264_prefetch(x)
+#endif
+
 #ifdef USE_REAL_PTHREAD
 #ifdef SYS_MINGW
 #define x264_lower_thread_priority(p)\
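
Editor's note: the pattern this patch applies is generic enough to show outside x264. The
patch prefetches only the row-above neighbour data early in cache_load, because the
left neighbour was just written by the previous macroblock's cache_save and, on
write-allocate caches, is already in L1. Below is a minimal standalone sketch of that
idea, assuming GCC or Clang; the names PREFETCH, MB_STRIDE, mb_state_t, and
load_top_neighbour() are hypothetical illustrations, not x264 API.

    #include <stdint.h>

    #if defined(__GNUC__)
    #define PREFETCH(p) __builtin_prefetch(p)
    #else
    #define PREFETCH(p) ((void)0)   /* no-op where no known-safe prefetch exists */
    #endif

    #define MB_STRIDE 120           /* macroblocks per row; hypothetical value */

    typedef struct
    {
        uint8_t cbp[MB_STRIDE * 68];
        uint8_t nnz[MB_STRIDE * 68][24];
    } mb_state_t;

    static uint8_t load_top_neighbour( mb_state_t *s, int xy )
    {
        int top = xy - MB_STRIDE;
        /* Issue prefetches as early as possible: the row-above entries were last
         * touched a full macroblock row ago, so they are almost certainly not in
         * L1. The left neighbour (xy-1) was just written by the previous
         * iteration; with a write-allocate cache it is already resident. */
        PREFETCH( &s->cbp[top] );
        PREFETCH( &s->nnz[top] );
        /* ... unrelated setup work runs here, hiding the memory latency ... */
        return s->cbp[top];         /* by now this load should hit L1 */
    }

The design point, per the commit's own comments, is the asymmetry: only the top row
is worth prefetching, and issuing the prefetches at the start of cache_load gives the
lines time to arrive before the real loads consume them.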