From: Fiona Glaser
Date: Sun, 23 Feb 2014 18:36:55 +0000 (-0800)
Subject: Macroblock tree overhaul/optimization
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=b3fb718404d6cce9c82987ea2909cda5072d040c;p=libx264

Macroblock tree overhaul/optimization

Move the second core part of macroblock tree into an assembly function;
SIMD-optimize roughly half of it (for x86).
Roughly 25-65% faster mbtree, depending on content.

Slightly change how mbtree handles the tradeoff between range and precision
for propagation. Overall a slight (but mostly negligible) effect on SSIM
and ~2% faster.
---

diff --git a/common/macroblock.c b/common/macroblock.c
index 8437b729..8494bfe1 100644
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -389,7 +389,7 @@ int x264_macroblock_thread_allocate( x264_t *h, int b_lookahead )
                        ((me_range*2+24) * sizeof(int16_t) + (me_range+4) * (me_range+1) * 4 * sizeof(mvsad_t));
         scratch_size = X264_MAX3( buf_hpel, buf_ssim, buf_tesa );
     }
-    int buf_mbtree = h->param.rc.b_mb_tree * ((h->mb.i_mb_width+7)&~7) * sizeof(int);
+    int buf_mbtree = h->param.rc.b_mb_tree * ((h->mb.i_mb_width+7)&~7) * sizeof(int16_t);
     scratch_size = X264_MAX( scratch_size, buf_mbtree );
     if( scratch_size )
         CHECKED_MALLOC( h->scratch_buffer, scratch_size );
@@ -397,7 +397,9 @@ int x264_macroblock_thread_allocate( x264_t *h, int b_lookahead )
         h->scratch_buffer = NULL;
 
     int buf_lookahead_threads = (h->mb.i_mb_height + (4 + 32) * h->param.i_lookahead_threads) * sizeof(int) * 2;
-    CHECKED_MALLOC( h->scratch_buffer2, buf_lookahead_threads );
+    int buf_mbtree2 = buf_mbtree * 12; /* size of the internal propagate_list asm buffer */
+    scratch_size = X264_MAX( buf_lookahead_threads, buf_mbtree2 );
+    CHECKED_MALLOC( h->scratch_buffer2, scratch_size );
 
     return 0;
 fail:
diff --git a/common/mc.c b/common/mc.c
index 71474965..6797f0ac 100644
--- a/common/mc.c
+++ b/common/mc.c
@@ -483,20 +483,97 @@ static void frame_init_lowres_core( pixel *src0, pixel *dst0, pixel *dsth, pixel
 
 /* Estimate the total amount of influence on future quality that could be had if we
  * were to improve the reference samples used to inter predict any given macroblock.
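+ * (The per-row result is now stored as int16_t, saturated to 32767, halving the mbtree scratch buffer.)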
  */
-static void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
+static void mbtree_propagate_cost( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
                                    uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len )
 {
     float fps = *fps_factor;
     for( int i = 0; i < len; i++ )
     {
-        float intra_cost = intra_costs[i] * inv_qscales[i];
-        float propagate_amount = propagate_in[i] + intra_cost*fps;
-        float propagate_num = intra_costs[i] - (inter_costs[i] & LOWRES_COST_MASK);
-        float propagate_denom = intra_costs[i];
-        dst[i] = (int)(propagate_amount * propagate_num / propagate_denom + 0.5f);
+        int intra_cost = intra_costs[i];
+        int inter_cost = X264_MIN(intra_costs[i], inter_costs[i] & LOWRES_COST_MASK);
+        float propagate_intra = intra_cost * inv_qscales[i];
+        float propagate_amount = propagate_in[i] + propagate_intra*fps;
+        float propagate_num = intra_cost - inter_cost;
+        float propagate_denom = intra_cost;
+        dst[i] = X264_MIN((int)(propagate_amount * propagate_num / propagate_denom + 0.5f), 32767);
     }
 }
 
+static void mbtree_propagate_list( x264_t *h, uint16_t *ref_costs, int16_t (*mvs)[2],
+                                   int16_t *propagate_amount, uint16_t *lowres_costs,
+                                   int bipred_weight, int mb_y, int len, int list )
+{
+    unsigned stride = h->mb.i_mb_stride;
+    unsigned width = h->mb.i_mb_width;
+    unsigned height = h->mb.i_mb_height;
+
+    for( unsigned i = 0; i < len; i++ )
+    {
+#define CLIP_ADD(s,x) (s) = X264_MIN((s)+(x),(1<<15)-1)
+        int lists_used = lowres_costs[i]>>LOWRES_COST_SHIFT;
+
+        if( !(lists_used & (1 << list)) )
+            continue;
+
+        int listamount = propagate_amount[i];
+        /* Apply bipred weighting. */
+        if( lists_used == 3 )
+            listamount = (listamount * bipred_weight + 32) >> 6;
+
+        /* Early termination for simple case of mv0. */
+        if( !M32( mvs[i] ) )
+        {
+            CLIP_ADD( ref_costs[mb_y*stride + i], listamount );
+            continue;
+        }
+
+        int x = mvs[i][0];
+        int y = mvs[i][1];
+        unsigned mbx = (x>>5)+i;
+        unsigned mby = (y>>5)+mb_y;
+        unsigned idx0 = mbx + mby * stride;
+        unsigned idx2 = idx0 + stride;
+        x &= 31;
+        y &= 31;
+        int idx0weight = (32-y)*(32-x);
+        int idx1weight = (32-y)*x;
+        int idx2weight = y*(32-x);
+        int idx3weight = y*x;
+        idx0weight = (idx0weight * listamount + 512) >> 10;
+        idx1weight = (idx1weight * listamount + 512) >> 10;
+        idx2weight = (idx2weight * listamount + 512) >> 10;
+        idx3weight = (idx3weight * listamount + 512) >> 10;
+
+        if( mbx < width-1 && mby < height-1 )
+        {
+            CLIP_ADD( ref_costs[idx0+0], idx0weight );
+            CLIP_ADD( ref_costs[idx0+1], idx1weight );
+            CLIP_ADD( ref_costs[idx2+0], idx2weight );
+            CLIP_ADD( ref_costs[idx2+1], idx3weight );
+        }
+        else
+        {
+            /* Note: this takes advantage of unsigned representation to
+             * catch negative mbx/mby.
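+             * (A negative value wraps to a huge unsigned number, so the
+             * mbx < width and mby < height tests below reject it for free.)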
+             */
+            if( mby < height )
+            {
+                if( mbx < width )
+                    CLIP_ADD( ref_costs[idx0+0], idx0weight );
+                if( mbx+1 < width )
+                    CLIP_ADD( ref_costs[idx0+1], idx1weight );
+            }
+            if( mby+1 < height )
+            {
+                if( mbx < width )
+                    CLIP_ADD( ref_costs[idx2+0], idx2weight );
+                if( mbx+1 < width )
+                    CLIP_ADD( ref_costs[idx2+1], idx3weight );
+            }
+        }
+    }
+#undef CLIP_ADD
+}
+
 void x264_mc_init( int cpu, x264_mc_functions_t *pf, int cpu_independent )
 {
     pf->mc_luma = mc_luma;
@@ -552,6 +629,7 @@ void x264_mc_init( int cpu, x264_mc_functions_t *pf, int cpu_independent )
     pf->integral_init8v = integral_init8v;
 
     pf->mbtree_propagate_cost = mbtree_propagate_cost;
+    pf->mbtree_propagate_list = mbtree_propagate_list;
 
 #if HAVE_MMX
     x264_mc_init_mmx( cpu, pf );
@@ -565,7 +643,10 @@ void x264_mc_init( int cpu, x264_mc_functions_t *pf, int cpu_independent )
 #endif
 
     if( cpu_independent )
+    {
         pf->mbtree_propagate_cost = mbtree_propagate_cost;
+        pf->mbtree_propagate_list = mbtree_propagate_list;
+    }
 }
 
 void x264_frame_filter( x264_t *h, x264_frame_t *frame, int mb_y, int b_end )
diff --git a/common/mc.h b/common/mc.h
index 054ba60e..1e97499a 100644
--- a/common/mc.h
+++ b/common/mc.h
@@ -122,8 +122,12 @@ typedef struct
     weight_fn_t *offsetsub;
     void (*weight_cache)( x264_t *, x264_weight_t * );
 
-    void (*mbtree_propagate_cost)( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
+    void (*mbtree_propagate_cost)( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
                                    uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
+
+    void (*mbtree_propagate_list)( x264_t *h, uint16_t *ref_costs, int16_t (*mvs)[2],
+                                   int16_t *propagate_amount, uint16_t *lowres_costs,
+                                   int bipred_weight, int mb_y, int len, int list );
 } x264_mc_functions_t;
 
 void x264_mc_init( int cpu, x264_mc_functions_t *pf, int cpu_independent );
diff --git a/common/x86/const-a.asm b/common/x86/const-a.asm
index fafe567d..19c55b23 100644
--- a/common/x86/const-a.asm
+++ b/common/x86/const-a.asm
@@ -36,6 +36,7 @@ const pw_32, times 16 dw 32
 const pw_512, times 16 dw 512
 const pw_00ff, times 16 dw 0x00ff
 const pw_pixel_max,times 16 dw ((1 << BIT_DEPTH)-1)
+const pw_0to15, dw 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
 const pd_1, times 8 dd 1
 const deinterleave_shufd, dd 0,4,1,5,2,6,3,7
 const pb_unpackbd1, times 2 db 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3
diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm
index cc70ed12..d56ad4f6 100644
--- a/common/x86/mc-a2.asm
+++ b/common/x86/mc-a2.asm
@@ -32,6 +32,7 @@
 
 SECTION_RODATA 32
 
+pw_1024: times 16 dw 1024
 filt_mul20: times 32 db 20
 filt_mul15: times 16 db 1, -5
 filt_mul51: times 16 db -5, 1
@@ -56,8 +57,6 @@ deinterleave_shuf32a: db 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30
 deinterleave_shuf32b: db 1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31
 %endif ; !HIGH_BIT_DEPTH
 
-pw_1024: times 16 dw 1024
-
 pd_16: times 4 dd 16
 pd_0f: times 4 dd 0xffff
@@ -70,16 +69,22 @@ tap1: times 4 dw 1, -5
 tap2: times 4 dw 20, 20
 tap3: times 4 dw -5, 1
 
+pw_0xc000: times 8 dw 0xc000
+pw_31: times 8 dw 31
+pd_4: times 4 dd 4
+
 SECTION .text
 
 cextern pb_0
 cextern pw_1
+cextern pw_8
 cextern pw_16
 cextern pw_32
 cextern pw_512
 cextern pw_00ff
 cextern pw_3fff
 cextern pw_pixel_max
+cextern pw_0to15
 cextern pd_ffff
 
 %macro LOAD_ADD 4
@@ -1986,7 +1991,7 @@ FRAME_INIT_LOWRES
 cglobal mbtree_propagate_cost, 6,6,7
     movss m6, [r5]
     mov r5d, r6m
-    lea r0, [r0+r5*4]
+    lea r0, [r0+r5*2]
     add r5d, r5d
     add r1, r5
     add r2, r5
@@ -2001,10 +2006,11 @@ cglobal mbtree_propagate_cost, 6,6,7
     movq m0, [r4+r5] ; invq
     movq m3, [r3+r5] ; inter
     movq m1, [r1+r5] ; prop
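+    ; clamp inter cost to intra cost, matching X264_MIN() in the C version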
+    pand m3, m5
+    pminsw m3, m2
     punpcklwd m2, m4
     punpcklwd m0, m4
     pmaddwd m0, m2
-    pand m3, m5
     punpcklwd m1, m4
     punpcklwd m3, m4
 %if cpuflag(fma4)
@@ -2037,7 +2043,8 @@ cglobal mbtree_propagate_cost, 6,6,7
     mulps m0, m3 ; / intra
 %endif
     cvtps2dq m0, m0
-    mova [r0+r5*2], m0
+    packssdw m0, m0
+    movh [r0+r5], m0
     add r5, 8
     jl .loop
     RET
@@ -2060,7 +2067,7 @@ MBTREE
 cglobal mbtree_propagate_cost, 6,6,%1
     vbroadcastss m6, [r5]
     mov r5d, r6m
-    lea r0, [r0+r5*4]
+    lea r0, [r0+r5*2]
     add r5d, r5d
     add r1, r5
     add r2, r5
@@ -2078,6 +2085,7 @@ cglobal mbtree_propagate_cost, 6,6,%1
     pmovzxwd m2, [r1+r5] ; prop
     pand xm3, xm5, [r3+r5] ; inter
     pmovzxwd m3, xm3
+    pminsd m3, m0
    pmaddwd m1, m0
     psubd m4, m0, m3
     cvtdq2ps m0, m0
@@ -2096,6 +2104,7 @@ cglobal mbtree_propagate_cost, 6,6,%1
     movu xm1, [r4+r5]
     movu xm2, [r1+r5]
     pand xm3, xm5, [r3+r5]
+    pminsw xm3, xm0
     INT16_UNPACK 0
     INT16_UNPACK 1
     INT16_UNPACK 2
@@ -2117,7 +2126,9 @@ cglobal mbtree_propagate_cost, 6,6,%1
     mulps m1, m3 ; / intra
 %endif
     vcvtps2dq m1, m1
-    mova [r0+r5*2], m1
+    vextractf128 xm2, m1, 1
+    packssdw xm1, xm2
+    mova [r0+r5], xm1
     add r5, 16
     jl .loop
     RET
@@ -2127,3 +2138,95 @@ INIT_YMM avx
 MBTREE_AVX 8
 INIT_YMM avx2,fma3
 MBTREE_AVX 7
+
+%macro MBTREE_PROPAGATE_LIST 0
+;-----------------------------------------------------------------------------
+; void mbtree_propagate_list_internal( int16_t (*mvs)[2], int16_t *propagate_amount, uint16_t *lowres_costs,
+;                                      int16_t *output, int bipred_weight, int mb_y, int len )
+;-----------------------------------------------------------------------------
+cglobal mbtree_propagate_list_internal, 4,6,8
+    movh m6, [pw_0to15] ; mb_x
+    movd m7, r5m
+    pshuflw m7, m7, 0
+    punpcklwd m6, m7 ; 0 y 1 y 2 y 3 y
+    movd m7, r4m
+    SPLATW m7, m7 ; bipred_weight
+    psllw m7, 9 ; bipred_weight << 9
+
+    mov r5d, r6m
+    xor r4d, r4d
+.loop:
+    mova m3, [r1+r4*2]
+    movu m4, [r2+r4*2]
+    mova m5, [pw_0xc000]
+    pand m4, m5
+    pcmpeqw m4, m5
+    pmulhrsw m5, m3, m7 ; propagate_amount = (propagate_amount * bipred_weight + 32) >> 6
+%if cpuflag(avx)
+    pblendvb m5, m3, m5, m4
+%else
+    pand m5, m4
+    pandn m4, m3
+    por m5, m4 ; if( lists_used == 3 )
+               ;     propagate_amount = (propagate_amount * bipred_weight + 32) >> 6
+%endif
+
+    movu m0, [r0+r4*4] ; x,y
+    movu m1, [r0+r4*4+mmsize]
+
+    psraw m2, m0, 5
+    psraw m3, m1, 5
+    mova m4, [pd_4]
+    paddw m2, m6 ; {mbx, mby} = ({x,y}>>5)+{h->mb.i_mb_x,h->mb.i_mb_y}
+    paddw m6, m4 ; {mbx, mby} += {4, 0}
+    paddw m3, m6 ; {mbx, mby} = ({x,y}>>5)+{h->mb.i_mb_x,h->mb.i_mb_y}
+    paddw m6, m4 ; {mbx, mby} += {4, 0}
+
+    mova [r3+mmsize*0], m2
+    mova [r3+mmsize*1], m3
+
+    mova m3, [pw_31]
+    pand m0, m3 ; x &= 31
+    pand m1, m3 ; y &= 31
+    packuswb m0, m1
+    psrlw m1, m0, 3
+    pand m0, m3 ; x
+    SWAP 1, 3
+    pandn m1, m3 ; y premultiplied by (1<<5) for later use of pmulhrsw
+
+    mova m3, [pw_32]
+    psubw m3, m0 ; 32 - x
+    mova m4, [pw_1024]
+    psubw m4, m1 ; (32 - y) << 5
+
+    pmullw m2, m3, m4 ; idx0weight = (32-y)*(32-x) << 5
+    pmullw m4, m0 ; idx1weight = (32-y)*x << 5
+    pmullw m0, m1 ; idx3weight = y*x << 5
+    pmullw m1, m3 ; idx2weight = y*(32-x) << 5
+
+    ; avoid overflow in the input to pmulhrsw
+    psrlw m3, m2, 15
+    psubw m2, m3 ; idx0weight -= (idx0weight == 32768)
+
+    pmulhrsw m2, m5 ; idx0weight * propagate_amount + 512 >> 10
+    pmulhrsw m4, m5 ; idx1weight * propagate_amount + 512 >> 10
+    pmulhrsw m1, m5 ; idx2weight * propagate_amount + 512 >> 10
+    pmulhrsw m0, m5 ; idx3weight * propagate_amount + 512 >> 10
+
+    SBUTTERFLY wd, 2, 4, 3
+    SBUTTERFLY wd, 1, 0, 3
+    mova [r3+mmsize*2], m2
+    mova [r3+mmsize*3], m4
+    mova [r3+mmsize*4], m1
+    mova [r3+mmsize*5], m0
+    add r4d, mmsize/2
+    add r3, mmsize*6
+    cmp r4d, r5d
+    jl .loop
+    REP_RET
+%endmacro
+
+INIT_XMM ssse3
+MBTREE_PROPAGATE_LIST
+INIT_XMM avx
+MBTREE_PROPAGATE_LIST
diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c
index d38fcc16..9bd990ce 100644
--- a/common/x86/mc-c.c
+++ b/common/x86/mc-c.c
@@ -161,13 +161,13 @@ void x264_integral_init4v_avx2( uint16_t *sum8, uint16_t *sum4, intptr_t stride
 void x264_integral_init8v_mmx ( uint16_t *sum8, intptr_t stride );
 void x264_integral_init8v_sse2( uint16_t *sum8, intptr_t stride );
 void x264_integral_init8v_avx2( uint16_t *sum8, intptr_t stride );
-void x264_mbtree_propagate_cost_sse2( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
+void x264_mbtree_propagate_cost_sse2( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
                                       uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
-void x264_mbtree_propagate_cost_avx ( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
+void x264_mbtree_propagate_cost_avx ( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
                                       uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
-void x264_mbtree_propagate_cost_fma4( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
+void x264_mbtree_propagate_cost_fma4( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
                                       uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
-void x264_mbtree_propagate_cost_avx2_fma3( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
+void x264_mbtree_propagate_cost_avx2_fma3( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
                                            uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
 
 #define MC_CHROMA(cpu)\
@@ -533,6 +533,113 @@ PLANE_INTERLEAVE(sse2)
 PLANE_INTERLEAVE(avx)
 #endif
 
+#if HAVE_X86_INLINE_ASM
+#define CLIP_ADD(s,x)\
+do\
+{\
+    int temp;\
+    asm("movd %0, %%xmm0 \n"\
+        "movd %2, %%xmm1 \n"\
+        "paddsw %%xmm1, %%xmm0 \n"\
+        "movd %%xmm0, %1 \n"\
+        :"+m"(s), "=&r"(temp)\
+        :"m"(x)\
+    );\
+    s = temp;\
+} while(0)
+
+#define CLIP_ADD2(s,x)\
+do\
+{\
+    asm("movd %0, %%xmm0 \n"\
+        "movd %1, %%xmm1 \n"\
+        "paddsw %%xmm1, %%xmm0 \n"\
+        "movd %%xmm0, %0 \n"\
+        :"+m"(M32(s))\
+        :"m"(M32(x))\
+    );\
+} while(0)
+#else
+#define CLIP_ADD(s,x) (s) = X264_MIN((s)+(x),(1<<15)-1)
+#define CLIP_ADD2(s,x)\
+do\
+{\
+    CLIP_ADD((s)[0], (x)[0]);\
+    CLIP_ADD((s)[1], (x)[1]);\
+} while(0)
+#endif
+
+#define PROPAGATE_LIST(cpu)\
+void x264_mbtree_propagate_list_internal_##cpu( int16_t (*mvs)[2], int16_t *propagate_amount,\
+                                                uint16_t *lowres_costs, int16_t *output,\
+                                                int bipred_weight, int mb_y, int len );\
+\
+static void x264_mbtree_propagate_list_##cpu( x264_t *h, uint16_t *ref_costs, int16_t (*mvs)[2],\
+                                              int16_t *propagate_amount, uint16_t *lowres_costs,\
+                                              int bipred_weight, int mb_y, int len, int list )\
+{\
+    int16_t *current = h->scratch_buffer2;\
+\
+    x264_mbtree_propagate_list_internal_##cpu( mvs, propagate_amount, lowres_costs,\
+                                               current, bipred_weight, mb_y, len );\
+\
+    unsigned stride = h->mb.i_mb_stride;\
+    unsigned width = h->mb.i_mb_width;\
+    unsigned height = h->mb.i_mb_height;\
+\
+    for( unsigned i = 0; i < len; current += 32 )\
+    {\
+        int end = X264_MIN( i+8, len );\
+        for( ; i < end; i++, current += 2 )\
+        {\
+            if( !(lowres_costs[i] & (1 << (list+LOWRES_COST_SHIFT))) )\
+                continue;\
+\
+            unsigned mbx = current[0];\
+            unsigned mby = current[1];\
+            unsigned idx0 = mbx + mby * stride;\
+            unsigned idx2 = idx0 + stride;\
+\
+            /* Shortcut for the simple/common case of zero MV */\
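+            /* (M32 reads both int16_t MV components as a single 32-bit word.) */\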
+            if( !M32( mvs[i] ) )\
+            {\
+                CLIP_ADD( ref_costs[idx0], current[16] );\
+                continue;\
+            }\
+\
+            if( mbx < width-1 && mby < height-1 )\
+            {\
+                CLIP_ADD2( ref_costs+idx0, current+16 );\
+                CLIP_ADD2( ref_costs+idx2, current+32 );\
+            }\
+            else\
+            {\
+                /* Note: this takes advantage of unsigned representation to\
+                 * catch negative mbx/mby. */\
+                if( mby < height )\
+                {\
+                    if( mbx < width )\
+                        CLIP_ADD( ref_costs[idx0+0], current[16] );\
+                    if( mbx+1 < width )\
+                        CLIP_ADD( ref_costs[idx0+1], current[17] );\
+                }\
+                if( mby+1 < height )\
+                {\
+                    if( mbx < width )\
+                        CLIP_ADD( ref_costs[idx2+0], current[32] );\
+                    if( mbx+1 < width )\
+                        CLIP_ADD( ref_costs[idx2+1], current[33] );\
+                }\
+            }\
+        }\
+    }\
+}
+
+PROPAGATE_LIST(ssse3)
+PROPAGATE_LIST(avx)
+#undef CLIP_ADD
+#undef CLIP_ADD2
+
 void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
 {
     if( !(cpu&X264_CPU_MMX) )
@@ -645,6 +752,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
         pf->frame_init_lowres_core = x264_frame_init_lowres_core_ssse3;
 
         pf->plane_copy_deinterleave_v210 = x264_plane_copy_deinterleave_v210_ssse3;
+        pf->mbtree_propagate_list = x264_mbtree_propagate_list_ssse3;
 
         if( !(cpu&(X264_CPU_SLOW_SHUFFLE|X264_CPU_SLOW_ATOM|X264_CPU_SLOW_PALIGNR)) )
             pf->integral_init4v = x264_integral_init4v_ssse3;
@@ -748,6 +856,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
         pf->avg[PIXEL_4x4] = x264_pixel_avg_4x4_ssse3;
         pf->avg[PIXEL_4x2] = x264_pixel_avg_4x2_ssse3;
         pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_ssse3;
+        pf->mbtree_propagate_list = x264_mbtree_propagate_list_ssse3;
 
         if( !(cpu&X264_CPU_SLOW_PSHUFB) )
         {
@@ -824,6 +933,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
         return;
     pf->memzero_aligned = x264_memzero_aligned_avx;
     pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx;
+    pf->mbtree_propagate_list = x264_mbtree_propagate_list_avx;
 
     if( cpu&X264_CPU_FMA4 )
         pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_fma4;
diff --git a/common/x86/predict-a.asm b/common/x86/predict-a.asm
index c0fabfa8..98768442 100644
--- a/common/x86/predict-a.asm
+++ b/common/x86/predict-a.asm
@@ -31,7 +31,6 @@
 
 SECTION_RODATA 32
 
-pw_0to15: dw 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
 pw_43210123: times 2 dw -3, -2, -1, 0, 1, 2, 3, 4
 pw_m3: times 16 dw -3
 pw_m7: times 16 dw -7
@@ -56,6 +55,7 @@ cextern pw_8
 cextern pw_16
 cextern pw_00ff
 cextern pw_pixel_max
+cextern pw_0to15
 
 %macro STORE8 1
     mova [r0+0*FDEC_STRIDEB], %1
diff --git a/encoder/slicetype.c b/encoder/slicetype.c
index b3da7420..0f4d831a 100644
--- a/encoder/slicetype.c
+++ b/encoder/slicetype.c
@@ -1022,9 +1022,12 @@ static int x264_slicetype_frame_cost_recalculate( x264_t *h, x264_frame_t **fram
     return i_score;
 }
 
+/* Trade off precision in mbtree for increased range */
+#define MBTREE_PRECISION 0.5f
+
 static void x264_macroblock_tree_finish( x264_t *h, x264_frame_t *frame, float average_duration, int ref0_distance )
 {
-    int fps_factor = round( CLIP_DURATION(average_duration) / CLIP_DURATION(frame->f_duration) * 256 );
+    int fps_factor = round( CLIP_DURATION(average_duration) / CLIP_DURATION(frame->f_duration) * 256 / MBTREE_PRECISION );
     float weightdelta = 0.0;
     if( ref0_distance && frame->f_weighted_cost_delta[ref0_distance-1] > 0 )
         weightdelta = (1.0 - frame->f_weighted_cost_delta[ref0_distance-1]);
@@ -1051,11 +1054,12 @@ static void x264_macroblock_tree_propagate( x264_t *h, x264_frame_t **frames, fl
     int i_bipred_weight = h->param.analyse.b_weighted_bipred ?
                          64 - (dist_scale_factor>>2) : 32;
     int16_t (*mvs[2])[2] = { frames[b]->lowres_mvs[0][b-p0-1], frames[b]->lowres_mvs[1][p1-b-1] };
     int bipred_weights[2] = {i_bipred_weight, 64 - i_bipred_weight};
-    int *buf = h->scratch_buffer;
+    int16_t *buf = h->scratch_buffer;
     uint16_t *propagate_cost = frames[b]->i_propagate_cost;
+    uint16_t *lowres_costs = frames[b]->lowres_costs[b-p0][p1-b];
 
     x264_emms();
-    float fps_factor = CLIP_DURATION(frames[b]->f_duration) / (CLIP_DURATION(average_duration) * 256.0f);
+    float fps_factor = CLIP_DURATION(frames[b]->f_duration) / (CLIP_DURATION(average_duration) * 256.0f) * MBTREE_PRECISION;
 
     /* For non-reffed frames the source costs are always zero, so just memset one row and re-use it. */
     if( !referenced )
@@ -1065,72 +1069,17 @@ static void x264_macroblock_tree_propagate( x264_t *h, x264_frame_t **frames, fl
     {
         int mb_index = h->mb.i_mb_y*h->mb.i_mb_stride;
         h->mc.mbtree_propagate_cost( buf, propagate_cost,
-            frames[b]->i_intra_cost+mb_index, frames[b]->lowres_costs[b-p0][p1-b]+mb_index,
+            frames[b]->i_intra_cost+mb_index, lowres_costs+mb_index,
             frames[b]->i_inv_qscale_factor+mb_index, &fps_factor, h->mb.i_mb_width );
         if( referenced )
             propagate_cost += h->mb.i_mb_width;
-        for( h->mb.i_mb_x = 0; h->mb.i_mb_x < h->mb.i_mb_width; h->mb.i_mb_x++, mb_index++ )
+
+        h->mc.mbtree_propagate_list( h, ref_costs[0], &mvs[0][mb_index], buf, &lowres_costs[mb_index],
+                                     bipred_weights[0], h->mb.i_mb_y, h->mb.i_mb_width, 0 );
+        if( b != p1 )
         {
-            int propagate_amount = buf[h->mb.i_mb_x];
-            /* Don't propagate for an intra block. */
-            if( propagate_amount > 0 )
-            {
-                /* Access width-2 bitfield. */
-                int lists_used = frames[b]->lowres_costs[b-p0][p1-b][mb_index] >> LOWRES_COST_SHIFT;
-                /* Follow the MVs to the previous frame(s). */
-                for( int list = 0; list < 2; list++ )
-                    if( (lists_used >> list)&1 )
-                    {
-#define CLIP_ADD(s,x) (s) = X264_MIN((s)+(x),(1<<16)-1)
-                        int listamount = propagate_amount;
-                        /* Apply bipred weighting. */
-                        if( lists_used == 3 )
-                            listamount = (listamount * bipred_weights[list] + 32) >> 6;
-
-                        /* Early termination for simple case of mv0. */
-                        if( !M32( mvs[list][mb_index] ) )
-                        {
-                            CLIP_ADD( ref_costs[list][mb_index], listamount );
-                            continue;
-                        }
-
-                        int x = mvs[list][mb_index][0];
-                        int y = mvs[list][mb_index][1];
-                        int mbx = (x>>5)+h->mb.i_mb_x;
-                        int mby = (y>>5)+h->mb.i_mb_y;
-                        int idx0 = mbx + mby * h->mb.i_mb_stride;
-                        int idx1 = idx0 + 1;
-                        int idx2 = idx0 + h->mb.i_mb_stride;
-                        int idx3 = idx0 + h->mb.i_mb_stride + 1;
-                        x &= 31;
-                        y &= 31;
-                        int idx0weight = (32-y)*(32-x);
-                        int idx1weight = (32-y)*x;
-                        int idx2weight = y*(32-x);
-                        int idx3weight = y*x;
-
-                        /* We could just clip the MVs, but pixels that lie outside the frame probably shouldn't
-                         * be counted.
-                         */
-                        if( mbx < h->mb.i_mb_width-1 && mby < h->mb.i_mb_height-1 && mbx >= 0 && mby >= 0 )
-                        {
-                            CLIP_ADD( ref_costs[list][idx0], (listamount*idx0weight+512)>>10 );
-                            CLIP_ADD( ref_costs[list][idx1], (listamount*idx1weight+512)>>10 );
-                            CLIP_ADD( ref_costs[list][idx2], (listamount*idx2weight+512)>>10 );
-                            CLIP_ADD( ref_costs[list][idx3], (listamount*idx3weight+512)>>10 );
-                        }
-                        else /* Check offsets individually */
-                        {
-                            if( mbx < h->mb.i_mb_width && mby < h->mb.i_mb_height && mbx >= 0 && mby >= 0 )
-                                CLIP_ADD( ref_costs[list][idx0], (listamount*idx0weight+512)>>10 );
-                            if( mbx+1 < h->mb.i_mb_width && mby < h->mb.i_mb_height && mbx+1 >= 0 && mby >= 0 )
-                                CLIP_ADD( ref_costs[list][idx1], (listamount*idx1weight+512)>>10 );
-                            if( mbx < h->mb.i_mb_width && mby+1 < h->mb.i_mb_height && mbx >= 0 && mby+1 >= 0 )
-                                CLIP_ADD( ref_costs[list][idx2], (listamount*idx2weight+512)>>10 );
-                            if( mbx+1 < h->mb.i_mb_width && mby+1 < h->mb.i_mb_height && mbx+1 >= 0 && mby+1 >= 0 )
-                                CLIP_ADD( ref_costs[list][idx3], (listamount*idx3weight+512)>>10 );
-                        }
-                    }
-            }
+            h->mc.mbtree_propagate_list( h, ref_costs[1], &mvs[1][mb_index], buf, &lowres_costs[mb_index],
+                                         bipred_weights[1], h->mb.i_mb_y, h->mb.i_mb_width, 1 );
         }
     }
diff --git a/tools/checkasm.c b/tools/checkasm.c
index adb73d51..f72b7a00 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -1598,16 +1598,17 @@ static int check_mc( int cpu_ref, int cpu_new )
     INTEGRAL_INIT( integral_init8v, 9, sum, stride );
     report( "integral init :" );
 
+    ok = 1; used_asm = 0;
     if( mc_a.mbtree_propagate_cost != mc_ref.mbtree_propagate_cost )
     {
-        ok = 1; used_asm = 1;
+        used_asm = 1;
         x264_emms();
         for( int i = 0; i < 10; i++ )
         {
             float fps_factor = (rand()&65535) / 65535.0f;
-            set_func_name( "mbtree_propagate" );
-            int *dsta = (int*)buf3;
-            int *dstc = dsta+400;
+            set_func_name( "mbtree_propagate_cost" );
+            int16_t *dsta = (int16_t*)buf3;
+            int16_t *dstc = dsta+400;
             uint16_t *prop = (uint16_t*)buf1;
             uint16_t *intra = (uint16_t*)buf4;
             uint16_t *inter = intra+128;
@@ -1629,12 +1630,60 @@ static int check_mc( int cpu_ref, int cpu_new )
             {
                 ok &= abs( dstc[j]-dsta[j] ) <= 1 || fabs( (double)dstc[j]/dsta[j]-1 ) < 1e-4;
                 if( !ok )
-                    fprintf( stderr, "mbtree_propagate FAILED: %f !~= %f\n", (double)dstc[j], (double)dsta[j] );
+                    fprintf( stderr, "mbtree_propagate_cost FAILED: %f !~= %f\n", (double)dstc[j], (double)dsta[j] );
             }
         }
-        report( "mbtree propagate :" );
     }
 
+    if( mc_a.mbtree_propagate_list != mc_ref.mbtree_propagate_list )
+    {
+        used_asm = 1;
+        for( int i = 0; i < 8; i++ )
+        {
+            set_func_name( "mbtree_propagate_list" );
+            x264_t h;
+            int height = 4;
+            int width = 128;
+            int size = width*height;
+            h.mb.i_mb_stride = width;
+            h.mb.i_mb_width = width;
+            h.mb.i_mb_height = height;
+
+            uint16_t *ref_costsc = (uint16_t*)buf3;
+            uint16_t *ref_costsa = (uint16_t*)buf4;
+            int16_t (*mvs)[2] = (int16_t(*)[2])(ref_costsc + size);
+            int16_t *propagate_amount = (int16_t*)(mvs + width);
+            uint16_t *lowres_costs = (uint16_t*)(propagate_amount + width);
+            h.scratch_buffer2 = (uint8_t*)(ref_costsa + size);
+            int bipred_weight = (rand()%63)+1;
+            int list = i&1;
+            for( int j = 0; j < size; j++ )
+                ref_costsc[j] = ref_costsa[j] = rand()&32767;
+            for( int j = 0; j < width; j++ )
+            {
+                static const uint8_t list_dist[2][8] = {{0,1,1,1,1,1,1,1},{1,1,3,3,3,3,3,2}};
+                for( int k = 0; k < 2; k++ )
+                    mvs[j][k] = (rand()&127) - 64;
+                propagate_amount[j] = rand()&32767;
+                lowres_costs[j] = list_dist[list][rand()&7] << LOWRES_COST_SHIFT;
+            }
+
+            call_c1( mc_c.mbtree_propagate_list, &h, ref_costsc, mvs,
+                     propagate_amount, lowres_costs, bipred_weight, 0, width, list );
+            call_a1( mc_a.mbtree_propagate_list, &h, ref_costsa, mvs,
+                     propagate_amount, lowres_costs, bipred_weight, 0, width, list );
+
+            for( int j = 0; j < size && ok; j++ )
+            {
+                ok &= abs(ref_costsa[j] - ref_costsc[j]) <= 1;
+                if( !ok )
+                    fprintf( stderr, "mbtree_propagate_list FAILED at %d: %d !~= %d\n", j, ref_costsc[j], ref_costsa[j] );
+            }
+
+            call_c2( mc_c.mbtree_propagate_list, &h, ref_costsc, mvs,
+                     propagate_amount, lowres_costs, bipred_weight, 0, width, list );
+            call_a2( mc_a.mbtree_propagate_list, &h, ref_costsa, mvs,
+                     propagate_amount, lowres_costs, bipred_weight, 0, width, list );
+        }
+    }
+    report( "mbtree :" );
+
     if( mc_a.memcpy_aligned != mc_ref.memcpy_aligned )
     {
         set_func_name( "memcpy_aligned" );