From 6371c3a527a337c7521912990c89d0474288e105 Mon Sep 17 00:00:00 2001 From: Fiona Glaser Date: Fri, 18 Jan 2013 22:55:46 -0800 Subject: [PATCH] x86: optimize and clean up predictor checking Branchlessly handle elimination of candidates in MMX roundclip asm. Add a new asm function, similar to roundclip, except without the round part. Optimize and organize the C code, and make both subme>=3 and subme<3 consistent. Add lots of explanatory comments and try to make things a little more understandable. ~5-10% faster with subme>=3, ~15-20% faster with subme<3. --- common/common.h | 47 +++++++++---- common/x86/util.h | 152 +++++++++++++++++++++++++++++++-------- encoder/analyse.c | 12 ++-- encoder/me.c | 168 ++++++++++++++++++++++++++++---------------- encoder/slicetype.c | 16 ++--- 5 files changed, 275 insertions(+), 120 deletions(-) diff --git a/common/common.h b/common/common.h index 64a7f025..39ad5cb1 100644 --- a/common/common.h +++ b/common/common.h @@ -291,17 +291,6 @@ static ALWAYS_INLINE uint16_t x264_cabac_mvd_sum( uint8_t *mvdleft, uint8_t *mvd return amvd0 + (amvd1<<8); } -static void ALWAYS_INLINE x264_predictor_roundclip( int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc, int mv_x_min, int mv_x_max, int mv_y_min, int mv_y_max ) -{ - for( int i = 0; i < i_mvc; i++ ) - { - int mx = (mvc[i][0] + 2) >> 2; - int my = (mvc[i][1] + 2) >> 2; - dst[i][0] = x264_clip3( mx, mv_x_min, mv_x_max ); - dst[i][1] = x264_clip3( my, mv_y_min, mv_y_max ); - } -} - extern const uint8_t x264_exp2_lut[64]; extern const float x264_log2_lut[128]; extern const float x264_log2_lz_lut[32]; @@ -671,8 +660,7 @@ struct x264_t int mv_miny_spel_row[3]; int mv_maxy_spel_row[3]; /* Fullpel MV range for motion search */ - int mv_min_fpel[2]; - int mv_max_fpel[2]; + ALIGNED_8( int16_t mv_limit_fpel[2][2] ); /* min_x, min_y, max_x, max_y */ int mv_miny_fpel_row[3]; int mv_maxy_fpel_row[3]; @@ -952,6 +940,39 @@ struct x264_t // included at the end because it needs x264_t #include "macroblock.h" +static int ALWAYS_INLINE x264_predictor_roundclip( int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc, int16_t mv_limit[2][2], uint32_t pmv ) +{ + int cnt = 0; + for( int i = 0; i < i_mvc; i++ ) + { + int mx = (mvc[i][0] + 2) >> 2; + int my = (mvc[i][1] + 2) >> 2; + uint32_t mv = pack16to32_mask(mx, my); + if( !mv || mv == pmv ) continue; + dst[cnt][0] = x264_clip3( mx, mv_limit[0][0], mv_limit[1][0] ); + dst[cnt][1] = x264_clip3( my, mv_limit[0][1], mv_limit[1][1] ); + cnt++; + } + return cnt; +} + +static int ALWAYS_INLINE x264_predictor_clip( int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc, int16_t mv_limit[2][2], uint32_t pmv ) +{ + int cnt = 0; + int qpel_limit[4] = {mv_limit[0][0] << 2, mv_limit[0][1] << 2, mv_limit[1][0] << 2, mv_limit[1][1] << 2}; + for( int i = 0; i < i_mvc; i++ ) + { + uint32_t mv = M32( mvc[i] ); + int mx = mvc[i][0]; + int my = mvc[i][1]; + if( !mv || mv == pmv ) continue; + dst[cnt][0] = x264_clip3( mx, qpel_limit[0], qpel_limit[2] ); + dst[cnt][1] = x264_clip3( my, qpel_limit[1], qpel_limit[3] ); + cnt++; + } + return cnt; +} + #if ARCH_X86 || ARCH_X86_64 #include "x86/util.h" #endif diff --git a/common/x86/util.h b/common/x86/util.h index fb9912b1..972f0de9 100644 --- a/common/x86/util.h +++ b/common/x86/util.h @@ -121,42 +121,132 @@ static ALWAYS_INLINE uint16_t x264_cabac_mvd_sum_mmx2(uint8_t *mvdleft, uint8_t return amvd; } +#define x264_predictor_clip x264_predictor_clip_mmx2 +static int ALWAYS_INLINE x264_predictor_clip_mmx2( int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc, int16_t mv_limit[2][2], 
uint32_t pmv ) +{ + static const uint32_t pd_32 = 0x20; + intptr_t tmp = (intptr_t)mv_limit, mvc_max = i_mvc, i = 0; + + asm( + "movq (%2), %%mm5 \n" + "movd %6, %%mm3 \n" + "psllw $2, %%mm5 \n" // Convert to subpel + "pshufw $0xEE, %%mm5, %%mm6 \n" + "dec %k3 \n" + "jz 2f \n" // if( i_mvc == 1 ) {do the last iteration} + "punpckldq %%mm3, %%mm3 \n" + "punpckldq %%mm5, %%mm5 \n" + "movd %7, %%mm4 \n" + "lea (%0,%3,4), %3 \n" + "1: \n" + "movq (%0), %%mm0 \n" + "add $8, %0 \n" + "movq %%mm3, %%mm1 \n" + "pxor %%mm2, %%mm2 \n" + "pcmpeqd %%mm0, %%mm1 \n" // mv == pmv + "pcmpeqd %%mm0, %%mm2 \n" // mv == 0 + "por %%mm1, %%mm2 \n" // (mv == pmv || mv == 0) * -1 + "pmovmskb %%mm2, %k2 \n" // (mv == pmv || mv == 0) * 0xf + "pmaxsw %%mm5, %%mm0 \n" + "pminsw %%mm6, %%mm0 \n" + "pand %%mm4, %%mm2 \n" // (mv0 == pmv || mv0 == 0) * 32 + "psrlq %%mm2, %%mm0 \n" // drop mv0 if it's skipped + "movq %%mm0, (%5,%4,4) \n" + "and $24, %k2 \n" + "add $2, %4 \n" + "add $8, %k2 \n" + "shr $4, %k2 \n" // (4-val)>>1 + "sub %2, %4 \n" // +1 for each valid motion vector + "cmp %3, %0 \n" + "jl 1b \n" + "jg 3f \n" // if( i == i_mvc - 1 ) {do the last iteration} + + /* Do the last iteration */ + "2: \n" + "movd (%0), %%mm0 \n" + "pxor %%mm2, %%mm2 \n" + "pcmpeqd %%mm0, %%mm3 \n" + "pcmpeqd %%mm0, %%mm2 \n" + "por %%mm3, %%mm2 \n" + "pmovmskb %%mm2, %k2 \n" + "pmaxsw %%mm5, %%mm0 \n" + "pminsw %%mm6, %%mm0 \n" + "movd %%mm0, (%5,%4,4) \n" + "inc %4 \n" + "and $1, %k2 \n" + "sub %2, %4 \n" // output += !(mv == pmv || mv == 0) + "3: \n" + :"+r"(mvc), "=m"(M64( dst )), "+r"(tmp), "+r"(mvc_max), "+r"(i) + :"r"(dst), "g"(pmv), "m"(pd_32), "m"(M64( mvc )) + ); + return i; +} + +/* Same as the above, except we do (mv + 2) >> 2 on the input. */ #define x264_predictor_roundclip x264_predictor_roundclip_mmx2 -static void ALWAYS_INLINE x264_predictor_roundclip_mmx2( int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc, int mv_x_min, int mv_x_max, int mv_y_min, int mv_y_max ) +static int ALWAYS_INLINE x264_predictor_roundclip_mmx2( int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc, int16_t mv_limit[2][2], uint32_t pmv ) { - uint32_t mv_min = pack16to32_mask( mv_x_min, mv_y_min ); - uint32_t mv_max = pack16to32_mask( mv_x_max, mv_y_max ); static const uint64_t pw_2 = 0x0002000200020002ULL; - intptr_t i = i_mvc; + static const uint32_t pd_32 = 0x20; + intptr_t tmp = (intptr_t)mv_limit, mvc_max = i_mvc, i = 0; + asm( - "movd %2, %%mm5 \n" - "movd %3, %%mm6 \n" - "movq %4, %%mm7 \n" - "punpckldq %%mm5, %%mm5 \n" - "punpckldq %%mm6, %%mm6 \n" - "test $1, %0 \n" - "jz 1f \n" - "movd -4(%6,%0,4), %%mm0 \n" - "paddw %%mm7, %%mm0 \n" - "psraw $2, %%mm0 \n" - "pmaxsw %%mm5, %%mm0 \n" - "pminsw %%mm6, %%mm0 \n" - "movd %%mm0, -4(%5,%0,4) \n" - "dec %0 \n" - "jz 2f \n" - "1: \n" - "movq -8(%6,%0,4), %%mm0 \n" - "paddw %%mm7, %%mm0 \n" - "psraw $2, %%mm0 \n" - "pmaxsw %%mm5, %%mm0 \n" - "pminsw %%mm6, %%mm0 \n" - "movq %%mm0, -8(%5,%0,4) \n" - "sub $2, %0 \n" - "jnz 1b \n" - "2: \n" - :"+r"(i), "=m"(M64( dst )) - :"g"(mv_min), "g"(mv_max), "m"(pw_2), "r"(dst), "r"(mvc), "m"(M64( mvc )) + "movq (%2), %%mm5 \n" + "movq %6, %%mm7 \n" + "movd %7, %%mm3 \n" + "pshufw $0xEE, %%mm5, %%mm6 \n" + "dec %k3 \n" + "jz 2f \n" + "punpckldq %%mm3, %%mm3 \n" + "punpckldq %%mm5, %%mm5 \n" + "movd %8, %%mm4 \n" + "lea (%0,%3,4), %3 \n" + "1: \n" + "movq (%0), %%mm0 \n" + "add $8, %0 \n" + "paddw %%mm7, %%mm0 \n" + "psraw $2, %%mm0 \n" + "movq %%mm3, %%mm1 \n" + "pxor %%mm2, %%mm2 \n" + "pcmpeqd %%mm0, %%mm1 \n" + "pcmpeqd %%mm0, %%mm2 \n" + "por %%mm1, %%mm2 \n" + 
"pmovmskb %%mm2, %k2 \n" + "pmaxsw %%mm5, %%mm0 \n" + "pminsw %%mm6, %%mm0 \n" + "pand %%mm4, %%mm2 \n" + "psrlq %%mm2, %%mm0 \n" + "movq %%mm0, (%5,%4,4) \n" + "and $24, %k2 \n" + "add $2, %4 \n" + "add $8, %k2 \n" + "shr $4, %k2 \n" + "sub %2, %4 \n" + "cmp %3, %0 \n" + "jl 1b \n" + "jg 3f \n" + + /* Do the last iteration */ + "2: \n" + "movd (%0), %%mm0 \n" + "paddw %%mm7, %%mm0 \n" + "psraw $2, %%mm0 \n" + "pxor %%mm2, %%mm2 \n" + "pcmpeqd %%mm0, %%mm3 \n" + "pcmpeqd %%mm0, %%mm2 \n" + "por %%mm3, %%mm2 \n" + "pmovmskb %%mm2, %k2 \n" + "pmaxsw %%mm5, %%mm0 \n" + "pminsw %%mm6, %%mm0 \n" + "movd %%mm0, (%5,%4,4) \n" + "inc %4 \n" + "and $1, %k2 \n" + "sub %2, %4 \n" + "3: \n" + :"+r"(mvc), "=m"(M64( dst )), "+r"(tmp), "+r"(mvc_max), "+r"(i) + :"r"(dst), "m"(pw_2), "g"(pmv), "m"(pd_32), "m"(M64( mvc )) ); + return i; } #endif diff --git a/encoder/analyse.c b/encoder/analyse.c index 39bb6b19..ec942b3c 100644 --- a/encoder/analyse.c +++ b/encoder/analyse.c @@ -467,8 +467,8 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int qp ) if( max_mv > 0 && h->mb.i_mb_x < h->fdec->i_pir_start_col ) h->mb.mv_max_spel[0] = X264_MIN( h->mb.mv_max_spel[0], max_mv ); } - h->mb.mv_min_fpel[0] = (h->mb.mv_min_spel[0]>>2) + i_fpel_border; - h->mb.mv_max_fpel[0] = (h->mb.mv_max_spel[0]>>2) - i_fpel_border; + h->mb.mv_limit_fpel[0][0] = (h->mb.mv_min_spel[0]>>2) + i_fpel_border; + h->mb.mv_limit_fpel[1][0] = (h->mb.mv_max_spel[0]>>2) - i_fpel_border; if( h->mb.i_mb_x == 0 && !(h->mb.i_mb_y & PARAM_INTERLACED) ) { int mb_y = h->mb.i_mb_y >> SLICE_MBAFF; @@ -516,8 +516,8 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int qp ) h->mb.mv_min_spel[1] = x264_clip3( h->mb.mv_min[1], -i_fmv_range, i_fmv_range ); h->mb.mv_max_spel[1] = CLIP_FMV( h->mb.mv_max[1] ); h->mb.mv_max_spel[1] = X264_MIN( h->mb.mv_max_spel[1], thread_mvy_range*4 ); - h->mb.mv_min_fpel[1] = (h->mb.mv_min_spel[1]>>2) + i_fpel_border; - h->mb.mv_max_fpel[1] = (h->mb.mv_max_spel[1]>>2) - i_fpel_border; + h->mb.mv_limit_fpel[0][1] = (h->mb.mv_min_spel[1]>>2) + i_fpel_border; + h->mb.mv_limit_fpel[1][1] = (h->mb.mv_max_spel[1]>>2) - i_fpel_border; } } if( PARAM_INTERLACED ) @@ -527,8 +527,8 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int qp ) h->mb.mv_max[1] = h->mb.mv_maxy_row[i]; h->mb.mv_min_spel[1] = h->mb.mv_miny_spel_row[i]; h->mb.mv_max_spel[1] = h->mb.mv_maxy_spel_row[i]; - h->mb.mv_min_fpel[1] = h->mb.mv_miny_fpel_row[i]; - h->mb.mv_max_fpel[1] = h->mb.mv_maxy_fpel_row[i]; + h->mb.mv_limit_fpel[0][1] = h->mb.mv_miny_fpel_row[i]; + h->mb.mv_limit_fpel[1][1] = h->mb.mv_maxy_fpel_row[i]; } #undef CLIP_FMV diff --git a/encoder/me.c b/encoder/me.c index 49f143c3..65c95a6d 100644 --- a/encoder/me.c +++ b/encoder/me.c @@ -61,21 +61,22 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite (p_cost_mvx[(mx)<<2] + p_cost_mvy[(my)<<2]) #define COST_MV( mx, my )\ +do\ {\ int cost = h->pixf.fpelcmp[i_pixel]( p_fenc, FENC_STRIDE,\ &p_fref_w[(my)*stride+(mx)], stride )\ + BITS_MVD(mx,my);\ COPY3_IF_LT( bcost, cost, bmx, mx, bmy, my );\ -} +} while(0) -#define COST_MV_HPEL( mx, my ) \ -{ \ - intptr_t stride2 = 16; \ - pixel *src = h->mc.get_ref( pix, &stride2, m->p_fref, stride, mx, my, bw, bh, &m->weight[0] ); \ - int cost = h->pixf.fpelcmp[i_pixel]( p_fenc, FENC_STRIDE, src, stride2 ) \ - + p_cost_mvx[ mx ] + p_cost_mvy[ my ]; \ - COPY3_IF_LT( bpred_cost, cost, bpred_mx, mx, bpred_my, my ); \ -} +#define COST_MV_HPEL( mx, my, cost )\ +do\ +{\ + intptr_t 
stride2 = 16;\ + pixel *src = h->mc.get_ref( pix, &stride2, m->p_fref, stride, mx, my, bw, bh, &m->weight[0] );\ + cost = h->pixf.fpelcmp[i_pixel]( p_fenc, FENC_STRIDE, src, stride2 )\ + + p_cost_mvx[ mx ] + p_cost_mvy[ my ];\ +} while(0) #define COST_MV_X3_DIR( m0x, m0y, m1x, m1y, m2x, m2y, costs )\ {\ @@ -174,6 +175,9 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite }\ } +#define FPEL(mv) (((mv)+2)>>2) /* Convert subpel MV to fullpel with rounding... */ +#define SPEL(mv) ((mv)<<2) /* ... and the reverse. */ + void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc, int *p_halfpel_thresh ) { const int bw = x264_pixel_size[m->i_pixel].w; @@ -181,95 +185,135 @@ void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc, const int i_pixel = m->i_pixel; const int stride = m->i_stride[0]; int i_me_range = h->param.analyse.i_me_range; - int bmx, bmy, bcost; + int bmx, bmy, bcost = COST_MAX; int bpred_mx = 0, bpred_my = 0, bpred_cost = COST_MAX; int omx, omy, pmx, pmy; pixel *p_fenc = m->p_fenc[0]; pixel *p_fref_w = m->p_fref_w; ALIGNED_ARRAY_16( pixel, pix,[16*16] ); + ALIGNED_ARRAY_8( int16_t, mvc_temp,[16],[2] ); int costs[16]; - int mv_x_min = h->mb.mv_min_fpel[0]; - int mv_y_min = h->mb.mv_min_fpel[1]; - int mv_x_max = h->mb.mv_max_fpel[0]; - int mv_y_max = h->mb.mv_max_fpel[1]; - int mv_x_min_qpel = mv_x_min << 2; - int mv_y_min_qpel = mv_y_min << 2; - int mv_x_max_qpel = mv_x_max << 2; - int mv_y_max_qpel = mv_y_max << 2; + int mv_x_min = h->mb.mv_limit_fpel[0][0]; + int mv_y_min = h->mb.mv_limit_fpel[0][1]; + int mv_x_max = h->mb.mv_limit_fpel[1][0]; + int mv_y_max = h->mb.mv_limit_fpel[1][1]; /* Special version of pack to allow shortcuts in CHECK_MVRANGE */ #define pack16to32_mask2(mx,my) ((mx<<16)|(my&0x7FFF)) uint32_t mv_min = pack16to32_mask2( -mv_x_min, -mv_y_min ); uint32_t mv_max = pack16to32_mask2( mv_x_max, mv_y_max )|0x8000; + uint32_t pmv; #define CHECK_MVRANGE(mx,my) (!(((pack16to32_mask2(mx,my) + mv_min) | (mv_max - pack16to32_mask2(mx,my))) & 0x80004000)) const uint16_t *p_cost_mvx = m->p_cost_mv - m->mvp[0]; const uint16_t *p_cost_mvy = m->p_cost_mv - m->mvp[1]; - uint32_t pmv; - bmx = x264_clip3( m->mvp[0], mv_x_min_qpel, mv_x_max_qpel ); - bmy = x264_clip3( m->mvp[1], mv_y_min_qpel, mv_y_max_qpel ); - pmx = ( bmx + 2 ) >> 2; - pmy = ( bmy + 2 ) >> 2; - bcost = COST_MAX; - - /* try extra predictors if provided */ + /* Try extra predictors if provided. If subme >= 3, check subpel predictors, + * otherwise round them to fullpel. */ if( h->mb.i_subpel_refine >= 3 ) { - pmv = pack16to32_mask(bmx,bmy); - COST_MV_HPEL( bmx, bmy ); - for( int i = 0; i < i_mvc; i++ ) + /* Calculate and check the MVP first */ + bpred_mx = x264_clip3( m->mvp[0], SPEL(mv_x_min), SPEL(mv_x_max) ); + bpred_my = x264_clip3( m->mvp[1], SPEL(mv_y_min), SPEL(mv_y_max) ); + pmv = pack16to32_mask( bpred_mx, bpred_my ); + pmx = FPEL( bpred_mx ); + pmy = FPEL( bpred_my ); + + COST_MV_HPEL( bpred_mx, bpred_my, bpred_cost ); + int pmv_cost = bpred_cost; + + if( i_mvc > 0 ) { - if( M32( mvc[i] ) && (pmv != M32( mvc[i] )) ) + /* Clip MV candidates and eliminate those equal to zero and pmv. 
*/ + int valid_mvcs = x264_predictor_clip( mvc_temp+2, mvc, i_mvc, h->mb.mv_limit_fpel, pmv ); + if( valid_mvcs > 0 ) { - int mx = x264_clip3( mvc[i][0], mv_x_min_qpel, mv_x_max_qpel ); - int my = x264_clip3( mvc[i][1], mv_y_min_qpel, mv_y_max_qpel ); - COST_MV_HPEL( mx, my ); + int i = 1, cost; + /* We stuff pmv here to branchlessly pick between pmv and the various + * MV candidates. [0] gets skipped in order to maintain alignment for + * x264_predictor_clip. */ + M32( mvc_temp[1] ) = pmv; + bpred_cost <<= 4; + do + { + int mx = mvc_temp[i+1][0]; + int my = mvc_temp[i+1][1]; + COST_MV_HPEL( mx, my, cost ); + COPY1_IF_LT( bpred_cost, (cost << 4) + i ); + } while( ++i <= valid_mvcs ); + bpred_mx = mvc_temp[(bpred_cost&15)+1][0]; + bpred_my = mvc_temp[(bpred_cost&15)+1][1]; + bpred_cost >>= 4; } } - bmx = ( bpred_mx + 2 ) >> 2; - bmy = ( bpred_my + 2 ) >> 2; - COST_MV( bmx, bmy ); + + /* Round the best predictor back to fullpel and get the cost, since this is where + * we'll be starting the fullpel motion search. */ + bmx = FPEL( bpred_mx ); + bmy = FPEL( bpred_my ); + if( (bpred_mx|bpred_my)&0x3 ) /* Only test if the tested predictor is actually subpel... */ + COST_MV( bmx, bmy ); + else /* Otherwise just copy the cost (we already know it) */ + bcost = bpred_cost; + + /* Test the zero vector if it hasn't been tested yet. */ + if( pmv ) + { + if( bmx|bmy ) COST_MV( 0, 0 ); + } + /* If a subpel mv candidate was better than the zero vector, the previous + * fullpel check won't have gotten it even if the pmv was zero. So handle + * that possibility here. */ + else + { + COPY3_IF_LT( bcost, pmv_cost, bmx, 0, bmy, 0 ); + } } else { - /* check the MVP */ - bmx = pmx; - bmy = pmy; + /* Calculate and check the fullpel MVP first */ + bmx = pmx = x264_clip3( FPEL(m->mvp[0]), mv_x_min, mv_x_max ); + bmy = pmy = x264_clip3( FPEL(m->mvp[1]), mv_y_min, mv_y_max ); + pmv = pack16to32_mask( bmx, bmy ); + /* Because we are rounding the predicted motion vector to fullpel, there will be * an extra MV cost in 15 out of 16 cases. However, when the predicted MV is * chosen as the best predictor, it is often the case that the subpel search will - * result in a vector at or next to the predicted motion vector. Therefore, it is - * sensible to omit the cost of the MV from the rounded MVP to avoid unfairly - * biasing against use of the predicted motion vector. */ + * result in a vector at or next to the predicted motion vector. Therefore, we omit + * the cost of the MV from the rounded MVP to avoid unfairly biasing against use of + * the predicted motion vector. + * + * Disclaimer: this is a post-hoc rationalization for why this hack works. */ bcost = h->pixf.fpelcmp[i_pixel]( p_fenc, FENC_STRIDE, &p_fref_w[bmy*stride+bmx], stride ); - pmv = pack16to32_mask( bmx, bmy ); + if( i_mvc > 0 ) { - ALIGNED_ARRAY_8( int16_t, mvc_fpel,[16],[2] ); - x264_predictor_roundclip( mvc_fpel+2, mvc, i_mvc, mv_x_min, mv_x_max, mv_y_min, mv_y_max ); - M32( mvc_fpel[1] ) = pmv; - bcost <<= 4; - for( int i = 1; i <= i_mvc; i++ ) + /* Like in subme>=3, except we also round the candidates to fullpel. 
*/ + int valid_mvcs = x264_predictor_roundclip( mvc_temp+2, mvc, i_mvc, h->mb.mv_limit_fpel, pmv ); + if( valid_mvcs > 0 ) { - if( M32( mvc_fpel[i+1] ) && (pmv != M32( mvc_fpel[i+1] )) ) + int i = 1, cost; + M32( mvc_temp[1] ) = pmv; + bcost <<= 4; + do { - int mx = mvc_fpel[i+1][0]; - int my = mvc_fpel[i+1][1]; - int cost = h->pixf.fpelcmp[i_pixel]( p_fenc, FENC_STRIDE, &p_fref_w[my*stride+mx], stride ) + BITS_MVD( mx, my ); - cost = (cost << 4) + i; - COPY1_IF_LT( bcost, cost ); - } + int mx = mvc_temp[i+1][0]; + int my = mvc_temp[i+1][1]; + cost = h->pixf.fpelcmp[i_pixel]( p_fenc, FENC_STRIDE, &p_fref_w[my*stride+mx], stride ) + BITS_MVD( mx, my ); + COPY1_IF_LT( bcost, (cost << 4) + i ); + } while( ++i <= valid_mvcs ); + bmx = mvc_temp[(bcost&15)+1][0]; + bmy = mvc_temp[(bcost&15)+1][1]; + bcost >>= 4; } - bmx = mvc_fpel[(bcost&15)+1][0]; - bmy = mvc_fpel[(bcost&15)+1][1]; - bcost >>= 4; } - } - COST_MV( 0, 0 ); + /* Same as above, except the condition is simpler. */ + if( pmv ) + COST_MV( 0, 0 ); + } switch( h->mb.i_me_method ) { @@ -733,8 +777,8 @@ void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc, } else { - m->mv[0] = bmx << 2; - m->mv[1] = bmy << 2; + m->mv[0] = SPEL(bmx); + m->mv[1] = SPEL(bmy); m->cost = bcost; } diff --git a/encoder/slicetype.c b/encoder/slicetype.c index e9c7f118..0ecd91e2 100644 --- a/encoder/slicetype.c +++ b/encoder/slicetype.c @@ -472,16 +472,16 @@ static void x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a, goto lowres_intra_mb; // no need for h->mb.mv_min[] - h->mb.mv_min_fpel[0] = -8*h->mb.i_mb_x - 4; - h->mb.mv_max_fpel[0] = 8*( h->mb.i_mb_width - h->mb.i_mb_x - 1 ) + 4; - h->mb.mv_min_spel[0] = 4*( h->mb.mv_min_fpel[0] - 8 ); - h->mb.mv_max_spel[0] = 4*( h->mb.mv_max_fpel[0] + 8 ); + h->mb.mv_limit_fpel[0][0] = -8*h->mb.i_mb_x - 4; + h->mb.mv_limit_fpel[1][0] = 8*( h->mb.i_mb_width - h->mb.i_mb_x - 1 ) + 4; + h->mb.mv_min_spel[0] = 4*( h->mb.mv_limit_fpel[0][0] - 8 ); + h->mb.mv_max_spel[0] = 4*( h->mb.mv_limit_fpel[1][0] + 8 ); if( h->mb.i_mb_x >= h->mb.i_mb_width - 2 ) { - h->mb.mv_min_fpel[1] = -8*h->mb.i_mb_y - 4; - h->mb.mv_max_fpel[1] = 8*( h->mb.i_mb_height - h->mb.i_mb_y - 1 ) + 4; - h->mb.mv_min_spel[1] = 4*( h->mb.mv_min_fpel[1] - 8 ); - h->mb.mv_max_spel[1] = 4*( h->mb.mv_max_fpel[1] + 8 ); + h->mb.mv_limit_fpel[0][1] = -8*h->mb.i_mb_y - 4; + h->mb.mv_limit_fpel[1][1] = 8*( h->mb.i_mb_height - h->mb.i_mb_y - 1 ) + 4; + h->mb.mv_min_spel[1] = 4*( h->mb.mv_limit_fpel[0][1] - 8 ); + h->mb.mv_max_spel[1] = 4*( h->mb.mv_limit_fpel[1][1] + 8 ); } #define LOAD_HPELS_LUMA(dst, src) \ -- 2.40.0
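
Notes on the tricks above, as small standalone sketches. All three are plain-C
restatements that compile on their own outside the x264 tree; any concrete
values in them are invented for illustration, not taken from the patch.

1) The repacked MV limits. Replacing mv_min_fpel[2]/mv_max_fpel[2] with one
aligned int16_t[2][2] is what lets the asm fetch all four limits in a single
movq; the sketch below spells out the layout and what the shuffles do with it.

#include <stdint.h>
#include <stdalign.h>

/* Field order mirrors the comment in common.h: min_x, min_y, max_x, max_y.
 * ALIGNED_8 is x264's alignment macro; C11 alignas stands in for it here. */
typedef struct
{
    alignas(8) int16_t mv_limit_fpel[2][2]; /* [0]={min_x,min_y}, [1]={max_x,max_y} */
} mb_limits;

/* After "movq (%2), %%mm5":
 *   mm5 = { min_x, min_y, max_x, max_y }
 *   "pshufw $0xEE, %%mm5, %%mm6"  ->  mm6 = { max_x, max_y, max_x, max_y }
 *   "punpckldq %%mm5, %%mm5"      ->  mm5 = { min_x, min_y, min_x, min_y }
 * so a single pmaxsw/pminsw pair clamps two packed MVs per loop iteration,
 * and the single-MV tail at label 2 can use the un-duplicated low half as-is.
 * predictor_clip additionally does "psllw $2, %%mm5" up front, scaling the
 * fullpel limits to qpel, since its candidates keep subpel precision. */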
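
2) The branchless winner tracking in me.c. Both the subme>=3 and subme<3
paths now pick the best predictor by packing (cost << 4) + index and letting
COPY1_IF_LT (x264's keep-the-smaller macro, typically compiled to a cmov)
track the minimum; this works as long as the candidate count fits in the low
4 bits. A compilable sketch with made-up costs:

#include <stdio.h>

#define COPY1_IF_LT(x,y)\
    if( (y) < (x) )\
        (x) = (y);

int main( void )
{
    /* Pretend these came from fpelcmp + BITS_MVD for three clipped
     * candidates, and that 900 is the cost of the MVP itself. */
    int costs[3] = { 850, 870, 860 };

    /* Seed with the MVP tagged as index 0, exactly like "bcost <<= 4":
     * the cost lives in the high bits and the index rides in the low 4
     * bits, so comparing packed values compares costs first, ties keep the
     * earliest candidate, and no branch depends on who is currently
     * winning. */
    int best = 900 << 4;
    for( int i = 0; i < 3; i++ )
        COPY1_IF_LT( best, (costs[i] << 4) + (i + 1) );

    /* Index 0 means the MVP won. In me.c the index also selects the winning
     * MV from mvc_temp[], which is why pmv is stuffed into mvc_temp[1]
     * before the loop. */
    printf( "winner: index %d, cost %d\n", best & 15, best >> 4 );
    return 0;
}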
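
3) The candidate elimination itself. The code below is the predictor_clip
logic from common.h with x264's helpers expanded so it builds standalone; it
doubles as a reference model for checking the MMX version. The limits and
candidates in main() are invented, and the little-endian assumption matches
what the pack16to32_mask comparisons already rely on.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static int clip3( int v, int lo, int hi )
{
    return v < lo ? lo : v > hi ? hi : v;
}

static int predictor_clip( int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc,
                           int16_t mv_limit[2][2], uint32_t pmv )
{
    int cnt = 0;
    int qpel_limit[4] = { mv_limit[0][0] << 2, mv_limit[0][1] << 2,
                          mv_limit[1][0] << 2, mv_limit[1][1] << 2 };
    for( int i = 0; i < i_mvc; i++ )
    {
        uint32_t mv;
        memcpy( &mv, mvc[i], 4 );  /* stands in for M32() */
        if( !mv || mv == pmv )
            continue;              /* drop (0,0) and pmv duplicates */
        dst[cnt][0] = clip3( mvc[i][0], qpel_limit[0], qpel_limit[2] );
        dst[cnt][1] = clip3( mvc[i][1], qpel_limit[1], qpel_limit[3] );
        cnt++;
    }
    return cnt;
}

int main( void )
{
    int16_t mv_limit[2][2] = { { -16, -16 }, { 16, 16 } }; /* fullpel min/max */
    int16_t mvc[4][2] = { { 0, 0 }, { 3, 1 }, { 200, -200 }, { 3, 1 } };
    int16_t dst[4][2];
    uint32_t pmv = (3 & 0xFFFF) | ((uint32_t)1 << 16); /* pack16to32_mask(3,1) */

    int n = predictor_clip( dst, mvc, 4, mv_limit, pmv );
    /* (0,0) and both pmv copies are dropped; (200,-200) is clamped to the
     * qpel window, so exactly one candidate, (64,-64), survives. */
    printf( "%d candidate(s): (%d,%d)\n", n, dst[0][0], dst[0][1] );
    return 0;
}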