From: Fiona Glaser <fiona@x264.com>
Date: Sun, 26 Jul 2009 05:31:06 +0000 (-0700)
Subject: Fix a nondeterminism with threads and subme>7
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=fa3b8139a19d578c12c87e20a3215b41462866b4;p=libx264

Fix a nondeterminism with threads and subme>7
Also add a few more motion-vector range checks to eliminate the need for i_spel_border.
---

diff --git a/encoder/analyse.c b/encoder/analyse.c
index dc75fb16..bdc005ba 100644
--- a/encoder/analyse.c
+++ b/encoder/analyse.c
@@ -309,8 +309,7 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
         int i_fmv_range = 4 * h->param.analyse.i_mv_range;
         // limit motion search to a slightly smaller range than the theoretical limit,
         // since the search may go a few iterations past its given range
-        int i_fpel_border = 5; // umh unconditional radius
-        int i_spel_border = 8; // 1.5 for subpel_satd, 1.5 for subpel_rd, 2 for bime, round up
+        int i_fpel_border = 6; // umh: 1 for diamond, 2 for octagon, 2 for hpel
 
         /* Calculate max allowed MV range */
 #define CLIP_FMV(mv) x264_clip3( mv, -i_fmv_range, i_fmv_range-1 )
@@ -348,7 +347,7 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
 
             h->mb.mv_min[1] = 4*( -16*mb_y - 24 );
             h->mb.mv_max[1] = 4*( 16*( mb_height - mb_y - 1 ) + 24 );
-            h->mb.mv_min_spel[1] = x264_clip3( h->mb.mv_min[1], X264_MAX(4*(-512+i_spel_border), -i_fmv_range), i_fmv_range );
+            h->mb.mv_min_spel[1] = x264_clip3( h->mb.mv_min[1], -i_fmv_range, i_fmv_range );
             h->mb.mv_max_spel[1] = CLIP_FMV( h->mb.mv_max[1] );
             h->mb.mv_max_spel[1] = X264_MIN( h->mb.mv_max_spel[1], thread_mvy_range*4 );
             h->mb.mv_min_fpel[1] = (h->mb.mv_min_spel[1]>>2) + i_fpel_border;
diff --git a/encoder/me.c b/encoder/me.c
index 2e520441..eb6a3a34 100644
--- a/encoder/me.c
+++ b/encoder/me.c
@@ -535,7 +535,7 @@ me_hex2:
                     }
                 }
             } while( ++i <= i_me_range/4 );
-            if( bmy <= mv_y_max )
+            if( bmy <= mv_y_max && bmy >= mv_y_min )
                 goto me_hex2;
             break;
         }
@@ -718,8 +718,6 @@ me_hex2:
         int qpel = subpel_iterations[h->mb.i_subpel_refine][3];
         refine_subpel( h, m, hpel, qpel, p_halfpel_thresh, 0 );
     }
-    else if( m->mv[1] > h->mb.mv_max_spel[1] )
-        m->mv[1] = h->mb.mv_max_spel[1];
 }
 #undef COST_MV
 
@@ -790,8 +788,8 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite
     /* try the subpel component of the predicted mv */
     if( hpel_iters && h->mb.i_subpel_refine < 3 )
     {
-        int mx = x264_clip3( m->mvp[0], h->mb.mv_min_spel[0], h->mb.mv_max_spel[0] );
-        int my = x264_clip3( m->mvp[1], h->mb.mv_min_spel[1], h->mb.mv_max_spel[1] );
+        int mx = x264_clip3( m->mvp[0], h->mb.mv_min_spel[0]+2, h->mb.mv_max_spel[0]-2 );
+        int my = x264_clip3( m->mvp[1], h->mb.mv_min_spel[1]+2, h->mb.mv_max_spel[1]-2 );
         if( (mx-bmx)|(my-bmy) )
             COST_MV_SAD( mx, my );
     }
@@ -818,9 +816,6 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite
 
     if( !b_refine_qpel )
     {
-        /* check for mvrange */
-        if( bmy > h->mb.mv_max_spel[1] )
-            bmy = h->mb.mv_max_spel[1];
         bcost = COST_MAX;
         COST_MV_SATD( bmx, bmy, -1 );
     }
@@ -844,6 +839,8 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite
     bdir = -1;
     for( i = qpel_iters; i > 0; i-- )
     {
+        if( bmy <= h->mb.mv_min_spel[1] || bmy >= h->mb.mv_max_spel[1] )
+            break;
         odir = bdir;
         omx = bmx;
         omy = bmy;
@@ -855,14 +852,6 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite
             break;
     }
 
-    /* check for mvrange */
-    if( bmy > h->mb.mv_max_spel[1] )
-    {
-        bmy = h->mb.mv_max_spel[1];
-        bcost = COST_MAX;
-        COST_MV_SATD( bmx, bmy, -1 );
-    }
-
     m->cost = bcost;
     m->mv[0] = bmx;
     m->mv[1] = bmy;
@@ -970,8 +959,8 @@ static void ALWAYS_INLINE x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_m
     /* each byte of visited represents 8 possible m1y positions, so a 4D array isn't needed */
     DECLARE_ALIGNED_16( uint8_t visited[8][8][8] );
 
-    if( bm0y > h->mb.mv_max_spel[1] - 8 ||
-        bm1y > h->mb.mv_max_spel[1] - 8 )
+    if( bm0y < h->mb.mv_min_spel[1] + 8 || bm1y < h->mb.mv_min_spel[1] + 8 ||
+        bm0y > h->mb.mv_max_spel[1] - 8 || bm1y > h->mb.mv_max_spel[1] - 8 )
         return;
 
     h->mc.memzero_aligned( visited, sizeof(visited) );
@@ -1096,6 +1085,10 @@ void x264_me_refine_qpel_rd( x264_t *h, x264_me_t *m, int i_lambda2, int i4, int
         }
     }
 
+    if( bmy < h->mb.mv_min_spel[1] + 3 ||
+        bmy > h->mb.mv_max_spel[1] - 3 )
+        return;
+
     /* subpel hex search, same pattern as ME HEX. */
     dir = -2;
     omx = bmx;
@@ -1109,8 +1102,8 @@ void x264_me_refine_qpel_rd( x264_t *h, x264_me_t *m, int i_lambda2, int i4, int
         for( i = 1; i < 10; i++ )
         {
             const int odir = mod6m1[dir+1];
-            if( bmy > h->mb.mv_max_spel[1] - 2 ||
-                bmy < h->mb.mv_min_spel[1] - 2 )
+            if( bmy < h->mb.mv_min_spel[1] + 3 ||
+                bmy > h->mb.mv_max_spel[1] - 3 )
                 break;
             dir = -2;
             omx = bmx;
@@ -1128,7 +1121,6 @@ void x264_me_refine_qpel_rd( x264_t *h, x264_me_t *m, int i_lambda2, int i4, int
     for( i=0; i<8; i++ ) COST_MV_SATD( omx + square1[i+1][0], omy + square1[i+1][1], satds[i], 1 );
     for( i=0; i<8; i++ ) COST_MV_RD  ( omx + square1[i+1][0], omy + square1[i+1][1], satds[i], 0,0 );
 
-    bmy = x264_clip3( bmy, h->mb.mv_min_spel[1],  h->mb.mv_max_spel[1] );
     m->cost = bcost;
     m->mv[0] = bmx;
     m->mv[1] = bmy;