From: Fiona Glaser Date: Sun, 26 Jul 2009 05:31:06 +0000 (-0700) Subject: Fix a nondeterminism with threads and subme>7 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=fa3b8139a19d578c12c87e20a3215b41462866b4;p=libx264 Fix a nondeterminism with threads and subme>7 Also add a few more checks to eliminate the need for spel_border. --- diff --git a/encoder/analyse.c b/encoder/analyse.c index dc75fb16..bdc005ba 100644 --- a/encoder/analyse.c +++ b/encoder/analyse.c @@ -309,8 +309,7 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp ) int i_fmv_range = 4 * h->param.analyse.i_mv_range; // limit motion search to a slightly smaller range than the theoretical limit, // since the search may go a few iterations past its given range - int i_fpel_border = 5; // umh unconditional radius - int i_spel_border = 8; // 1.5 for subpel_satd, 1.5 for subpel_rd, 2 for bime, round up + int i_fpel_border = 6; // umh: 1 for diamond, 2 for octagon, 2 for hpel /* Calculate max allowed MV range */ #define CLIP_FMV(mv) x264_clip3( mv, -i_fmv_range, i_fmv_range-1 ) @@ -348,7 +347,7 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp ) h->mb.mv_min[1] = 4*( -16*mb_y - 24 ); h->mb.mv_max[1] = 4*( 16*( mb_height - mb_y - 1 ) + 24 ); - h->mb.mv_min_spel[1] = x264_clip3( h->mb.mv_min[1], X264_MAX(4*(-512+i_spel_border), -i_fmv_range), i_fmv_range ); + h->mb.mv_min_spel[1] = x264_clip3( h->mb.mv_min[1], -i_fmv_range, i_fmv_range ); h->mb.mv_max_spel[1] = CLIP_FMV( h->mb.mv_max[1] ); h->mb.mv_max_spel[1] = X264_MIN( h->mb.mv_max_spel[1], thread_mvy_range*4 ); h->mb.mv_min_fpel[1] = (h->mb.mv_min_spel[1]>>2) + i_fpel_border; diff --git a/encoder/me.c b/encoder/me.c index 2e520441..eb6a3a34 100644 --- a/encoder/me.c +++ b/encoder/me.c @@ -535,7 +535,7 @@ me_hex2: } } } while( ++i <= i_me_range/4 ); - if( bmy <= mv_y_max ) + if( bmy <= mv_y_max && bmy >= mv_y_min ) goto me_hex2; break; } @@ -718,8 +718,6 @@ me_hex2: int qpel = subpel_iterations[h->mb.i_subpel_refine][3]; refine_subpel( h, m, hpel, qpel, p_halfpel_thresh, 0 ); } - else if( m->mv[1] > h->mb.mv_max_spel[1] ) - m->mv[1] = h->mb.mv_max_spel[1]; } #undef COST_MV @@ -790,8 +788,8 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite /* try the subpel component of the predicted mv */ if( hpel_iters && h->mb.i_subpel_refine < 3 ) { - int mx = x264_clip3( m->mvp[0], h->mb.mv_min_spel[0], h->mb.mv_max_spel[0] ); - int my = x264_clip3( m->mvp[1], h->mb.mv_min_spel[1], h->mb.mv_max_spel[1] ); + int mx = x264_clip3( m->mvp[0], h->mb.mv_min_spel[0]+2, h->mb.mv_max_spel[0]-2 ); + int my = x264_clip3( m->mvp[1], h->mb.mv_min_spel[1]+2, h->mb.mv_max_spel[1]-2 ); if( (mx-bmx)|(my-bmy) ) COST_MV_SAD( mx, my ); } @@ -818,9 +816,6 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite if( !b_refine_qpel ) { - /* check for mvrange */ - if( bmy > h->mb.mv_max_spel[1] ) - bmy = h->mb.mv_max_spel[1]; bcost = COST_MAX; COST_MV_SATD( bmx, bmy, -1 ); } @@ -844,6 +839,8 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite bdir = -1; for( i = qpel_iters; i > 0; i-- ) { + if( bmy <= h->mb.mv_min_spel[1] || bmy >= h->mb.mv_max_spel[1] ) + break; odir = bdir; omx = bmx; omy = bmy; @@ -855,14 +852,6 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite break; } - /* check for mvrange */ - if( bmy > h->mb.mv_max_spel[1] ) - { - bmy = h->mb.mv_max_spel[1]; - bcost = COST_MAX; - COST_MV_SATD( bmx, bmy, -1 ); - } - m->cost = bcost; m->mv[0] = bmx; m->mv[1] = bmy; @@ -970,8 +959,8 @@ static void ALWAYS_INLINE x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_m /* each byte of visited represents 8 possible m1y positions, so a 4D array isn't needed */ DECLARE_ALIGNED_16( uint8_t visited[8][8][8] ); - if( bm0y > h->mb.mv_max_spel[1] - 8 || - bm1y > h->mb.mv_max_spel[1] - 8 ) + if( bm0y < h->mb.mv_min_spel[1] + 8 || bm1y < h->mb.mv_min_spel[1] + 8 || + bm0y > h->mb.mv_max_spel[1] - 8 || bm1y > h->mb.mv_max_spel[1] - 8 ) return; h->mc.memzero_aligned( visited, sizeof(visited) ); @@ -1096,6 +1085,10 @@ void x264_me_refine_qpel_rd( x264_t *h, x264_me_t *m, int i_lambda2, int i4, int } } + if( bmy < h->mb.mv_min_spel[1] + 3 || + bmy > h->mb.mv_max_spel[1] - 3 ) + return; + /* subpel hex search, same pattern as ME HEX. */ dir = -2; omx = bmx; @@ -1109,8 +1102,8 @@ void x264_me_refine_qpel_rd( x264_t *h, x264_me_t *m, int i_lambda2, int i4, int for( i = 1; i < 10; i++ ) { const int odir = mod6m1[dir+1]; - if( bmy > h->mb.mv_max_spel[1] - 2 || - bmy < h->mb.mv_min_spel[1] - 2 ) + if( bmy < h->mb.mv_min_spel[1] + 3 || + bmy > h->mb.mv_max_spel[1] - 3 ) break; dir = -2; omx = bmx; @@ -1128,7 +1121,6 @@ void x264_me_refine_qpel_rd( x264_t *h, x264_me_t *m, int i_lambda2, int i4, int for( i=0; i<8; i++ ) COST_MV_SATD( omx + square1[i+1][0], omy + square1[i+1][1], satds[i], 1 ); for( i=0; i<8; i++ ) COST_MV_RD ( omx + square1[i+1][0], omy + square1[i+1][1], satds[i], 0,0 ); - bmy = x264_clip3( bmy, h->mb.mv_min_spel[1], h->mb.mv_max_spel[1] ); m->cost = bcost; m->mv[0] = bmx; m->mv[1] = bmy;