From 6371c3a527a337c7521912990c89d0474288e105 Mon Sep 17 00:00:00 2001 From: Fiona Glaser Date: Fri, 18 Jan 2013 22:55:46 -0800 Subject: [PATCH] x86: optimize and clean up predictor checking Branchlessly handle elimination of candidates in MMX roundclip asm. Add a new asm function, similar to roundclip, except without the round part. Optimize and organize the C code, and make both subme>=3 and subme<3 consistent. Add lots of explanatory comments and try to make things a little more understandable. ~5-10% faster with subme>=3, ~15-20% faster with subme<3. --- common/common.h | 47 +++++++++---- common/x86/util.h | 152 +++++++++++++++++++++++++++++++-------- encoder/analyse.c | 12 ++-- encoder/me.c | 168 ++++++++++++++++++++++++++++---------------- encoder/slicetype.c | 16 ++--- 5 files changed, 275 insertions(+), 120 deletions(-) diff --git a/common/common.h b/common/common.h index 64a7f025..39ad5cb1 100644 --- a/common/common.h +++ b/common/common.h @@ -291,17 +291,6 @@ static ALWAYS_INLINE uint16_t x264_cabac_mvd_sum( uint8_t *mvdleft, uint8_t *mvd return amvd0 + (amvd1<<8); } -static void ALWAYS_INLINE x264_predictor_roundclip( int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc, int mv_x_min, int mv_x_max, int mv_y_min, int mv_y_max ) -{ - for( int i = 0; i < i_mvc; i++ ) - { - int mx = (mvc[i][0] + 2) >> 2; - int my = (mvc[i][1] + 2) >> 2; - dst[i][0] = x264_clip3( mx, mv_x_min, mv_x_max ); - dst[i][1] = x264_clip3( my, mv_y_min, mv_y_max ); - } -} - extern const uint8_t x264_exp2_lut[64]; extern const float x264_log2_lut[128]; extern const float x264_log2_lz_lut[32]; @@ -671,8 +660,7 @@ struct x264_t int mv_miny_spel_row[3]; int mv_maxy_spel_row[3]; /* Fullpel MV range for motion search */ - int mv_min_fpel[2]; - int mv_max_fpel[2]; + ALIGNED_8( int16_t mv_limit_fpel[2][2] ); /* min_x, min_y, max_x, max_y */ int mv_miny_fpel_row[3]; int mv_maxy_fpel_row[3]; @@ -952,6 +940,39 @@ struct x264_t // included at the end because it needs x264_t #include "macroblock.h" +static int ALWAYS_INLINE x264_predictor_roundclip( int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc, int16_t mv_limit[2][2], uint32_t pmv ) +{ + int cnt = 0; + for( int i = 0; i < i_mvc; i++ ) + { + int mx = (mvc[i][0] + 2) >> 2; + int my = (mvc[i][1] + 2) >> 2; + uint32_t mv = pack16to32_mask(mx, my); + if( !mv || mv == pmv ) continue; + dst[cnt][0] = x264_clip3( mx, mv_limit[0][0], mv_limit[1][0] ); + dst[cnt][1] = x264_clip3( my, mv_limit[0][1], mv_limit[1][1] ); + cnt++; + } + return cnt; +} + +static int ALWAYS_INLINE x264_predictor_clip( int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc, int16_t mv_limit[2][2], uint32_t pmv ) +{ + int cnt = 0; + int qpel_limit[4] = {mv_limit[0][0] << 2, mv_limit[0][1] << 2, mv_limit[1][0] << 2, mv_limit[1][1] << 2}; + for( int i = 0; i < i_mvc; i++ ) + { + uint32_t mv = M32( mvc[i] ); + int mx = mvc[i][0]; + int my = mvc[i][1]; + if( !mv || mv == pmv ) continue; + dst[cnt][0] = x264_clip3( mx, qpel_limit[0], qpel_limit[2] ); + dst[cnt][1] = x264_clip3( my, qpel_limit[1], qpel_limit[3] ); + cnt++; + } + return cnt; +} + #if ARCH_X86 || ARCH_X86_64 #include "x86/util.h" #endif diff --git a/common/x86/util.h b/common/x86/util.h index fb9912b1..972f0de9 100644 --- a/common/x86/util.h +++ b/common/x86/util.h @@ -121,42 +121,132 @@ static ALWAYS_INLINE uint16_t x264_cabac_mvd_sum_mmx2(uint8_t *mvdleft, uint8_t return amvd; } +#define x264_predictor_clip x264_predictor_clip_mmx2 +static int ALWAYS_INLINE x264_predictor_clip_mmx2( int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc, int16_t mv_limit[2][2], 
uint32_t pmv ) +{ + static const uint32_t pd_32 = 0x20; + intptr_t tmp = (intptr_t)mv_limit, mvc_max = i_mvc, i = 0; + + asm( + "movq (%2), %%mm5 \n" + "movd %6, %%mm3 \n" + "psllw $2, %%mm5 \n" // Convert to subpel + "pshufw $0xEE, %%mm5, %%mm6 \n" + "dec %k3 \n" + "jz 2f \n" // if( i_mvc == 1 ) {do the last iteration} + "punpckldq %%mm3, %%mm3 \n" + "punpckldq %%mm5, %%mm5 \n" + "movd %7, %%mm4 \n" + "lea (%0,%3,4), %3 \n" + "1: \n" + "movq (%0), %%mm0 \n" + "add $8, %0 \n" + "movq %%mm3, %%mm1 \n" + "pxor %%mm2, %%mm2 \n" + "pcmpeqd %%mm0, %%mm1 \n" // mv == pmv + "pcmpeqd %%mm0, %%mm2 \n" // mv == 0 + "por %%mm1, %%mm2 \n" // (mv == pmv || mv == 0) * -1 + "pmovmskb %%mm2, %k2 \n" // (mv == pmv || mv == 0) * 0xf + "pmaxsw %%mm5, %%mm0 \n" + "pminsw %%mm6, %%mm0 \n" + "pand %%mm4, %%mm2 \n" // (mv0 == pmv || mv0 == 0) * 32 + "psrlq %%mm2, %%mm0 \n" // drop mv0 if it's skipped + "movq %%mm0, (%5,%4,4) \n" + "and $24, %k2 \n" + "add $2, %4 \n" + "add $8, %k2 \n" + "shr $4, %k2 \n" // (4-val)>>1 + "sub %2, %4 \n" // +1 for each valid motion vector + "cmp %3, %0 \n" + "jl 1b \n" + "jg 3f \n" // if( i == i_mvc - 1 ) {do the last iteration} + + /* Do the last iteration */ + "2: \n" + "movd (%0), %%mm0 \n" + "pxor %%mm2, %%mm2 \n" + "pcmpeqd %%mm0, %%mm3 \n" + "pcmpeqd %%mm0, %%mm2 \n" + "por %%mm3, %%mm2 \n" + "pmovmskb %%mm2, %k2 \n" + "pmaxsw %%mm5, %%mm0 \n" + "pminsw %%mm6, %%mm0 \n" + "movd %%mm0, (%5,%4,4) \n" + "inc %4 \n" + "and $1, %k2 \n" + "sub %2, %4 \n" // output += !(mv == pmv || mv == 0) + "3: \n" + :"+r"(mvc), "=m"(M64( dst )), "+r"(tmp), "+r"(mvc_max), "+r"(i) + :"r"(dst), "g"(pmv), "m"(pd_32), "m"(M64( mvc )) + ); + return i; +} + +/* Same as the above, except we do (mv + 2) >> 2 on the input. */ #define x264_predictor_roundclip x264_predictor_roundclip_mmx2 -static void ALWAYS_INLINE x264_predictor_roundclip_mmx2( int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc, int mv_x_min, int mv_x_max, int mv_y_min, int mv_y_max ) +static int ALWAYS_INLINE x264_predictor_roundclip_mmx2( int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc, int16_t mv_limit[2][2], uint32_t pmv ) { - uint32_t mv_min = pack16to32_mask( mv_x_min, mv_y_min ); - uint32_t mv_max = pack16to32_mask( mv_x_max, mv_y_max ); static const uint64_t pw_2 = 0x0002000200020002ULL; - intptr_t i = i_mvc; + static const uint32_t pd_32 = 0x20; + intptr_t tmp = (intptr_t)mv_limit, mvc_max = i_mvc, i = 0; + asm( - "movd %2, %%mm5 \n" - "movd %3, %%mm6 \n" - "movq %4, %%mm7 \n" - "punpckldq %%mm5, %%mm5 \n" - "punpckldq %%mm6, %%mm6 \n" - "test $1, %0 \n" - "jz 1f \n" - "movd -4(%6,%0,4), %%mm0 \n" - "paddw %%mm7, %%mm0 \n" - "psraw $2, %%mm0 \n" - "pmaxsw %%mm5, %%mm0 \n" - "pminsw %%mm6, %%mm0 \n" - "movd %%mm0, -4(%5,%0,4) \n" - "dec %0 \n" - "jz 2f \n" - "1: \n" - "movq -8(%6,%0,4), %%mm0 \n" - "paddw %%mm7, %%mm0 \n" - "psraw $2, %%mm0 \n" - "pmaxsw %%mm5, %%mm0 \n" - "pminsw %%mm6, %%mm0 \n" - "movq %%mm0, -8(%5,%0,4) \n" - "sub $2, %0 \n" - "jnz 1b \n" - "2: \n" - :"+r"(i), "=m"(M64( dst )) - :"g"(mv_min), "g"(mv_max), "m"(pw_2), "r"(dst), "r"(mvc), "m"(M64( mvc )) + "movq (%2), %%mm5 \n" + "movq %6, %%mm7 \n" + "movd %7, %%mm3 \n" + "pshufw $0xEE, %%mm5, %%mm6 \n" + "dec %k3 \n" + "jz 2f \n" + "punpckldq %%mm3, %%mm3 \n" + "punpckldq %%mm5, %%mm5 \n" + "movd %8, %%mm4 \n" + "lea (%0,%3,4), %3 \n" + "1: \n" + "movq (%0), %%mm0 \n" + "add $8, %0 \n" + "paddw %%mm7, %%mm0 \n" + "psraw $2, %%mm0 \n" + "movq %%mm3, %%mm1 \n" + "pxor %%mm2, %%mm2 \n" + "pcmpeqd %%mm0, %%mm1 \n" + "pcmpeqd %%mm0, %%mm2 \n" + "por %%mm1, %%mm2 \n" + 
"pmovmskb %%mm2, %k2 \n" + "pmaxsw %%mm5, %%mm0 \n" + "pminsw %%mm6, %%mm0 \n" + "pand %%mm4, %%mm2 \n" + "psrlq %%mm2, %%mm0 \n" + "movq %%mm0, (%5,%4,4) \n" + "and $24, %k2 \n" + "add $2, %4 \n" + "add $8, %k2 \n" + "shr $4, %k2 \n" + "sub %2, %4 \n" + "cmp %3, %0 \n" + "jl 1b \n" + "jg 3f \n" + + /* Do the last iteration */ + "2: \n" + "movd (%0), %%mm0 \n" + "paddw %%mm7, %%mm0 \n" + "psraw $2, %%mm0 \n" + "pxor %%mm2, %%mm2 \n" + "pcmpeqd %%mm0, %%mm3 \n" + "pcmpeqd %%mm0, %%mm2 \n" + "por %%mm3, %%mm2 \n" + "pmovmskb %%mm2, %k2 \n" + "pmaxsw %%mm5, %%mm0 \n" + "pminsw %%mm6, %%mm0 \n" + "movd %%mm0, (%5,%4,4) \n" + "inc %4 \n" + "and $1, %k2 \n" + "sub %2, %4 \n" + "3: \n" + :"+r"(mvc), "=m"(M64( dst )), "+r"(tmp), "+r"(mvc_max), "+r"(i) + :"r"(dst), "m"(pw_2), "g"(pmv), "m"(pd_32), "m"(M64( mvc )) ); + return i; } #endif diff --git a/encoder/analyse.c b/encoder/analyse.c index 39bb6b19..ec942b3c 100644 --- a/encoder/analyse.c +++ b/encoder/analyse.c @@ -467,8 +467,8 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int qp ) if( max_mv > 0 && h->mb.i_mb_x < h->fdec->i_pir_start_col ) h->mb.mv_max_spel[0] = X264_MIN( h->mb.mv_max_spel[0], max_mv ); } - h->mb.mv_min_fpel[0] = (h->mb.mv_min_spel[0]>>2) + i_fpel_border; - h->mb.mv_max_fpel[0] = (h->mb.mv_max_spel[0]>>2) - i_fpel_border; + h->mb.mv_limit_fpel[0][0] = (h->mb.mv_min_spel[0]>>2) + i_fpel_border; + h->mb.mv_limit_fpel[1][0] = (h->mb.mv_max_spel[0]>>2) - i_fpel_border; if( h->mb.i_mb_x == 0 && !(h->mb.i_mb_y & PARAM_INTERLACED) ) { int mb_y = h->mb.i_mb_y >> SLICE_MBAFF; @@ -516,8 +516,8 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int qp ) h->mb.mv_min_spel[1] = x264_clip3( h->mb.mv_min[1], -i_fmv_range, i_fmv_range ); h->mb.mv_max_spel[1] = CLIP_FMV( h->mb.mv_max[1] ); h->mb.mv_max_spel[1] = X264_MIN( h->mb.mv_max_spel[1], thread_mvy_range*4 ); - h->mb.mv_min_fpel[1] = (h->mb.mv_min_spel[1]>>2) + i_fpel_border; - h->mb.mv_max_fpel[1] = (h->mb.mv_max_spel[1]>>2) - i_fpel_border; + h->mb.mv_limit_fpel[0][1] = (h->mb.mv_min_spel[1]>>2) + i_fpel_border; + h->mb.mv_limit_fpel[1][1] = (h->mb.mv_max_spel[1]>>2) - i_fpel_border; } } if( PARAM_INTERLACED ) @@ -527,8 +527,8 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int qp ) h->mb.mv_max[1] = h->mb.mv_maxy_row[i]; h->mb.mv_min_spel[1] = h->mb.mv_miny_spel_row[i]; h->mb.mv_max_spel[1] = h->mb.mv_maxy_spel_row[i]; - h->mb.mv_min_fpel[1] = h->mb.mv_miny_fpel_row[i]; - h->mb.mv_max_fpel[1] = h->mb.mv_maxy_fpel_row[i]; + h->mb.mv_limit_fpel[0][1] = h->mb.mv_miny_fpel_row[i]; + h->mb.mv_limit_fpel[1][1] = h->mb.mv_maxy_fpel_row[i]; } #undef CLIP_FMV diff --git a/encoder/me.c b/encoder/me.c index 49f143c3..65c95a6d 100644 --- a/encoder/me.c +++ b/encoder/me.c @@ -61,21 +61,22 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite (p_cost_mvx[(mx)<<2] + p_cost_mvy[(my)<<2]) #define COST_MV( mx, my )\ +do\ {\ int cost = h->pixf.fpelcmp[i_pixel]( p_fenc, FENC_STRIDE,\ &p_fref_w[(my)*stride+(mx)], stride )\ + BITS_MVD(mx,my);\ COPY3_IF_LT( bcost, cost, bmx, mx, bmy, my );\ -} +} while(0) -#define COST_MV_HPEL( mx, my ) \ -{ \ - intptr_t stride2 = 16; \ - pixel *src = h->mc.get_ref( pix, &stride2, m->p_fref, stride, mx, my, bw, bh, &m->weight[0] ); \ - int cost = h->pixf.fpelcmp[i_pixel]( p_fenc, FENC_STRIDE, src, stride2 ) \ - + p_cost_mvx[ mx ] + p_cost_mvy[ my ]; \ - COPY3_IF_LT( bpred_cost, cost, bpred_mx, mx, bpred_my, my ); \ -} +#define COST_MV_HPEL( mx, my, cost )\ +do\ +{\ + intptr_t 
stride2 = 16;\ + pixel *src = h->mc.get_ref( pix, &stride2, m->p_fref, stride, mx, my, bw, bh, &m->weight[0] );\ + cost = h->pixf.fpelcmp[i_pixel]( p_fenc, FENC_STRIDE, src, stride2 )\ + + p_cost_mvx[ mx ] + p_cost_mvy[ my ];\ +} while(0) #define COST_MV_X3_DIR( m0x, m0y, m1x, m1y, m2x, m2y, costs )\ {\ @@ -174,6 +175,9 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite }\ } +#define FPEL(mv) (((mv)+2)>>2) /* Convert subpel MV to fullpel with rounding... */ +#define SPEL(mv) ((mv)<<2) /* ... and the reverse. */ + void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc, int *p_halfpel_thresh ) { const int bw = x264_pixel_size[m->i_pixel].w; @@ -181,95 +185,135 @@ void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc, const int i_pixel = m->i_pixel; const int stride = m->i_stride[0]; int i_me_range = h->param.analyse.i_me_range; - int bmx, bmy, bcost; + int bmx, bmy, bcost = COST_MAX; int bpred_mx = 0, bpred_my = 0, bpred_cost = COST_MAX; int omx, omy, pmx, pmy; pixel *p_fenc = m->p_fenc[0]; pixel *p_fref_w = m->p_fref_w; ALIGNED_ARRAY_16( pixel, pix,[16*16] ); + ALIGNED_ARRAY_8( int16_t, mvc_temp,[16],[2] ); int costs[16]; - int mv_x_min = h->mb.mv_min_fpel[0]; - int mv_y_min = h->mb.mv_min_fpel[1]; - int mv_x_max = h->mb.mv_max_fpel[0]; - int mv_y_max = h->mb.mv_max_fpel[1]; - int mv_x_min_qpel = mv_x_min << 2; - int mv_y_min_qpel = mv_y_min << 2; - int mv_x_max_qpel = mv_x_max << 2; - int mv_y_max_qpel = mv_y_max << 2; + int mv_x_min = h->mb.mv_limit_fpel[0][0]; + int mv_y_min = h->mb.mv_limit_fpel[0][1]; + int mv_x_max = h->mb.mv_limit_fpel[1][0]; + int mv_y_max = h->mb.mv_limit_fpel[1][1]; /* Special version of pack to allow shortcuts in CHECK_MVRANGE */ #define pack16to32_mask2(mx,my) ((mx<<16)|(my&0x7FFF)) uint32_t mv_min = pack16to32_mask2( -mv_x_min, -mv_y_min ); uint32_t mv_max = pack16to32_mask2( mv_x_max, mv_y_max )|0x8000; + uint32_t pmv; #define CHECK_MVRANGE(mx,my) (!(((pack16to32_mask2(mx,my) + mv_min) | (mv_max - pack16to32_mask2(mx,my))) & 0x80004000)) const uint16_t *p_cost_mvx = m->p_cost_mv - m->mvp[0]; const uint16_t *p_cost_mvy = m->p_cost_mv - m->mvp[1]; - uint32_t pmv; - bmx = x264_clip3( m->mvp[0], mv_x_min_qpel, mv_x_max_qpel ); - bmy = x264_clip3( m->mvp[1], mv_y_min_qpel, mv_y_max_qpel ); - pmx = ( bmx + 2 ) >> 2; - pmy = ( bmy + 2 ) >> 2; - bcost = COST_MAX; - - /* try extra predictors if provided */ + /* Try extra predictors if provided. If subme >= 3, check subpel predictors, + * otherwise round them to fullpel. */ if( h->mb.i_subpel_refine >= 3 ) { - pmv = pack16to32_mask(bmx,bmy); - COST_MV_HPEL( bmx, bmy ); - for( int i = 0; i < i_mvc; i++ ) + /* Calculate and check the MVP first */ + bpred_mx = x264_clip3( m->mvp[0], SPEL(mv_x_min), SPEL(mv_x_max) ); + bpred_my = x264_clip3( m->mvp[1], SPEL(mv_y_min), SPEL(mv_y_max) ); + pmv = pack16to32_mask( bpred_mx, bpred_my ); + pmx = FPEL( bpred_mx ); + pmy = FPEL( bpred_my ); + + COST_MV_HPEL( bpred_mx, bpred_my, bpred_cost ); + int pmv_cost = bpred_cost; + + if( i_mvc > 0 ) { - if( M32( mvc[i] ) && (pmv != M32( mvc[i] )) ) + /* Clip MV candidates and eliminate those equal to zero and pmv. 
*/ + int valid_mvcs = x264_predictor_clip( mvc_temp+2, mvc, i_mvc, h->mb.mv_limit_fpel, pmv ); + if( valid_mvcs > 0 ) { - int mx = x264_clip3( mvc[i][0], mv_x_min_qpel, mv_x_max_qpel ); - int my = x264_clip3( mvc[i][1], mv_y_min_qpel, mv_y_max_qpel ); - COST_MV_HPEL( mx, my ); + int i = 1, cost; + /* We stuff pmv here to branchlessly pick between pmv and the various + * MV candidates. [0] gets skipped in order to maintain alignment for + * x264_predictor_clip. */ + M32( mvc_temp[1] ) = pmv; + bpred_cost <<= 4; + do + { + int mx = mvc_temp[i+1][0]; + int my = mvc_temp[i+1][1]; + COST_MV_HPEL( mx, my, cost ); + COPY1_IF_LT( bpred_cost, (cost << 4) + i ); + } while( ++i <= valid_mvcs ); + bpred_mx = mvc_temp[(bpred_cost&15)+1][0]; + bpred_my = mvc_temp[(bpred_cost&15)+1][1]; + bpred_cost >>= 4; } } - bmx = ( bpred_mx + 2 ) >> 2; - bmy = ( bpred_my + 2 ) >> 2; - COST_MV( bmx, bmy ); + + /* Round the best predictor back to fullpel and get the cost, since this is where + * we'll be starting the fullpel motion search. */ + bmx = FPEL( bpred_mx ); + bmy = FPEL( bpred_my ); + if( (bpred_mx|bpred_my)&0x3 ) /* Only test if the tested predictor is actually subpel... */ + COST_MV( bmx, bmy ); + else /* Otherwise just copy the cost (we already know it) */ + bcost = bpred_cost; + + /* Test the zero vector if it hasn't been tested yet. */ + if( pmv ) + { + if( bmx|bmy ) COST_MV( 0, 0 ); + } + /* If a subpel mv candidate was better than the zero vector, the previous + * fullpel check won't have gotten it even if the pmv was zero. So handle + * that possibility here. */ + else + { + COPY3_IF_LT( bcost, pmv_cost, bmx, 0, bmy, 0 ); + } } else { - /* check the MVP */ - bmx = pmx; - bmy = pmy; + /* Calculate and check the fullpel MVP first */ + bmx = pmx = x264_clip3( FPEL(m->mvp[0]), mv_x_min, mv_x_max ); + bmy = pmy = x264_clip3( FPEL(m->mvp[1]), mv_y_min, mv_y_max ); + pmv = pack16to32_mask( bmx, bmy ); + /* Because we are rounding the predicted motion vector to fullpel, there will be * an extra MV cost in 15 out of 16 cases. However, when the predicted MV is * chosen as the best predictor, it is often the case that the subpel search will - * result in a vector at or next to the predicted motion vector. Therefore, it is - * sensible to omit the cost of the MV from the rounded MVP to avoid unfairly - * biasing against use of the predicted motion vector. */ + * result in a vector at or next to the predicted motion vector. Therefore, we omit + * the cost of the MV from the rounded MVP to avoid unfairly biasing against use of + * the predicted motion vector. + * + * Disclaimer: this is a post-hoc rationalization for why this hack works. */ bcost = h->pixf.fpelcmp[i_pixel]( p_fenc, FENC_STRIDE, &p_fref_w[bmy*stride+bmx], stride ); - pmv = pack16to32_mask( bmx, bmy ); + if( i_mvc > 0 ) { - ALIGNED_ARRAY_8( int16_t, mvc_fpel,[16],[2] ); - x264_predictor_roundclip( mvc_fpel+2, mvc, i_mvc, mv_x_min, mv_x_max, mv_y_min, mv_y_max ); - M32( mvc_fpel[1] ) = pmv; - bcost <<= 4; - for( int i = 1; i <= i_mvc; i++ ) + /* Like in subme>=3, except we also round the candidates to fullpel. 
*/ + int valid_mvcs = x264_predictor_roundclip( mvc_temp+2, mvc, i_mvc, h->mb.mv_limit_fpel, pmv ); + if( valid_mvcs > 0 ) { - if( M32( mvc_fpel[i+1] ) && (pmv != M32( mvc_fpel[i+1] )) ) + int i = 1, cost; + M32( mvc_temp[1] ) = pmv; + bcost <<= 4; + do { - int mx = mvc_fpel[i+1][0]; - int my = mvc_fpel[i+1][1]; - int cost = h->pixf.fpelcmp[i_pixel]( p_fenc, FENC_STRIDE, &p_fref_w[my*stride+mx], stride ) + BITS_MVD( mx, my ); - cost = (cost << 4) + i; - COPY1_IF_LT( bcost, cost ); - } + int mx = mvc_temp[i+1][0]; + int my = mvc_temp[i+1][1]; + cost = h->pixf.fpelcmp[i_pixel]( p_fenc, FENC_STRIDE, &p_fref_w[my*stride+mx], stride ) + BITS_MVD( mx, my ); + COPY1_IF_LT( bcost, (cost << 4) + i ); + } while( ++i <= valid_mvcs ); + bmx = mvc_temp[(bcost&15)+1][0]; + bmy = mvc_temp[(bcost&15)+1][1]; + bcost >>= 4; } - bmx = mvc_fpel[(bcost&15)+1][0]; - bmy = mvc_fpel[(bcost&15)+1][1]; - bcost >>= 4; } - } - COST_MV( 0, 0 ); + /* Same as above, except the condition is simpler. */ + if( pmv ) + COST_MV( 0, 0 ); + } switch( h->mb.i_me_method ) { @@ -733,8 +777,8 @@ void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc, } else { - m->mv[0] = bmx << 2; - m->mv[1] = bmy << 2; + m->mv[0] = SPEL(bmx); + m->mv[1] = SPEL(bmy); m->cost = bcost; } diff --git a/encoder/slicetype.c b/encoder/slicetype.c index e9c7f118..0ecd91e2 100644 --- a/encoder/slicetype.c +++ b/encoder/slicetype.c @@ -472,16 +472,16 @@ static void x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a, goto lowres_intra_mb; // no need for h->mb.mv_min[] - h->mb.mv_min_fpel[0] = -8*h->mb.i_mb_x - 4; - h->mb.mv_max_fpel[0] = 8*( h->mb.i_mb_width - h->mb.i_mb_x - 1 ) + 4; - h->mb.mv_min_spel[0] = 4*( h->mb.mv_min_fpel[0] - 8 ); - h->mb.mv_max_spel[0] = 4*( h->mb.mv_max_fpel[0] + 8 ); + h->mb.mv_limit_fpel[0][0] = -8*h->mb.i_mb_x - 4; + h->mb.mv_limit_fpel[1][0] = 8*( h->mb.i_mb_width - h->mb.i_mb_x - 1 ) + 4; + h->mb.mv_min_spel[0] = 4*( h->mb.mv_limit_fpel[0][0] - 8 ); + h->mb.mv_max_spel[0] = 4*( h->mb.mv_limit_fpel[1][0] + 8 ); if( h->mb.i_mb_x >= h->mb.i_mb_width - 2 ) { - h->mb.mv_min_fpel[1] = -8*h->mb.i_mb_y - 4; - h->mb.mv_max_fpel[1] = 8*( h->mb.i_mb_height - h->mb.i_mb_y - 1 ) + 4; - h->mb.mv_min_spel[1] = 4*( h->mb.mv_min_fpel[1] - 8 ); - h->mb.mv_max_spel[1] = 4*( h->mb.mv_max_fpel[1] + 8 ); + h->mb.mv_limit_fpel[0][1] = -8*h->mb.i_mb_y - 4; + h->mb.mv_limit_fpel[1][1] = 8*( h->mb.i_mb_height - h->mb.i_mb_y - 1 ) + 4; + h->mb.mv_min_spel[1] = 4*( h->mb.mv_limit_fpel[0][1] - 8 ); + h->mb.mv_max_spel[1] = 4*( h->mb.mv_limit_fpel[1][1] + 8 ); } #define LOAD_HPELS_LUMA(dst, src) \ -- 2.40.0
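
Notes on the tricks above, as small standalone sketches. All three are plain-C
restatements that compile on their own outside the x264 tree; any concrete
values in them are invented for illustration, not taken from the patch.

1) The repacked MV limits. Replacing mv_min_fpel[2]/mv_max_fpel[2] with one
aligned int16_t[2][2] is what lets the asm fetch all four limits in a single
movq; the sketch below spells out the layout and what the shuffles do with it.

#include <stdint.h>
#include <stdalign.h>

/* Field order mirrors the comment in common.h: min_x, min_y, max_x, max_y.
 * ALIGNED_8 is x264's alignment macro; C11 alignas stands in for it here. */
typedef struct
{
    alignas(8) int16_t mv_limit_fpel[2][2]; /* [0]={min_x,min_y}, [1]={max_x,max_y} */
} mb_limits;

/* After "movq (%2), %%mm5":
 *   mm5 = { min_x, min_y, max_x, max_y }
 *   "pshufw $0xEE, %%mm5, %%mm6"  ->  mm6 = { max_x, max_y, max_x, max_y }
 *   "punpckldq %%mm5, %%mm5"      ->  mm5 = { min_x, min_y, min_x, min_y }
 * so a single pmaxsw/pminsw pair clamps two packed MVs per loop iteration,
 * and the single-MV tail at label 2 can use the un-duplicated low half as-is.
 * predictor_clip additionally does "psllw $2, %%mm5" up front, scaling the
 * fullpel limits to qpel, since its candidates keep subpel precision. */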
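
2) The branchless winner tracking in me.c. Both the subme>=3 and subme<3
paths now pick the best predictor by packing (cost << 4) + index and letting
COPY1_IF_LT (x264's keep-the-smaller macro, typically compiled to a cmov)
track the minimum; this works as long as the candidate count fits in the low
4 bits. A compilable sketch with made-up costs:

#include <stdio.h>

#define COPY1_IF_LT(x,y)\
    if( (y) < (x) )\
        (x) = (y);

int main( void )
{
    /* Pretend these came from fpelcmp + BITS_MVD for three clipped
     * candidates, and that 900 is the cost of the MVP itself. */
    int costs[3] = { 850, 870, 860 };

    /* Seed with the MVP tagged as index 0, exactly like "bcost <<= 4":
     * the cost lives in the high bits and the index rides in the low 4
     * bits, so comparing packed values compares costs first, ties keep the
     * earliest candidate, and no branch depends on who is currently
     * winning. */
    int best = 900 << 4;
    for( int i = 0; i < 3; i++ )
        COPY1_IF_LT( best, (costs[i] << 4) + (i + 1) );

    /* Index 0 means the MVP won. In me.c the index also selects the winning
     * MV from mvc_temp[], which is why pmv is stuffed into mvc_temp[1]
     * before the loop. */
    printf( "winner: index %d, cost %d\n", best & 15, best >> 4 );
    return 0;
}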
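
3) The candidate elimination itself. The code below is the predictor_clip
logic from common.h with x264's helpers expanded so it builds standalone; it
doubles as a reference model for checking the MMX version. The limits and
candidates in main() are invented, and the little-endian assumption matches
what the pack16to32_mask comparisons already rely on.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static int clip3( int v, int lo, int hi )
{
    return v < lo ? lo : v > hi ? hi : v;
}

static int predictor_clip( int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc,
                           int16_t mv_limit[2][2], uint32_t pmv )
{
    int cnt = 0;
    int qpel_limit[4] = { mv_limit[0][0] << 2, mv_limit[0][1] << 2,
                          mv_limit[1][0] << 2, mv_limit[1][1] << 2 };
    for( int i = 0; i < i_mvc; i++ )
    {
        uint32_t mv;
        memcpy( &mv, mvc[i], 4 );  /* stands in for M32() */
        if( !mv || mv == pmv )
            continue;              /* drop (0,0) and pmv duplicates */
        dst[cnt][0] = clip3( mvc[i][0], qpel_limit[0], qpel_limit[2] );
        dst[cnt][1] = clip3( mvc[i][1], qpel_limit[1], qpel_limit[3] );
        cnt++;
    }
    return cnt;
}

int main( void )
{
    int16_t mv_limit[2][2] = { { -16, -16 }, { 16, 16 } }; /* fullpel min/max */
    int16_t mvc[4][2] = { { 0, 0 }, { 3, 1 }, { 200, -200 }, { 3, 1 } };
    int16_t dst[4][2];
    uint32_t pmv = (3 & 0xFFFF) | ((uint32_t)1 << 16); /* pack16to32_mask(3,1) */

    int n = predictor_clip( dst, mvc, 4, mv_limit, pmv );
    /* (0,0) and both pmv copies are dropped; (200,-200) is clamped to the
     * qpel window, so exactly one candidate, (64,-64), survives. */
    printf( "%d candidate(s): (%d,%d)\n", n, dst[0][0], dst[0][1] );
    return 0;
}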