From: Laurent Aimar Date: Wed, 22 Sep 2004 07:07:48 +0000 (+0000) Subject: * all: Patches by Loren Merritt: X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=48e288644ed493a77b5935e90010973a4e15faf6;p=libx264 * all: Patches by Loren Merritt: "Improved patch. Now supports subpel ME on all candidate MB types, not just on the winner. subpel_refine: (completely different scale from before) 0 => halfpel only 1 => 1 iteration of qpel on the winner (same as x264 r46) 2 => 2 iterations of qpel (about the same as my earlier patch, but faster) 3 => halfpel on all MB types, qpel on the winner 4 => qpel on all 5 => more iterations benchmarks: mencoder dvd://1 -ovc x264 -x264encopts qp_constant=19:fullinter:cabac:iframe=200:psnr subpel_refine=1: PSNR Global:46.82 kb/s:1048.1 fps:17.335 subpel_refine=2: PSNR Global:46.83 kb/s:1034.4 fps:16.970 subpel_refine=3: PSNR Global:46.84 kb/s:1023.3 fps:14.770 subpel_refine=4: PSNR Global:46.87 kb/s:1010.8 fps:11.598 subpel_refine=5: PSNR Global:46.88 kb/s:1006.9 fps:10.824" And "The current code for calculating the cost of encoding which reference frame a MB is predicted from introduces a bias towards ref0 and against P16x16. Removing this bias produces an improvement of 0.4% - 2% bitrate, depending on content and number of reference frames." 
git-svn-id: svn://svn.videolan.org/x264/trunk@47 df754926-b1dd-0310-bc7b-ec298dee348c --- diff --git a/core/common.c b/core/common.c index b8a157ed..403c0764 100644 --- a/core/common.c +++ b/core/common.c @@ -96,6 +96,7 @@ void x264_param_default( x264_param_t *param ) /* */ param->analyse.intra = X264_ANALYSE_I4x4; param->analyse.inter = X264_ANALYSE_I4x4 | X264_ANALYSE_PSUB16x16; + param->analyse.i_subpel_refine = 1; param->analyse.b_psnr = 1; } diff --git a/encoder/analyse.c b/encoder/analyse.c index 27cf9938..41c561d9 100644 --- a/encoder/analyse.c +++ b/encoder/analyse.c @@ -25,6 +25,7 @@ #include #include #include +#include #include "../core/common.h" #include "../core/macroblock.h" @@ -464,15 +465,8 @@ static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a ) // m.mvc[0] = 0; // m.mvc[1] = 0; - /* ME for ref 0 */ - m.p_fref = h->mb.pic.p_fref[0][0][0]; - x264_mb_predict_mv_16x16( h, 0, 0, m.mvp ); - x264_me_search( h, &m ); - - a->l0.i_ref = 0; - a->l0.me16x16 = m; - - for( i_ref = 1; i_ref < h->i_ref0; i_ref++ ) + a->l0.me16x16.cost = INT_MAX; + for( i_ref = 0; i_ref < h->i_ref0; i_ref++ ) { /* search with ref */ m.p_fref = h->mb.pic.p_fref[0][i_ref][0]; @@ -489,6 +483,9 @@ static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a ) } } + /* subtract ref cost, so we don't have to add it for the other P types */ + a->l0.me16x16.cost -= m.lm * bs_size_te( h->sh.i_num_ref_idx_l0_active - 1, a->l0.i_ref ); + /* Set global ref, needed for all others modes */ x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref ); } @@ -765,15 +762,9 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a ) m.b_mvc = 0; m.i_mv_range = a->i_mv_range; - /* ME for List 0 ref 0 */ - m.p_fref = h->mb.pic.p_fref[0][0][0]; - x264_mb_predict_mv_16x16( h, 0, 0, m.mvp ); - x264_me_search( h, &m ); - - a->l0.i_ref = 0; - a->l0.me16x16 = m; - - for( i_ref = 1; i_ref < h->i_ref0; i_ref++ ) + /* ME for List 0 */ + a->l0.me16x16.cost 
= INT_MAX; + for( i_ref = 0; i_ref < h->i_ref0; i_ref++ ) { /* search with ref */ m.p_fref = h->mb.pic.p_fref[0][i_ref][0]; @@ -790,15 +781,9 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a ) } } - /* ME for list 1 ref 0 */ - m.p_fref = h->mb.pic.p_fref[1][0][0]; - x264_mb_predict_mv_16x16( h, 1, 0, m.mvp ); - x264_me_search( h, &m ); - - a->l1.i_ref = 0; - a->l1.me16x16 = m; - - for( i_ref = 1; i_ref < h->i_ref1; i_ref++ ) + /* ME for list 1 */ + a->l1.me16x16.cost = INT_MAX; + for( i_ref = 0; i_ref < h->i_ref1; i_ref++ ) { /* search with ref */ m.p_fref = h->mb.pic.p_fref[1][i_ref][0]; diff --git a/encoder/encoder.c b/encoder/encoder.c index 5fdf21a9..517d730a 100644 --- a/encoder/encoder.c +++ b/encoder/encoder.c @@ -347,6 +347,11 @@ x264_t *x264_encoder_open ( x264_param_t *param ) h->param.i_cabac_init_idc = x264_clip3( h->param.i_cabac_init_idc, -1, 2 ); + if( param->analyse.i_subpel_refine < 0 ) + param->analyse.i_subpel_refine = 0; + if( param->analyse.i_subpel_refine > 5 ) + param->analyse.i_subpel_refine = 5; + /* VUI */ if( h->param.vui.i_sar_width > 0 && h->param.vui.i_sar_height > 0 ) { diff --git a/encoder/me.c b/encoder/me.c index 86396f3b..90131515 100644 --- a/encoder/me.c +++ b/encoder/me.c @@ -28,6 +28,20 @@ #include "../core/common.h" #include "me.h" +/* presets selected from good points on the speed-vs-quality curve of several test videos + * subpel_iters[i_subpel_refine] = { refine_hpel, refine_qpel, me_hpel, me_qpel } + * where me_* are the number of EPZS iterations run on all candidate block types, + * and refine_* are run only on the winner. 
*/ +const static int subpel_iterations[][4] = + {{1,0,0,0}, + {1,1,0,0}, + {1,2,0,0}, + {0,2,1,0}, + {0,2,1,1}, + {0,2,1,2}}; + +static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_iters ); + void x264_me_search( x264_t *h, x264_me_t *m ) { const int i_pixel = m->i_pixel; @@ -35,6 +49,7 @@ void x264_me_search( x264_t *h, x264_me_t *m ) int bmx, bmy; uint8_t *p_fref = m->p_fref; int i_iter; + int hpel, qpel; /* init with mvp */ @@ -118,9 +133,22 @@ void x264_me_search( x264_t *h, x264_me_t *m ) m->cost = h->pixf.satd[i_pixel]( m->p_fenc, m->i_stride, p_fref, m->i_stride ) + m->lm * ( bs_size_se( m->mv[0] - m->mvp[0] ) + bs_size_se( m->mv[1] - m->mvp[1] ) ); + + hpel = subpel_iterations[h->param.analyse.i_subpel_refine][2]; + qpel = subpel_iterations[h->param.analyse.i_subpel_refine][3]; + if( hpel || qpel ) + refine_subpel( h, m, hpel, qpel ); } void x264_me_refine_qpel( x264_t *h, x264_me_t *m ) +{ + int hpel = subpel_iterations[h->param.analyse.i_subpel_refine][0]; + int qpel = subpel_iterations[h->param.analyse.i_subpel_refine][1]; + if( hpel || qpel ) + refine_subpel( h, m, hpel, qpel ); +} + +static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_iters ) { const int bw = x264_pixel_size[m->i_pixel].w; const int bh = x264_pixel_size[m->i_pixel].h; @@ -128,66 +156,47 @@ void x264_me_refine_qpel( x264_t *h, x264_me_t *m ) DECLARE_ALIGNED( uint8_t, pix[4][16*16], 16 ); int cost[4]; int best; + int step, i; int bmx = m->mv[0]; int bmy = m->mv[1]; - h->mc[MC_LUMA]( m->p_fref, m->i_stride, pix[0], 16, bmx + 0, bmy - 2, bw, bh ); - h->mc[MC_LUMA]( m->p_fref, m->i_stride, pix[1], 16, bmx + 0, bmy + 2, bw, bh ); - h->mc[MC_LUMA]( m->p_fref, m->i_stride, pix[2], 16, bmx - 2, bmy + 0, bw, bh ); - h->mc[MC_LUMA]( m->p_fref, m->i_stride, pix[3], 16, bmx + 2, bmy + 0, bw, bh ); - - cost[0] = h->pixf.satd[m->i_pixel]( m->p_fenc, m->i_stride, pix[0], 16 ) + - m->lm * ( bs_size_se( bmx + 0 - m->mvp[0] ) + bs_size_se( bmy - 2 - 
m->mvp[1] ) ); - cost[1] = h->pixf.satd[m->i_pixel]( m->p_fenc, m->i_stride, pix[1], 16 ) + - m->lm * ( bs_size_se( bmx + 0 - m->mvp[0] ) + bs_size_se( bmy + 2 - m->mvp[1] ) ); - cost[2] = h->pixf.satd[m->i_pixel]( m->p_fenc, m->i_stride, pix[2], 16 ) + - m->lm * ( bs_size_se( bmx - 2 - m->mvp[0] ) + bs_size_se( bmy + 0 - m->mvp[1] ) ); - cost[3] = h->pixf.satd[m->i_pixel]( m->p_fenc, m->i_stride, pix[3], 16 ) + - m->lm * ( bs_size_se( bmx + 2 - m->mvp[0] ) + bs_size_se( bmy + 0 - m->mvp[1] ) ); - - best = 0; - if( cost[1] < cost[0] ) best = 1; - if( cost[2] < cost[best] ) best = 2; - if( cost[3] < cost[best] ) best = 3; - - if( cost[best] < m->cost ) + for( step = 2; step >= 1; step-- ) { - m->cost = cost[best]; - if( best == 0 ) bmy -= 2; - else if( best == 1 ) bmy += 2; - else if( best == 2 ) bmx -= 2; - else if( best == 3 ) bmx += 2; - } - - h->mc[MC_LUMA]( m->p_fref, m->i_stride, pix[0], 16, bmx + 0, bmy - 1, bw, bh ); - h->mc[MC_LUMA]( m->p_fref, m->i_stride, pix[1], 16, bmx + 0, bmy + 1, bw, bh ); - h->mc[MC_LUMA]( m->p_fref, m->i_stride, pix[2], 16, bmx - 1, bmy + 0, bw, bh ); - h->mc[MC_LUMA]( m->p_fref, m->i_stride, pix[3], 16, bmx + 1, bmy + 0, bw, bh ); - - cost[0] = h->pixf.satd[m->i_pixel]( m->p_fenc, m->i_stride, pix[0], 16 ) + - m->lm * ( bs_size_se( bmx + 0 - m->mvp[0] ) + bs_size_se( bmy - 1 - m->mvp[1] ) ); - cost[1] = h->pixf.satd[m->i_pixel]( m->p_fenc, m->i_stride, pix[1], 16 ) + - m->lm * ( bs_size_se( bmx + 0 - m->mvp[0] ) + bs_size_se( bmy + 1 - m->mvp[1] ) ); - cost[2] = h->pixf.satd[m->i_pixel]( m->p_fenc, m->i_stride, pix[2], 16 ) + - m->lm * ( bs_size_se( bmx - 1 - m->mvp[0] ) + bs_size_se( bmy + 0 - m->mvp[1] ) ); - cost[3] = h->pixf.satd[m->i_pixel]( m->p_fenc, m->i_stride, pix[3], 16 ) + - m->lm * ( bs_size_se( bmx + 1 - m->mvp[0] ) + bs_size_se( bmy + 0 - m->mvp[1] ) ); - - best = 0; - if( cost[1] < cost[0] ) best = 1; - if( cost[2] < cost[best] ) best = 2; - if( cost[3] < cost[best] ) best = 3; - - if( cost[best] < m->cost ) - { - 
m->cost = cost[best]; - if( best == 0 ) bmy--; - else if( best == 1 ) bmy++; - else if( best == 2 ) bmx--; - else if( best == 3 ) bmx++; + for( i = step>1 ? hpel_iters : qpel_iters; i > 0; i-- ) + { + h->mc[MC_LUMA]( m->p_fref, m->i_stride, pix[0], 16, bmx + 0, bmy - step, bw, bh ); + h->mc[MC_LUMA]( m->p_fref, m->i_stride, pix[1], 16, bmx + 0, bmy + step, bw, bh ); + h->mc[MC_LUMA]( m->p_fref, m->i_stride, pix[2], 16, bmx - step, bmy + 0, bw, bh ); + h->mc[MC_LUMA]( m->p_fref, m->i_stride, pix[3], 16, bmx + step, bmy + 0, bw, bh ); + + cost[0] = h->pixf.satd[m->i_pixel]( m->p_fenc, m->i_stride, pix[0], 16 ) + + m->lm * ( bs_size_se( bmx + 0 - m->mvp[0] ) + bs_size_se( bmy - step - m->mvp[1] ) ); + cost[1] = h->pixf.satd[m->i_pixel]( m->p_fenc, m->i_stride, pix[1], 16 ) + + m->lm * ( bs_size_se( bmx + 0 - m->mvp[0] ) + bs_size_se( bmy + step - m->mvp[1] ) ); + cost[2] = h->pixf.satd[m->i_pixel]( m->p_fenc, m->i_stride, pix[2], 16 ) + + m->lm * ( bs_size_se( bmx - step - m->mvp[0] ) + bs_size_se( bmy + 0 - m->mvp[1] ) ); + cost[3] = h->pixf.satd[m->i_pixel]( m->p_fenc, m->i_stride, pix[3], 16 ) + + m->lm * ( bs_size_se( bmx + step - m->mvp[0] ) + bs_size_se( bmy + 0 - m->mvp[1] ) ); + + best = 0; + if( cost[1] < cost[0] ) best = 1; + if( cost[2] < cost[best] ) best = 2; + if( cost[3] < cost[best] ) best = 3; + + if( cost[best] < m->cost ) + { + m->cost = cost[best]; + if( best == 0 ) bmy -= step; + else if( best == 1 ) bmy += step; + else if( best == 2 ) bmx -= step; + else if( best == 3 ) bmx += step; + } + else break; + } } m->mv[0] = bmx; m->mv[1] = bmy; } + diff --git a/x264.c b/x264.c index c7c6df96..4c662d8e 100644 --- a/x264.c +++ b/x264.c @@ -132,6 +132,7 @@ static void Help( void ) " - i4x4\n" " - psub16x16,psub8x8\n" " - none, all\n" + " --subme Subpixel motion estimation quality\n" "\n" " -s, --sar width:height Specify Sample Aspect Ratio\n" " -o, --output Specify output file\n" @@ -176,6 +177,7 @@ static int Parse( int argc, char **argv, #define 
OPT_QCOMP 266 #define OPT_NOPSNR 267 #define OPT_QUIET 268 +#define OPT_SUBME 269 static struct option long_options[] = { @@ -196,6 +198,7 @@ static int Parse( int argc, char **argv, { "sar", required_argument, NULL, 's' }, { "output", required_argument, NULL, 'o' }, { "analyse", required_argument, NULL, 'A' }, + { "subme", required_argument, NULL, OPT_SUBME }, { "rcsens", required_argument, NULL, OPT_RCSENS }, { "rcbuf", required_argument, NULL, OPT_RCBUF }, { "rcinitbuf",required_argument, NULL, OPT_RCIBUF }, @@ -304,6 +307,9 @@ static int Parse( int argc, char **argv, if( strstr( optarg, "psub16x16" ) ) param->analyse.inter |= X264_ANALYSE_PSUB16x16; if( strstr( optarg, "psub8x8" ) ) param->analyse.inter |= X264_ANALYSE_PSUB8x8; break; + case OPT_SUBME: + param->analyse.i_subpel_refine = atoi(optarg); + break; case OPT_RCBUF: param->rc.i_rc_buffer_size = atoi(optarg); break; diff --git a/x264.h b/x264.h index 76deec4b..8da79c82 100644 --- a/x264.h +++ b/x264.h @@ -124,6 +124,8 @@ typedef struct unsigned int intra; /* intra flags */ unsigned int inter; /* inter flags */ + int i_subpel_refine; /* subpixel motion estimation quality */ + int b_psnr; /* Do we compute PSNR stats (save a few % of cpu) */ } analyse;