From ecc9bfab548f464d4c2be899055f7ba567c1ed8e Mon Sep 17 00:00:00 2001 From: Fiona Glaser Date: Sun, 14 Sep 2008 21:36:45 -0700 Subject: [PATCH] Psychovisually optimized rate-distortion optimization and trellis The latter, psy-trellis, is disabled by default and is reserved as experimental; your mileage may vary. Default subme is raised to 6 so that psy RD is on by default. --- common/common.c | 20 +++++++- common/common.h | 12 +++++ common/dct.h | 11 +++++ encoder/analyse.c | 74 +++++++++++++++++++++++++++--- encoder/encoder.c | 21 +++++++++ encoder/macroblock.c | 22 ++++----- encoder/macroblock.h | 4 +- encoder/rdo.c | 106 +++++++++++++++++++++++++++++++++++-------- x264.c | 5 ++ x264.h | 4 +- 10 files changed, 240 insertions(+), 39 deletions(-) diff --git a/common/common.c b/common/common.c index 138c5f3b..9d84bfaa 100644 --- a/common/common.c +++ b/common/common.c @@ -116,8 +116,10 @@ void x264_param_default( x264_param_t *param ) | X264_ANALYSE_PSUB16x16 | X264_ANALYSE_BSUB16x16; param->analyse.i_direct_mv_pred = X264_DIRECT_PRED_SPATIAL; param->analyse.i_me_method = X264_ME_HEX; + param->analyse.f_psy_rd = 1.0; + param->analyse.f_psy_trellis = 0; param->analyse.i_me_range = 16; - param->analyse.i_subpel_refine = 5; + param->analyse.i_subpel_refine = 6; param->analyse.b_chroma_me = 1; param->analyse.i_mv_range_thread = -1; param->analyse.i_mv_range = -1; // set from level_idc @@ -470,6 +472,21 @@ int x264_param_parse( x264_param_t *p, const char *name, const char *value ) p->analyse.i_mv_range_thread = atoi(value); OPT2("subme", "subq") p->analyse.i_subpel_refine = atoi(value); + OPT("psy-rd") /* accepts "rd:trellis", "rd,trellis" or a lone "rd"; unparsable input disables both */ + { + if( 2 == sscanf( value, "%f:%f", &p->analyse.f_psy_rd, &p->analyse.f_psy_trellis ) || + 2 == sscanf( value, "%f,%f", &p->analyse.f_psy_rd, &p->analyse.f_psy_trellis ) ) + { } + else if( 1 == sscanf( value, "%f", &p->analyse.f_psy_rd ) ) /* sscanf yields EOF (nonzero) on empty input, so require exactly one conversion */ + { + p->analyse.f_psy_trellis = 0; + } + else + { + p->analyse.f_psy_rd = 0; + p->analyse.f_psy_trellis = 0; + } + } OPT("bime") 
p->analyse.b_bidir_me = atobool(value); OPT("chroma-me") @@ -824,6 +841,7 @@ char *x264_param2string( x264_param_t *p, int b_res ) s += sprintf( s, " analyse=%#x:%#x", p->analyse.intra, p->analyse.inter ); s += sprintf( s, " me=%s", x264_motion_est_names[ p->analyse.i_me_method ] ); s += sprintf( s, " subme=%d", p->analyse.i_subpel_refine ); + s += sprintf( s, " psy_rd=%.1f:%.1f", p->analyse.f_psy_rd, p->analyse.f_psy_trellis ); s += sprintf( s, " brdo=%d", p->analyse.b_bframe_rdo ); s += sprintf( s, " mixed_ref=%d", p->analyse.b_mixed_references ); s += sprintf( s, " me_range=%d", p->analyse.i_me_range ); diff --git a/common/common.h b/common/common.h index 90c7cdb4..37b0b205 100644 --- a/common/common.h +++ b/common/common.h @@ -381,6 +381,8 @@ struct x264_t int b_chroma_me; int b_trellis; int b_noise_reduction; + int i_psy_rd; /* Psy RD strength--fixed point value*/ + int i_psy_trellis; /* Psy trellis strength--fixed point value*/ int b_interlaced; @@ -462,6 +464,16 @@ struct x264_t DECLARE_ALIGNED_16( int16_t i8x8_dct_buf[3][64] ); DECLARE_ALIGNED_16( int16_t i4x4_dct_buf[15][16] ); + /* Psy trellis DCT data */ + DECLARE_ALIGNED_16( int16_t fenc_dct8[4][64] ); + DECLARE_ALIGNED_16( int16_t fenc_dct4[16][16] ); + + /* Psy RD SATD scores */ + int fenc_satd[4][4]; + int fenc_satd_sum; + int fenc_sa8d[2][2]; + int fenc_sa8d_sum; + /* pointer over mb of the frame to be compressed */ uint8_t *p_fenc[3]; diff --git a/common/dct.h b/common/dct.h index 1078023d..daa96f4a 100644 --- a/common/dct.h +++ b/common/dct.h @@ -41,6 +41,17 @@ static const uint16_t x264_dct8_weight_tab[64] = { }; #undef W +#define W(i) (i==0 ? FIX8(1.76777) :\ + i==1 ? FIX8(1.11803) :\ + i==2 ? FIX8(0.70711) :0) +static const uint16_t x264_dct4_weight_tab[16] = { + W(0), W(1), W(0), W(1), + W(1), W(2), W(1), W(2), + W(0), W(1), W(0), W(1), + W(1), W(2), W(1), W(2) +}; +#undef W + /* inverse squared */ #define W(i) (i==0 ? FIX8(3.125) :\ i==1 ? 
FIX8(1.25) :\ diff --git a/encoder/analyse.c b/encoder/analyse.c index 10399690..8f7b4fa9 100644 --- a/encoder/analyse.c +++ b/encoder/analyse.c @@ -467,6 +467,58 @@ static void predict_4x4_mode_available( unsigned int i_neighbour, } } +/* Cache the zigzag-scanned DCT of the source macroblock (taken against an all-zero block) for psy trellis. For trellis=2, we need to do this for both sizes of DCT, for trellis=1 we only need to use it on the chosen mode. */ +static inline void x264_psy_trellis_init( x264_t *h, int do_both_dct ) +{ + DECLARE_ALIGNED_16( int16_t dct8x8[4][8][8] ); + DECLARE_ALIGNED_16( int16_t dct4x4[16][4][4] ); + DECLARE_ALIGNED_16( uint8_t zero[16*FDEC_STRIDE] ) = {0}; + int i; + + if( do_both_dct || h->mb.b_transform_8x8 ) + { + h->dctf.sub16x16_dct8( dct8x8, h->mb.pic.p_fenc[0], zero ); + for( i = 0; i < 4; i++ ) + h->zigzagf.scan_8x8( h->mb.pic.fenc_dct8[i], dct8x8[i] ); + } + if( do_both_dct || !h->mb.b_transform_8x8 ) + { + h->dctf.sub16x16_dct( dct4x4, h->mb.pic.p_fenc[0], zero ); + for( i = 0; i < 16; i++ ) + h->zigzagf.scan_4x4( h->mb.pic.fenc_dct4[i], dct4x4[i] ); + } +} + +/* Pre-calculate fenc satd scores for psy RD, minus DC coefficients */ +static inline void x264_mb_cache_fenc_satd( x264_t *h ) +{ + DECLARE_ALIGNED_16(uint8_t zero[16]) = {0}; + uint8_t *fenc; + int x, y, satd_sum = 0, sa8d_sum = 0; + if( h->param.analyse.i_trellis == 2 && h->mb.i_psy_trellis ) + x264_psy_trellis_init( h, h->param.analyse.b_transform_8x8 ); + if( !h->mb.i_psy_rd ) + return; + for( y = 0; y < 4; y++ ) + for( x = 0; x < 4; x++ ) + { + fenc = h->mb.pic.p_fenc[0]+x*4+y*4*FENC_STRIDE; + h->mb.pic.fenc_satd[y][x] = h->pixf.satd[PIXEL_4x4]( zero, 0, fenc, FENC_STRIDE ) + - (h->pixf.sad[PIXEL_4x4]( zero, 0, fenc, FENC_STRIDE )>>1); + satd_sum += h->mb.pic.fenc_satd[y][x]; + } + for( y = 0; y < 2; y++ ) + for( x = 0; x < 2; x++ ) + { + fenc = h->mb.pic.p_fenc[0]+x*8+y*8*FENC_STRIDE; + h->mb.pic.fenc_sa8d[y][x] = h->pixf.sa8d[PIXEL_8x8]( zero, 0, fenc, FENC_STRIDE ) + - (h->pixf.sad[PIXEL_8x8]( zero, 0, fenc, FENC_STRIDE )>>2); + sa8d_sum += 
h->mb.pic.fenc_sa8d[y][x]; + } + h->mb.pic.fenc_satd_sum = satd_sum; + h->mb.pic.fenc_sa8d_sum = sa8d_sum; +} + static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a ) { int i; @@ -1017,12 +1069,15 @@ static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a ) assert( a->l0.me16x16.mv[1] <= h->mb.mv_max_spel[1] || h->param.i_threads == 1 ); h->mb.i_type = P_L0; - if( a->b_mbrd && a->l0.me16x16.i_ref == 0 - && *(uint32_t*)a->l0.me16x16.mv == *(uint32_t*)h->mb.cache.pskip_mv ) + if( a->b_mbrd ) { - h->mb.i_partition = D_16x16; - x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv ); - a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 ); + x264_mb_cache_fenc_satd( h ); + if( a->l0.me16x16.i_ref == 0 && *(uint32_t*)a->l0.me16x16.mv == *(uint32_t*)h->mb.cache.pskip_mv ) + { + h->mb.i_partition = D_16x16; + x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv ); + a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 ); + } } } @@ -1907,7 +1962,7 @@ static void x264_mb_analyse_p_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd ) static void x264_mb_analyse_b_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter ) { - int thresh = i_satd_inter * 17/16; + int thresh = i_satd_inter * (17 + (!!h->mb.i_psy_rd))/16; if( a->b_direct_available && a->i_rd16x16direct == COST_MAX ) { @@ -2066,6 +2121,8 @@ void x264_macroblock_analyse( x264_t *h ) /*--------------------------- Do the analysis ---------------------------*/ if( h->sh.i_type == SLICE_TYPE_I ) { + if( analysis.b_mbrd ) + x264_mb_cache_fenc_satd( h ); x264_mb_analyse_intra( h, &analysis, COST_MAX ); if( analysis.b_mbrd ) x264_intra_rd( h, &analysis, COST_MAX ); @@ -2344,6 +2401,9 @@ void x264_macroblock_analyse( x264_t *h ) int i_bskip_cost = COST_MAX; int b_skip = 0; + if( analysis.b_mbrd ) + x264_mb_cache_fenc_satd( h ); + h->mb.i_type = B_SKIP; if( h->mb.b_direct_auto_write ) { @@ -2589,6 +2649,8 @@ void x264_macroblock_analyse( x264_t *h ) h->mb.b_trellis 
= h->param.analyse.i_trellis; h->mb.b_noise_reduction = !!h->param.analyse.i_noise_reduction; + if( !IS_SKIP(h->mb.i_type) && h->mb.i_psy_trellis && h->param.analyse.i_trellis == 1 ) + x264_psy_trellis_init( h, 0 ); if( h->mb.b_trellis == 1 || h->mb.b_noise_reduction ) h->mb.i_skip_intra = 0; } diff --git a/encoder/encoder.c b/encoder/encoder.c index e918352f..5a8869d4 100644 --- a/encoder/encoder.c +++ b/encoder/encoder.c @@ -410,6 +410,7 @@ static int x264_validate_parameters( x264_t *h ) h->param.analyse.i_trellis = 0; h->param.analyse.b_fast_pskip = 0; h->param.analyse.i_noise_reduction = 0; + h->param.analyse.f_psy_rd = 0; } if( h->param.rc.i_rc_method == X264_RC_CQP ) { @@ -488,6 +489,26 @@ static int x264_validate_parameters( x264_t *h ) if( !h->param.b_cabac ) h->param.analyse.i_trellis = 0; h->param.analyse.i_trellis = x264_clip3( h->param.analyse.i_trellis, 0, 2 ); + if( !h->param.analyse.i_trellis ) + h->param.analyse.f_psy_trellis = 0; + h->param.analyse.f_psy_rd = x264_clip3f( h->param.analyse.f_psy_rd, 0, 10 ); + h->param.analyse.f_psy_trellis = x264_clip3f( h->param.analyse.f_psy_trellis, 0, 10 ); + if( h->param.analyse.i_subpel_refine < 6 ) + h->param.analyse.f_psy_rd = 0; + h->mb.i_psy_rd = FIX8( h->param.analyse.f_psy_rd ); + /* Psy RDO increases overall quantizers to improve the quality of luma--this indirectly hurts chroma quality, */ + /* so we lower the chroma QP offset to compensate. */ + /* This can be triggered repeatedly on multiple calls to x264_validate_parameters, but since encoding + * uses the pps chroma qp offset, not the param chroma qp offset, this is not a problem. */ + if( h->mb.i_psy_rd ) + h->param.analyse.i_chroma_qp_offset -= h->param.analyse.f_psy_rd < 0.25 ? 1 : 2; + h->mb.i_psy_trellis = FIX8( h->param.analyse.f_psy_trellis / 4 ); + /* Psy trellis has a similar effect. */ + if( h->mb.i_psy_trellis ) + h->param.analyse.i_chroma_qp_offset -= h->param.analyse.f_psy_trellis < 0.25 ? 
1 : 2; + else + h->mb.i_psy_trellis = 0; + h->param.analyse.i_chroma_qp_offset = x264_clip3(h->param.analyse.i_chroma_qp_offset, -12, 12); h->param.rc.i_aq_mode = x264_clip3( h->param.rc.i_aq_mode, 0, 1 ); if( h->param.rc.f_aq_strength <= 0 ) h->param.rc.i_aq_mode = 0; diff --git a/encoder/macroblock.c b/encoder/macroblock.c index a353ce71..e877ccbb 100644 --- a/encoder/macroblock.c +++ b/encoder/macroblock.c @@ -79,20 +79,20 @@ static int x264_mb_decimate_score( int16_t *dct, int i_max ) return i_score; } -static ALWAYS_INLINE void x264_quant_4x4( x264_t *h, int16_t dct[4][4], int i_qp, int i_ctxBlockCat, int b_intra ) +static ALWAYS_INLINE void x264_quant_4x4( x264_t *h, int16_t dct[4][4], int i_qp, int i_ctxBlockCat, int b_intra, int idx ) { int i_quant_cat = b_intra ? CQM_4IY : CQM_4PY; if( h->mb.b_trellis ) - x264_quant_4x4_trellis( h, dct, i_quant_cat, i_qp, i_ctxBlockCat, b_intra ); + x264_quant_4x4_trellis( h, dct, i_quant_cat, i_qp, i_ctxBlockCat, b_intra, idx ); else h->quantf.quant_4x4( dct, h->quant4_mf[i_quant_cat][i_qp], h->quant4_bias[i_quant_cat][i_qp] ); } -static ALWAYS_INLINE void x264_quant_8x8( x264_t *h, int16_t dct[8][8], int i_qp, int b_intra ) +static ALWAYS_INLINE void x264_quant_8x8( x264_t *h, int16_t dct[8][8], int i_qp, int b_intra, int idx ) { int i_quant_cat = b_intra ? 
CQM_8IY : CQM_8PY; if( h->mb.b_trellis ) - x264_quant_8x8_trellis( h, dct, i_quant_cat, i_qp, b_intra ); + x264_quant_8x8_trellis( h, dct, i_quant_cat, i_qp, b_intra, idx ); else h->quantf.quant_8x8( dct, h->quant8_mf[i_quant_cat][i_qp], h->quant8_bias[i_quant_cat][i_qp] ); } @@ -111,7 +111,7 @@ void x264_mb_encode_i4x4( x264_t *h, int idx, int i_qp ) h->dctf.sub4x4_dct( dct4x4, p_src, p_dst ); - x264_quant_4x4( h, dct4x4, i_qp, DCT_LUMA_4x4, 1 ); + x264_quant_4x4( h, dct4x4, i_qp, DCT_LUMA_4x4, 1, idx ); if( array_non_zero( dct4x4 ) ) { @@ -135,7 +135,7 @@ void x264_mb_encode_i8x8( x264_t *h, int idx, int i_qp ) h->dctf.sub8x8_dct8( dct8x8, p_src, p_dst ); - x264_quant_8x8( h, dct8x8, i_qp, 1 ); + x264_quant_8x8( h, dct8x8, i_qp, 1, idx ); h->zigzagf.scan_8x8( h->dct.luma8x8[idx], dct8x8 ); h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[CQM_8IY], i_qp ); @@ -174,7 +174,7 @@ static void x264_mb_encode_i16x16( x264_t *h, int i_qp ) dct4x4[i][0][0] = 0; /* quant/scan/dequant */ - x264_quant_4x4( h, dct4x4[i], i_qp, DCT_LUMA_AC, 1 ); + x264_quant_4x4( h, dct4x4[i], i_qp, DCT_LUMA_AC, 1, i ); h->zigzagf.scan_4x4( h->dct.luma4x4[i], dct4x4[i] ); h->quantf.dequant_4x4( dct4x4[i], h->dequant4_mf[CQM_4IY], i_qp ); @@ -453,7 +453,7 @@ void x264_macroblock_encode( x264_t *h ) { if( h->mb.b_noise_reduction ) h->quantf.denoise_dct( *dct8x8[idx], h->nr_residual_sum[1], h->nr_offset[1], 64 ); - x264_quant_8x8( h, dct8x8[idx], i_qp, 0 ); + x264_quant_8x8( h, dct8x8[idx], i_qp, 0, idx ); h->zigzagf.scan_8x8( h->dct.luma8x8[idx], dct8x8[idx] ); @@ -498,7 +498,7 @@ void x264_macroblock_encode( x264_t *h ) if( h->mb.b_noise_reduction ) h->quantf.denoise_dct( *dct4x4[idx], h->nr_residual_sum[0], h->nr_offset[0], 16 ); - x264_quant_4x4( h, dct4x4[idx], i_qp, DCT_LUMA_4x4, 0 ); + x264_quant_4x4( h, dct4x4[idx], i_qp, DCT_LUMA_4x4, 0, idx ); h->zigzagf.scan_4x4( h->dct.luma4x4[idx], dct4x4[idx] ); @@ -777,7 +777,7 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 ) { 
DECLARE_ALIGNED_16( int16_t dct8x8[8][8] ); h->dctf.sub8x8_dct8( dct8x8, p_fenc, p_fdec ); - x264_quant_8x8( h, dct8x8, i_qp, 0 ); + x264_quant_8x8( h, dct8x8, i_qp, 0, i8 ); h->zigzagf.scan_8x8( h->dct.luma8x8[i8], dct8x8 ); if( b_decimate && !h->mb.b_trellis ) @@ -797,7 +797,7 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 ) DECLARE_ALIGNED_16( int16_t dct4x4[4][4][4] ); h->dctf.sub8x8_dct( dct4x4, p_fenc, p_fdec ); for( i4 = 0; i4 < 4; i4++ ) - x264_quant_4x4( h, dct4x4[i4], i_qp, DCT_LUMA_4x4, 0 ); + x264_quant_4x4( h, dct4x4[i4], i_qp, DCT_LUMA_4x4, 0, i8*4+i4 ); for( i4 = 0; i4 < 4; i4++ ) h->zigzagf.scan_4x4( h->dct.luma4x4[i8*4+i4], dct4x4[i4] ); diff --git a/encoder/macroblock.h b/encoder/macroblock.h index e2b9d318..7035aa21 100644 --- a/encoder/macroblock.h +++ b/encoder/macroblock.h @@ -50,9 +50,9 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp ); void x264_cabac_mb_skip( x264_t *h, int b_skip ); void x264_quant_4x4_trellis( x264_t *h, int16_t dct[4][4], int i_quant_cat, - int i_qp, int i_ctxBlockCat, int b_intra ); + int i_qp, int i_ctxBlockCat, int b_intra, int idx ); void x264_quant_8x8_trellis( x264_t *h, int16_t dct[8][8], int i_quant_cat, - int i_qp, int b_intra ); + int i_qp, int b_intra, int idx ); void x264_noise_reduction_update( x264_t *h ); diff --git a/encoder/rdo.c b/encoder/rdo.c index a8a2386e..f169e9a4 100644 --- a/encoder/rdo.c +++ b/encoder/rdo.c @@ -51,20 +51,78 @@ static uint16_t cabac_prefix_size[15][128]; #define COPY_CABAC h->mc.memcpy_aligned( &cabac_tmp.f8_bits_encoded, &h->cabac.f8_bits_encoded, \ sizeof(x264_cabac_t) - offsetof(x264_cabac_t,f8_bits_encoded) ) -static int ssd_mb( x264_t *h ) + +#define ADD_ABS_SATD(satdtype, pixel)\ + satd += abs((h->pixf.satdtype[pixel]( zero, 0, fdec, FDEC_STRIDE ) - dc_coef)\ + - sum_##satdtype( h, pixel, x, y )); + +/* Sum the cached SATDs to avoid repeating them. 
*/ +static inline int sum_satd( x264_t *h, int pixel, int x, int y ) +{ + int satd = 0; + int min_x = x>>2; /* pixel offset -> index into the grid of cached per-4x4-block scores */ + int min_y = y>>2; + int max_x = (x>>2) + (x264_pixel_size[pixel].w>>2); + int max_y = (y>>2) + (x264_pixel_size[pixel].h>>2); + if( pixel == PIXEL_16x16 ) /* whole macroblock: return the cached total instead of re-summing */ + return h->mb.pic.fenc_satd_sum; + for( y = min_y; y < max_y; y++ ) + for( x = min_x; x < max_x; x++ ) + satd += h->mb.pic.fenc_satd[y][x]; + return satd; +} + +static inline int sum_sa8d( x264_t *h, int pixel, int x, int y ) /* same as sum_satd, over the cached per-8x8-block SA8D scores */ +{ + int sa8d = 0; + int min_x = x>>3; + int min_y = y>>3; + int max_x = (x>>3) + (x264_pixel_size[pixel].w>>3); + int max_y = (y>>3) + (x264_pixel_size[pixel].h>>3); + if( pixel == PIXEL_16x16 ) + return h->mb.pic.fenc_sa8d_sum; + for( y = min_y; y < max_y; y++ ) + for( x = min_x; x < max_x; x++ ) + sa8d += h->mb.pic.fenc_sa8d[y][x]; + return sa8d; +} + +/* Psy RD distortion metric: SSD plus "Absolute Difference of Complexities" */ +/* SATD and SA8D are used to measure block complexity. */ +/* Both the SATD and the SA8D differences are used, to avoid bias from the DCT size. Using SATD */ +/* only, for example, results in overuse of 8x8dct, while the opposite occurs when using only SA8D. */ + +/* FIXME: Is there a better metric than averaged SATD/SA8D difference for complexity difference? */ +/* Hadamard transform is recursive, so a SATD+SA8D can be done faster by taking advantage of this fact. */ +/* This optimization can also be used in non-RD transform decision. 
*/ + +static inline int ssd_plane( x264_t *h, int size, int p, int x, int y ) { - return h->pixf.ssd[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, - h->mb.pic.p_fdec[0], FDEC_STRIDE ) - + h->pixf.ssd[PIXEL_8x8]( h->mb.pic.p_fenc[1], FENC_STRIDE, - h->mb.pic.p_fdec[1], FDEC_STRIDE ) - + h->pixf.ssd[PIXEL_8x8]( h->mb.pic.p_fenc[2], FENC_STRIDE, - h->mb.pic.p_fdec[2], FDEC_STRIDE ); + DECLARE_ALIGNED_16(uint8_t zero[16]) = {0}; + int satd = 0; + uint8_t *fdec = h->mb.pic.p_fdec[p] + x + y*FDEC_STRIDE; + uint8_t *fenc = h->mb.pic.p_fenc[p] + x + y*FENC_STRIDE; + if( p == 0 && h->mb.i_psy_rd ) + { + int dc_coef = h->pixf.sad[size](zero, 0, fdec, FDEC_STRIDE) >> 1; + ADD_ABS_SATD(satd, size); + /* If the plane is smaller than 8x8, we can't do an SA8D; this probably isn't a big problem. */ + if(size <= PIXEL_8x8) + { + dc_coef >>= 1; + ADD_ABS_SATD(sa8d, size); + satd >>= 1; + } + satd = (satd * h->mb.i_psy_rd * x264_lambda_tab[h->mb.i_qp] + 128) >> 8; + } + return h->pixf.ssd[size](fenc, FENC_STRIDE, fdec, FDEC_STRIDE) + satd; } -static int ssd_plane( x264_t *h, int size, int p, int x, int y ) +static inline int ssd_mb( x264_t *h ) { - return h->pixf.ssd[size]( h->mb.pic.p_fenc[p] + x+y*FENC_STRIDE, FENC_STRIDE, - h->mb.pic.p_fdec[p] + x+y*FDEC_STRIDE, FDEC_STRIDE ); + return ssd_plane(h, PIXEL_16x16, 0, 0, 0) + + ssd_plane(h, PIXEL_8x8, 1, 0, 0) + + ssd_plane(h, PIXEL_8x8, 2, 0, 0); } static int x264_rd_cost_mb( x264_t *h, int i_lambda2 ) @@ -269,7 +327,7 @@ static const int lambda2_tab[2][52] = { }; typedef struct { - uint64_t score; + int64_t score; int level_idx; // index into level_tree[] uint8_t cabac_state[10]; //just the contexts relevant to coding abs_level_m1 } trellis_node_t; @@ -298,7 +356,7 @@ typedef struct { static inline void quant_trellis_cabac( x264_t *h, int16_t *dct, const uint16_t *quant_mf, const int *unquant_mf, const int *coef_weight, const uint8_t *zigzag, - int i_ctxBlockCat, int i_lambda2, int b_ac, int i_coefs ) + int i_ctxBlockCat, int 
i_lambda2, int b_ac, int i_coefs, int idx ) { int abs_coefs[64], signs[64]; trellis_node_t nodes[2][8]; @@ -430,8 +488,20 @@ static inline void quant_trellis_cabac( x264_t *h, int16_t *dct, // that are better left coded, especially at QP > 40. for( abs_level = q; abs_level >= q-1; abs_level-- ) { - int d = i_coef - ((unquant_mf[zigzag[i]] * abs_level + 128) >> 8); - uint64_t ssd = (int64_t)d*d * coef_weight[i]; + int unquant_abs_level = ((unquant_mf[zigzag[i]] * abs_level + 128) >> 8); + int d = i_coef - unquant_abs_level; + int64_t ssd; + /* Psy trellis: bias in favor of higher AC coefficients in the reconstructed frame. The DC coefficient (i == 0) is left unbiased. */ + if( h->mb.i_psy_trellis && i ) + { + int orig_coef = (i_coefs == 64) ? h->mb.pic.fenc_dct8[idx][i] : h->mb.pic.fenc_dct4[idx][i]; + int predicted_coef = orig_coef - i_coef * signs[i]; + int psy_value = h->mb.i_psy_trellis * abs(predicted_coef + unquant_abs_level * signs[i]); + int psy_weight = (i_coefs == 64) ? x264_dct8_weight_tab[zigzag[i]] : x264_dct4_weight_tab[zigzag[i]]; + ssd = (int64_t)d*d * coef_weight[i] - (int64_t)psy_weight * psy_value; /* widen first: the int product can overflow for large coefficients at high psy-trellis strength */ + } + else + ssd = (int64_t)d*d * coef_weight[i]; for( j = 0; j < 8; j++ ) { @@ -495,24 +565,24 @@ static inline void quant_trellis_cabac( x264_t *h, int16_t *dct, void x264_quant_4x4_trellis( x264_t *h, int16_t dct[4][4], int i_quant_cat, - int i_qp, int i_ctxBlockCat, int b_intra ) + int i_qp, int i_ctxBlockCat, int b_intra, int idx ) { int b_ac = (i_ctxBlockCat == DCT_LUMA_AC); quant_trellis_cabac( h, (int16_t*)dct, h->quant4_mf[i_quant_cat][i_qp], h->unquant4_mf[i_quant_cat][i_qp], x264_dct4_weight2_zigzag[h->mb.b_interlaced], x264_zigzag_scan4[h->mb.b_interlaced], - i_ctxBlockCat, lambda2_tab[b_intra][i_qp], b_ac, 16 ); + i_ctxBlockCat, lambda2_tab[b_intra][i_qp], b_ac, 16, idx ); } void x264_quant_8x8_trellis( x264_t *h, int16_t dct[8][8], int i_quant_cat, - int i_qp, int b_intra ) + int i_qp, int b_intra, int idx ) { quant_trellis_cabac( h, (int16_t*)dct, h->quant8_mf[i_quant_cat][i_qp], 
h->unquant8_mf[i_quant_cat][i_qp], x264_dct8_weight2_zigzag[h->mb.b_interlaced], x264_zigzag_scan8[h->mb.b_interlaced], - DCT_LUMA_8x8, lambda2_tab[b_intra][i_qp], 0, 64 ); + DCT_LUMA_8x8, lambda2_tab[b_intra][i_qp], 0, 64, idx ); } diff --git a/x264.c b/x264.c index e8e9f435..b6341d82 100644 --- a/x264.c +++ b/x264.c @@ -251,6 +251,10 @@ static void Help( x264_param_t *defaults, int b_longhelp ) H0( " -m, --subme Subpixel motion estimation and partition\n" " decision quality: 1=fast, 7=best. [%d]\n", defaults->analyse.i_subpel_refine ); H0( " --b-rdo RD based mode decision for B-frames. Requires subme 6.\n" ); + H0( " --psy-rd Strength of psychovisual optimization [\"%.1f:%.1f\"]\n" + " #1: RDO (requires subme>=6)\n" + " #2: Trellis (requires trellis, experimental)\n", + defaults->analyse.f_psy_rd,defaults->analyse.f_psy_trellis ); H0( " --mixed-refs Decide references on a per partition basis\n" ); H1( " --no-chroma-me Ignore chroma in motion estimation\n" ); H1( " --bime Jointly optimize both MVs in B-frames\n" ); @@ -420,6 +424,7 @@ static int Parse( int argc, char **argv, { "mvrange", required_argument, NULL, 0 }, { "mvrange-thread", required_argument, NULL, 0 }, { "subme", required_argument, NULL, 'm' }, + { "psy-rd", required_argument, NULL, 0 }, { "b-rdo", no_argument, NULL, 0 }, { "mixed-refs", no_argument, NULL, 0 }, { "no-chroma-me", no_argument, NULL, 0 }, diff --git a/x264.h b/x264.h index 839645bc..4d81bced 100644 --- a/x264.h +++ b/x264.h @@ -35,7 +35,7 @@ #include -#define X264_BUILD 63 +#define X264_BUILD 64 /* x264_t: * opaque handler for encoder */ @@ -241,6 +241,8 @@ typedef struct x264_param_t int b_fast_pskip; /* early SKIP detection on P-frames */ int b_dct_decimate; /* transform coefficient thresholding on P-frames */ int i_noise_reduction; /* adaptive pseudo-deadzone */ + float f_psy_rd; /* Psy RD strength */ + float f_psy_trellis; /* Psy trellis strength */ /* the deadzone size that will be used in luma quantization */ int 
i_luma_deadzone[2]; /* {inter, intra} */ -- 2.40.0