From c583687fab832ba7eaf8626048f05ad1f861a855 Mon Sep 17 00:00:00 2001 From: Fiona Glaser Date: Thu, 23 Dec 2010 19:33:01 -0500 Subject: [PATCH] VFR/framerate-aware ratecontrol, part 2 MB-tree and qcomp complexity estimation now consider the duration of a frame in their calculations. This is very important for visual optimizations, as frames that last longer are inherently more important quality-wise. Improves VFR-aware PSNR by as much as 1-2 dB on extreme test cases, ~0.5 dB on more ordinary VFR clips (e.g. deduped anime episodes). WARNING: This change redefines x264's internal quality measurement. x264 will now scale its quality based on the framerate of the video due to the aforementioned frame duration logic. That is, --crf X will give lower quality per frame for a 60fps video than for a 30fps one. This will make --crf closer to constant perceptual quality than previously. The "center" for this change is 25fps: that is, videos below 25fps will go up in quality at the same CRF and videos above will go down. This choice is completely arbitrary. Note that to take full advantage of this, x264 must encode your video at the correct framerate, with the correct timestamps. 
--- common/mc.c | 25 +++++------------ common/mc.h | 2 +- common/x86/const-a.asm | 1 - common/x86/mc-a2.asm | 64 ++++++++++++++++++++++-------------------- common/x86/mc-c.c | 2 +- encoder/ratecontrol.c | 15 +++++----- encoder/ratecontrol.h | 10 +++++++ encoder/slicetype.c | 42 +++++++++++++++++---------- tools/checkasm.c | 45 ++++++++++++++++------------- 9 files changed, 112 insertions(+), 94 deletions(-) diff --git a/common/mc.c b/common/mc.c index 5f8c260b..b0b38e92 100644 --- a/common/mc.c +++ b/common/mc.c @@ -431,30 +431,19 @@ static void frame_init_lowres_core( pixel *src0, pixel *dst0, pixel *dsth, pixel } } -#if defined(__GNUC__) && (ARCH_X86 || ARCH_X86_64) -// gcc isn't smart enough to use the "idiv" instruction -static ALWAYS_INLINE int32_t div_64_32(int64_t x, int32_t y) -{ - int32_t quotient, remainder; - asm("idiv %4" - :"=a"(quotient), "=d"(remainder) - :"a"((uint32_t)x), "d"((int32_t)(x>>32)), "r"(y) - ); - return quotient; -} -#else -#define div_64_32(x,y) ((x)/(y)) -#endif - /* Estimate the total amount of influence on future quality that could be had if we * were to improve the reference samples used to inter predict any given macroblock. 
*/ static void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, uint16_t *intra_costs, - uint16_t *inter_costs, uint16_t *inv_qscales, int len ) + uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len ) { + float fps = *fps_factor / 256.f; for( int i = 0; i < len; i++ ) { - int propagate_amount = propagate_in[i] + ((intra_costs[i] * inv_qscales[i] + 128)>>8); - dst[i] = div_64_32((int64_t)propagate_amount * (intra_costs[i] - (inter_costs[i] & LOWRES_COST_MASK)), intra_costs[i]); + float intra_cost = intra_costs[i] * inv_qscales[i]; + float propagate_amount = propagate_in[i] + intra_cost*fps; + float propagate_num = intra_costs[i] - (inter_costs[i] & LOWRES_COST_MASK); + float propagate_denom = intra_costs[i]; + dst[i] = (int)(propagate_amount * propagate_num / propagate_denom + 0.5f); } } diff --git a/common/mc.h b/common/mc.h index 92d0ded5..2a96fa6a 100644 --- a/common/mc.h +++ b/common/mc.h @@ -123,7 +123,7 @@ typedef struct void (*weight_cache)( x264_t *, x264_weight_t * ); void (*mbtree_propagate_cost)( int *dst, uint16_t *propagate_in, uint16_t *intra_costs, - uint16_t *inter_costs, uint16_t *inv_qscales, int len ); + uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len ); } x264_mc_functions_t; void x264_mc_init( int cpu, x264_mc_functions_t *pf ); diff --git a/common/x86/const-a.asm b/common/x86/const-a.asm index d2c7fe69..f01856d1 100644 --- a/common/x86/const-a.asm +++ b/common/x86/const-a.asm @@ -51,7 +51,6 @@ const pw_pixel_max,times 8 dw ((1 << BIT_DEPTH)-1) const pd_1, times 4 dd 1 const pd_32, times 4 dd 32 -const pd_128, times 4 dd 128 const pd_ffff, times 4 dd 0xffff const pw_00ff, times 8 dw 0x00ff const pw_ff00, times 8 dw 0xff00 diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm index 3a1ea14f..bb639fe2 100644 --- a/common/x86/mc-a2.asm +++ b/common/x86/mc-a2.asm @@ -40,6 +40,7 @@ deinterleave_shuf: db 0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15 pd_16: times 4 dd 16 pd_0f: times 4 dd 0xffff 
+pf_inv256: times 4 dd 0.00390625 pad10: times 8 dw 10*PIXEL_MAX pad20: times 8 dw 20*PIXEL_MAX @@ -59,7 +60,6 @@ cextern pw_32 cextern pw_00ff cextern pw_3fff cextern pw_pixel_max -cextern pd_128 cextern pd_ffff %macro LOAD_ADD 4 @@ -1649,47 +1649,49 @@ FRAME_INIT_LOWRES ssse3 ;----------------------------------------------------------------------------- ; void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, uint16_t *intra_costs, -; uint16_t *inter_costs, uint16_t *inv_qscales, int len ) +; uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len ) ;----------------------------------------------------------------------------- -cglobal mbtree_propagate_cost_sse2, 6,6,7 - shl r5d, 1 - lea r0, [r0+r5*2] - add r1, r5 - add r2, r5 - add r3, r5 - add r4, r5 - neg r5 - pxor xmm5, xmm5 - movdqa xmm6, [pw_3fff] - movdqa xmm4, [pd_128] +cglobal mbtree_propagate_cost_sse2, 7,7,7 + shl r6d, 1 + lea r0, [r0+r6*2] + add r1, r6 + add r2, r6 + add r3, r6 + add r4, r6 + neg r6 + pxor xmm4, xmm4 + movss xmm6, [r5] + shufps xmm6, xmm6, 0 + mulps xmm6, [pf_inv256] + movdqa xmm5, [pw_3fff] .loop: - movq xmm2, [r2+r5] ; intra - movq xmm0, [r4+r5] ; invq - movq xmm3, [r3+r5] ; inter - movq xmm1, [r1+r5] ; prop - punpcklwd xmm2, xmm5 - punpcklwd xmm0, xmm5 + movq xmm2, [r2+r6] ; intra + movq xmm0, [r4+r6] ; invq + movq xmm3, [r3+r6] ; inter + movq xmm1, [r1+r6] ; prop + punpcklwd xmm2, xmm4 + punpcklwd xmm0, xmm4 pmaddwd xmm0, xmm2 - pand xmm3, xmm6 - punpcklwd xmm1, xmm5 - punpcklwd xmm3, xmm5 - paddd xmm0, xmm4 - psrld xmm0, 8 ; intra*invq>>8 - paddd xmm0, xmm1 ; prop + (intra*invq>>8) + pand xmm3, xmm5 + punpcklwd xmm1, xmm4 + punpcklwd xmm3, xmm4 + cvtdq2ps xmm0, xmm0 + mulps xmm0, xmm6 ; intra*invq*fps_factor>>8 + cvtdq2ps xmm1, xmm1 ; prop + addps xmm0, xmm1 ; prop + (intra*invq*fps_factor>>8) cvtdq2ps xmm1, xmm2 ; intra psubd xmm2, xmm3 ; intra - inter + cvtdq2ps xmm2, xmm2 ; intra - inter rcpps xmm3, xmm1 ; 1 / intra 1st approximation - cvtdq2ps xmm0, 
xmm0 mulps xmm1, xmm3 ; intra * (1/intra 1st approx) - cvtdq2ps xmm2, xmm2 mulps xmm1, xmm3 ; intra * (1/intra 1st approx)^2 - mulps xmm0, xmm2 ; (prop + (intra*invq>>8)) * (intra - inter) + mulps xmm0, xmm2 ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter) addps xmm3, xmm3 ; 2 * (1/intra 1st approx) subps xmm3, xmm1 ; 2nd approximation for 1/intra mulps xmm0, xmm3 ; / intra - cvttps2dq xmm0, xmm0 ; truncation isn't really desired, but matches the integer implementation - movdqa [r0+r5*2], xmm0 - add r5, 8 + cvtps2dq xmm0, xmm0 + movdqa [r0+r6*2], xmm0 + add r6, 8 jl .loop REP_RET diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c index 881f2d77..cdd9d572 100644 --- a/common/x86/mc-c.c +++ b/common/x86/mc-c.c @@ -124,7 +124,7 @@ void x264_integral_init8v_mmx( uint16_t *sum8, int stride ); void x264_integral_init8v_sse2( uint16_t *sum8, int stride ); void x264_integral_init4v_ssse3( uint16_t *sum8, uint16_t *sum4, int stride ); void x264_mbtree_propagate_cost_sse2( int *dst, uint16_t *propagate_in, uint16_t *intra_costs, - uint16_t *inter_costs, uint16_t *inv_qscales, int len ); + uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len ); #define MC_CHROMA(cpu)\ void x264_mc_chroma_##cpu( pixel *dstu, pixel *dstv, int i_dst,\ diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c index 4de6ab51..2d587150 100644 --- a/encoder/ratecontrol.c +++ b/encoder/ratecontrol.c @@ -1603,9 +1603,7 @@ int x264_ratecontrol_end( x264_t *h, int bits, int *filler ) rc->cplxr_sum += bits * qp2qscale( rc->qpa_rc ) / (rc->last_rceq * fabs( h->param.rc.f_pb_factor )); } rc->cplxr_sum *= rc->cbr_decay; - double frame_duration = (double)h->fenc->i_duration * h->sps->vui.i_num_units_in_tick / h->sps->vui.i_time_scale; - - rc->wanted_bits_window += frame_duration * rc->bitrate; + rc->wanted_bits_window += h->fenc->f_duration * rc->bitrate; rc->wanted_bits_window *= rc->cbr_decay; } @@ -2184,7 +2182,7 @@ static float rate_estimate_qscale( x264_t *h ) rcc->last_satd 
= x264_rc_analyse_slice( h ); rcc->short_term_cplxsum *= 0.5; rcc->short_term_cplxcount *= 0.5; - rcc->short_term_cplxsum += rcc->last_satd; + rcc->short_term_cplxsum += rcc->last_satd / (CLIP_DURATION(h->fenc->f_duration) / BASE_FRAME_DURATION); rcc->short_term_cplxcount ++; rce.tex_bits = rcc->last_satd; @@ -2541,10 +2539,11 @@ static int init_pass2( x264_t *h ) { x264_ratecontrol_t *rcc = h->rc; uint64_t all_const_bits = 0; + double timescale = (double)h->sps->vui.i_num_units_in_tick / h->sps->vui.i_time_scale; double duration = 0; for( int i = 0; i < rcc->num_entries; i++ ) duration += rcc->entry[i].i_duration; - duration *= (double)h->sps->vui.i_num_units_in_tick / h->sps->vui.i_time_scale; + duration *= timescale; uint64_t all_available_bits = h->param.rc.i_bitrate * 1000. * duration; double rate_factor, step_mult; double qblur = h->param.rc.f_qblur; @@ -2583,21 +2582,23 @@ static int init_pass2( x264_t *h ) for( int j = 1; j < cplxblur*2 && j < rcc->num_entries-i; j++ ) { ratecontrol_entry_t *rcj = &rcc->entry[i+j]; + double frame_duration = CLIP_DURATION(rcj->i_duration * timescale) / BASE_FRAME_DURATION; weight *= 1 - pow( (float)rcj->i_count / rcc->nmb, 2 ); if( weight < .0001 ) break; gaussian_weight = weight * exp( -j*j/200.0 ); weight_sum += gaussian_weight; - cplx_sum += gaussian_weight * (qscale2bits(rcj, 1) - rcj->misc_bits); + cplx_sum += gaussian_weight * (qscale2bits( rcj, 1 ) - rcj->misc_bits) / frame_duration; } /* weighted average of cplx of past frames */ weight = 1.0; for( int j = 0; j <= cplxblur*2 && j <= i; j++ ) { ratecontrol_entry_t *rcj = &rcc->entry[i-j]; + double frame_duration = CLIP_DURATION(rcj->i_duration * timescale) / BASE_FRAME_DURATION; gaussian_weight = weight * exp( -j*j/200.0 ); weight_sum += gaussian_weight; - cplx_sum += gaussian_weight * (qscale2bits( rcj, 1 ) - rcj->misc_bits); + cplx_sum += gaussian_weight * (qscale2bits( rcj, 1 ) - rcj->misc_bits) / frame_duration; weight *= 1 - pow( (float)rcj->i_count / rcc->nmb, 2 
); if( weight < .0001 ) break; diff --git a/encoder/ratecontrol.h b/encoder/ratecontrol.h index 03c82cb0..28e6a3d7 100644 --- a/encoder/ratecontrol.h +++ b/encoder/ratecontrol.h @@ -27,6 +27,16 @@ #ifndef X264_RATECONTROL_H #define X264_RATECONTROL_H +/* Completely arbitrary. Ratecontrol lowers relative quality at higher framerates + * and the reverse at lower framerates; this serves as the center of the curve. */ +#define BASE_FRAME_DURATION (0.04f) + +/* Arbitrary limitations as a sanity check. */ +#define MAX_FRAME_DURATION 1.00f +#define MIN_FRAME_DURATION 0.01f + +#define CLIP_DURATION(f) x264_clip3f(f,MIN_FRAME_DURATION,MAX_FRAME_DURATION) + int x264_ratecontrol_new ( x264_t * ); void x264_ratecontrol_delete( x264_t * ); diff --git a/encoder/slicetype.c b/encoder/slicetype.c index cc0193bb..97cf61e3 100644 --- a/encoder/slicetype.c +++ b/encoder/slicetype.c @@ -748,9 +748,10 @@ static int x264_slicetype_frame_cost_recalculate( x264_t *h, x264_frame_t **fram return i_score; } -static void x264_macroblock_tree_finish( x264_t *h, x264_frame_t *frame, int ref0_distance ) +static void x264_macroblock_tree_finish( x264_t *h, x264_frame_t *frame, float average_duration, int ref0_distance ) { - x264_emms(); + int fps_factor_intra = round( CLIP_DURATION(frame->f_duration) / BASE_FRAME_DURATION * 256 ); + int fps_factor_propagate = round( CLIP_DURATION( average_duration) / BASE_FRAME_DURATION * 256 ); float weightdelta = 0.0; if( ref0_distance && frame->f_weighted_cost_delta[ref0_distance-1] > 0 ) weightdelta = (1.0 - frame->f_weighted_cost_delta[ref0_distance-1]); @@ -760,17 +761,18 @@ static void x264_macroblock_tree_finish( x264_t *h, x264_frame_t *frame, int ref float strength = 5.0f * (1.0f - h->param.rc.f_qcompress); for( int mb_index = 0; mb_index < h->mb.i_mb_count; mb_index++ ) { - int intra_cost = (frame->i_intra_cost[mb_index] * frame->i_inv_qscale_factor[mb_index]+128)>>8; + int intra_cost = (frame->i_intra_cost[mb_index] * 
frame->i_inv_qscale_factor[mb_index] + 128) >> 8; + int intra_cost_scaled = (intra_cost * fps_factor_intra + 128) >> 8; if( intra_cost ) { - int propagate_cost = frame->i_propagate_cost[mb_index]; - float log2_ratio = x264_log2(intra_cost + propagate_cost) - x264_log2(intra_cost) + weightdelta; + int propagate_cost = (frame->i_propagate_cost[mb_index] * fps_factor_propagate + 128) >> 8; + float log2_ratio = x264_log2(intra_cost_scaled + propagate_cost) - x264_log2(intra_cost) + weightdelta; frame->f_qp_offset[mb_index] = frame->f_qp_offset_aq[mb_index] - strength * log2_ratio; } } } -static void x264_macroblock_tree_propagate( x264_t *h, x264_frame_t **frames, int p0, int p1, int b, int referenced ) +static void x264_macroblock_tree_propagate( x264_t *h, x264_frame_t **frames, float average_duration, int p0, int p1, int b, int referenced ) { uint16_t *ref_costs[2] = {frames[p0]->i_propagate_cost,frames[p1]->i_propagate_cost}; int dist_scale_factor = ( ((b-p0) << 8) + ((p1-p0) >> 1) ) / (p1-p0); @@ -780,6 +782,9 @@ static void x264_macroblock_tree_propagate( x264_t *h, x264_frame_t **frames, in int *buf = h->scratch_buffer; uint16_t *propagate_cost = frames[b]->i_propagate_cost; + x264_emms(); + float fps_factor = CLIP_DURATION(frames[b]->f_duration) / CLIP_DURATION(average_duration); + /* For non-reffed frames the source costs are always zero, so just memset one row and re-use it. 
*/ if( !referenced ) memset( frames[b]->i_propagate_cost, 0, h->mb.i_mb_width * sizeof(uint16_t) ); @@ -789,7 +794,7 @@ static void x264_macroblock_tree_propagate( x264_t *h, x264_frame_t **frames, in int mb_index = h->mb.i_mb_y*h->mb.i_mb_stride; h->mc.mbtree_propagate_cost( buf, propagate_cost, frames[b]->i_intra_cost+mb_index, frames[b]->lowres_costs[b-p0][p1-b]+mb_index, - frames[b]->i_inv_qscale_factor+mb_index, h->mb.i_mb_width ); + frames[b]->i_inv_qscale_factor+mb_index, &fps_factor, h->mb.i_mb_width ); if( referenced ) propagate_cost += h->mb.i_mb_width; for( h->mb.i_mb_x = 0; h->mb.i_mb_x < h->mb.i_mb_width; h->mb.i_mb_x++, mb_index++ ) @@ -858,7 +863,7 @@ static void x264_macroblock_tree_propagate( x264_t *h, x264_frame_t **frames, in } if( h->param.rc.i_vbv_buffer_size && h->param.rc.i_lookahead && referenced ) - x264_macroblock_tree_finish( h, frames[b], b == p1 ? b - p0 : 0 ); + x264_macroblock_tree_finish( h, frames[b], average_duration, b == p1 ? b - p0 : 0 ); } static void x264_macroblock_tree( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, int num_frames, int b_intra ) @@ -866,6 +871,13 @@ static void x264_macroblock_tree( x264_t *h, x264_mb_analysis_t *a, x264_frame_t int idx = !b_intra; int last_nonb, cur_nonb = 1; int bframes = 0; + + x264_emms(); + float total_duration = 0.0; + for( int j = 0; j <= num_frames; j++ ) + total_duration += frames[j]->f_duration; + float average_duration = total_duration / (num_frames + 1); + int i = num_frames; if( b_intra ) @@ -918,34 +930,34 @@ static void x264_macroblock_tree( x264_t *h, x264_mb_analysis_t *a, x264_frame_t if( i != middle ) { x264_slicetype_frame_cost( h, a, frames, p0, p1, i, 0 ); - x264_macroblock_tree_propagate( h, frames, p0, p1, i, 0 ); + x264_macroblock_tree_propagate( h, frames, average_duration, p0, p1, i, 0 ); } i--; } - x264_macroblock_tree_propagate( h, frames, cur_nonb, last_nonb, middle, 1 ); + x264_macroblock_tree_propagate( h, frames, average_duration, cur_nonb, 
last_nonb, middle, 1 ); } else { while( i > cur_nonb ) { x264_slicetype_frame_cost( h, a, frames, cur_nonb, last_nonb, i, 0 ); - x264_macroblock_tree_propagate( h, frames, cur_nonb, last_nonb, i, 0 ); + x264_macroblock_tree_propagate( h, frames, average_duration, cur_nonb, last_nonb, i, 0 ); i--; } } - x264_macroblock_tree_propagate( h, frames, cur_nonb, last_nonb, last_nonb, 1 ); + x264_macroblock_tree_propagate( h, frames, average_duration, cur_nonb, last_nonb, last_nonb, 1 ); last_nonb = cur_nonb; } if( !h->param.rc.i_lookahead ) { - x264_macroblock_tree_propagate( h, frames, 0, last_nonb, last_nonb, 1 ); + x264_macroblock_tree_propagate( h, frames, average_duration, 0, last_nonb, last_nonb, 1 ); XCHG( uint16_t*, frames[last_nonb]->i_propagate_cost, frames[0]->i_propagate_cost ); } - x264_macroblock_tree_finish( h, frames[last_nonb], last_nonb ); + x264_macroblock_tree_finish( h, frames[last_nonb], average_duration, last_nonb ); if( h->param.i_bframe_pyramid && bframes > 1 && !h->param.rc.i_vbv_buffer_size ) - x264_macroblock_tree_finish( h, frames[last_nonb+(bframes+1)/2], 0 ); + x264_macroblock_tree_finish( h, frames[last_nonb+(bframes+1)/2], average_duration, 0 ); } static int x264_vbv_frame_cost( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, int p0, int p1, int b ) diff --git a/tools/checkasm.c b/tools/checkasm.c index c552ab9b..7c7faa72 100644 --- a/tools/checkasm.c +++ b/tools/checkasm.c @@ -1236,29 +1236,34 @@ static int check_mc( int cpu_ref, int cpu_new ) if( mc_a.mbtree_propagate_cost != mc_ref.mbtree_propagate_cost ) { - ok = 1; used_asm = 1; - set_func_name( "mbtree_propagate" ); - int *dsta = (int*)buf3; - int *dstc = dsta+400; - uint16_t *prop = (uint16_t*)buf1; - uint16_t *intra = (uint16_t*)buf4; - uint16_t *inter = intra+400; - uint16_t *qscale = inter+400; - uint16_t *rnd = (uint16_t*)buf2; x264_emms(); - for( int i = 0; i < 400; i++ ) + for( int i = 0; i < 10; i++ ) { - intra[i] = *rnd++ & 0x7fff; - intra[i] += !intra[i]; - inter[i] 
= *rnd++ & 0x7fff; - qscale[i] = *rnd++ & 0x7fff; + float fps_factor = (rand()&65535) / 256.; + ok = 1; used_asm = 1; + set_func_name( "mbtree_propagate" ); + int *dsta = (int*)buf3; + int *dstc = dsta+400; + uint16_t *prop = (uint16_t*)buf1; + uint16_t *intra = (uint16_t*)buf4; + uint16_t *inter = intra+100; + uint16_t *qscale = inter+100; + uint16_t *rnd = (uint16_t*)buf2; + x264_emms(); + for( int j = 0; j < 100; j++ ) + { + intra[j] = *rnd++ & 0x7fff; + intra[j] += !intra[j]; + inter[j] = *rnd++ & 0x7fff; + qscale[j] = *rnd++ & 0x7fff; + } + call_c( mc_c.mbtree_propagate_cost, dstc, prop, intra, inter, qscale, &fps_factor, 100 ); + call_a( mc_a.mbtree_propagate_cost, dsta, prop, intra, inter, qscale, &fps_factor, 100 ); + // I don't care about exact rounding, this is just how close the floating-point implementation happens to be + x264_emms(); + for( int j = 0; j < 100; j++ ) + ok &= abs( dstc[j]-dsta[j] ) <= 1 || fabs( (double)dstc[j]/dsta[j]-1 ) < 1e-4; } - call_c( mc_c.mbtree_propagate_cost, dstc, prop, intra, inter, qscale, 400 ); - call_a( mc_a.mbtree_propagate_cost, dsta, prop, intra, inter, qscale, 400 ); - // I don't care about exact rounding, this is just how close the floating-point implementation happens to be - x264_emms(); - for( int i = 0; i < 400; i++ ) - ok &= abs( dstc[i]-dsta[i] ) <= 1 || fabs( (double)dstc[i]/dsta[i]-1 ) < 1e-6; report( "mbtree propagate :" ); } -- 2.40.0