From c583687fab832ba7eaf8626048f05ad1f861a855 Mon Sep 17 00:00:00 2001
From: Fiona Glaser <fiona@x264.com>
Date: Thu, 23 Dec 2010 19:33:01 -0500
Subject: [PATCH] VFR/framerate-aware ratecontrol, part 2 MB-tree and qcomp
 complexity estimation now consider the duration of a frame in their
 calculations. This is very important for visual optimizations, as frames that
 last longer are inherently more important quality-wise. Improves VFR-aware
 PSNR as much as 1-2db on extreme test cases, ~0.5db on more ordinary VFR
 clips (e.g. deduped anime episodes).

WARNING: This change redefines x264's internal quality measurement.
x264 will now scale its quality based on the framerate of the video due to the aforementioned frame duration logic.
That is, --crf X will give lower quality per frame for a 60fps video than for a 30fps one.
This will make --crf closer to constant perceptual quality than previously.
The "center" for this change is 25fps: that is, videos lower than 25fps will go up in quality at the same CRF and videos above will go down.
This choice is completely arbitrary.

Note that to take full advantage of this, x264 must encode your video at the correct framerate, with the correct timestamps.
---
 common/mc.c            | 25 +++++------------
 common/mc.h            |  2 +-
 common/x86/const-a.asm |  1 -
 common/x86/mc-a2.asm   | 64 ++++++++++++++++++++++--------------------
 common/x86/mc-c.c      |  2 +-
 encoder/ratecontrol.c  | 15 +++++-----
 encoder/ratecontrol.h  | 10 +++++++
 encoder/slicetype.c    | 42 +++++++++++++++++----------
 tools/checkasm.c       | 45 ++++++++++++++++-------------
 9 files changed, 112 insertions(+), 94 deletions(-)

diff --git a/common/mc.c b/common/mc.c
index 5f8c260b..b0b38e92 100644
--- a/common/mc.c
+++ b/common/mc.c
@@ -431,30 +431,19 @@ static void frame_init_lowres_core( pixel *src0, pixel *dst0, pixel *dsth, pixel
     }
 }
 
-#if defined(__GNUC__) && (ARCH_X86 || ARCH_X86_64)
-// gcc isn't smart enough to use the "idiv" instruction
-static ALWAYS_INLINE int32_t div_64_32(int64_t x, int32_t y)
-{
-    int32_t quotient, remainder;
-    asm("idiv %4"
-        :"=a"(quotient), "=d"(remainder)
-        :"a"((uint32_t)x), "d"((int32_t)(x>>32)), "r"(y)
-    );
-    return quotient;
-}
-#else
-#define div_64_32(x,y) ((x)/(y))
-#endif
-
 /* Estimate the total amount of influence on future quality that could be had if we
  * were to improve the reference samples used to inter predict any given macroblock. */
 static void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
-                                   uint16_t *inter_costs, uint16_t *inv_qscales, int len )
+                                   uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len )
 {
+    float fps = *fps_factor / 256.f;
     for( int i = 0; i < len; i++ )
     {
-        int propagate_amount = propagate_in[i] + ((intra_costs[i] * inv_qscales[i] + 128)>>8);
-        dst[i] = div_64_32((int64_t)propagate_amount * (intra_costs[i] - (inter_costs[i] & LOWRES_COST_MASK)), intra_costs[i]);
+        float intra_cost       = intra_costs[i] * inv_qscales[i];
+        float propagate_amount = propagate_in[i] + intra_cost*fps;
+        float propagate_num    = intra_costs[i] - (inter_costs[i] & LOWRES_COST_MASK);
+        float propagate_denom  = intra_costs[i];
+        dst[i] = (int)(propagate_amount * propagate_num / propagate_denom + 0.5f);
     }
 }
 
diff --git a/common/mc.h b/common/mc.h
index 92d0ded5..2a96fa6a 100644
--- a/common/mc.h
+++ b/common/mc.h
@@ -123,7 +123,7 @@ typedef struct
     void (*weight_cache)( x264_t *, x264_weight_t * );
 
     void (*mbtree_propagate_cost)( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
-                                   uint16_t *inter_costs, uint16_t *inv_qscales, int len );
+                                   uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
 } x264_mc_functions_t;
 
 void x264_mc_init( int cpu, x264_mc_functions_t *pf );
diff --git a/common/x86/const-a.asm b/common/x86/const-a.asm
index d2c7fe69..f01856d1 100644
--- a/common/x86/const-a.asm
+++ b/common/x86/const-a.asm
@@ -51,7 +51,6 @@ const pw_pixel_max,times 8 dw ((1 << BIT_DEPTH)-1)
 
 const pd_1,        times 4 dd 1
 const pd_32,       times 4 dd 32
-const pd_128,      times 4 dd 128
 const pd_ffff,     times 4 dd 0xffff
 const pw_00ff,     times 8 dw 0x00ff
 const pw_ff00,     times 8 dw 0xff00
diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm
index 3a1ea14f..bb639fe2 100644
--- a/common/x86/mc-a2.asm
+++ b/common/x86/mc-a2.asm
@@ -40,6 +40,7 @@ deinterleave_shuf: db 0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15
 
 pd_16: times 4 dd 16
 pd_0f: times 4 dd 0xffff
+pf_inv256: times 4 dd 0.00390625
 
 pad10: times 8 dw    10*PIXEL_MAX
 pad20: times 8 dw    20*PIXEL_MAX
@@ -59,7 +60,6 @@ cextern pw_32
 cextern pw_00ff
 cextern pw_3fff
 cextern pw_pixel_max
-cextern pd_128
 cextern pd_ffff
 
 %macro LOAD_ADD 4
@@ -1649,47 +1649,49 @@ FRAME_INIT_LOWRES ssse3
 
 ;-----------------------------------------------------------------------------
 ; void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
-;                             uint16_t *inter_costs, uint16_t *inv_qscales, int len )
+;                             uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len )
 ;-----------------------------------------------------------------------------
-cglobal mbtree_propagate_cost_sse2, 6,6,7
-    shl r5d, 1
-    lea r0, [r0+r5*2]
-    add r1, r5
-    add r2, r5
-    add r3, r5
-    add r4, r5
-    neg r5
-    pxor      xmm5, xmm5
-    movdqa    xmm6, [pw_3fff]
-    movdqa    xmm4, [pd_128]
+cglobal mbtree_propagate_cost_sse2, 7,7,7
+    shl        r6d, 1
+    lea         r0, [r0+r6*2]
+    add         r1, r6
+    add         r2, r6
+    add         r3, r6
+    add         r4, r6
+    neg         r6
+    pxor      xmm4, xmm4
+    movss     xmm6, [r5]
+    shufps    xmm6, xmm6, 0
+    mulps     xmm6, [pf_inv256]
+    movdqa    xmm5, [pw_3fff]
 .loop:
-    movq      xmm2, [r2+r5] ; intra
-    movq      xmm0, [r4+r5] ; invq
-    movq      xmm3, [r3+r5] ; inter
-    movq      xmm1, [r1+r5] ; prop
-    punpcklwd xmm2, xmm5
-    punpcklwd xmm0, xmm5
+    movq      xmm2, [r2+r6] ; intra
+    movq      xmm0, [r4+r6] ; invq
+    movq      xmm3, [r3+r6] ; inter
+    movq      xmm1, [r1+r6] ; prop
+    punpcklwd xmm2, xmm4
+    punpcklwd xmm0, xmm4
     pmaddwd   xmm0, xmm2
-    pand      xmm3, xmm6
-    punpcklwd xmm1, xmm5
-    punpcklwd xmm3, xmm5
-    paddd     xmm0, xmm4
-    psrld     xmm0, 8       ; intra*invq>>8
-    paddd     xmm0, xmm1    ; prop + (intra*invq>>8)
+    pand      xmm3, xmm5
+    punpcklwd xmm1, xmm4
+    punpcklwd xmm3, xmm4
+    cvtdq2ps  xmm0, xmm0
+    mulps     xmm0, xmm6    ; intra*invq*fps_factor>>8
+    cvtdq2ps  xmm1, xmm1    ; prop
+    addps     xmm0, xmm1    ; prop + (intra*invq*fps_factor>>8)
     cvtdq2ps  xmm1, xmm2    ; intra
     psubd     xmm2, xmm3    ; intra - inter
+    cvtdq2ps  xmm2, xmm2    ; intra - inter
     rcpps     xmm3, xmm1    ; 1 / intra 1st approximation
-    cvtdq2ps  xmm0, xmm0
     mulps     xmm1, xmm3    ; intra * (1/intra 1st approx)
-    cvtdq2ps  xmm2, xmm2
     mulps     xmm1, xmm3    ; intra * (1/intra 1st approx)^2
-    mulps     xmm0, xmm2    ; (prop + (intra*invq>>8)) * (intra - inter)
+    mulps     xmm0, xmm2    ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter)
     addps     xmm3, xmm3    ; 2 * (1/intra 1st approx)
     subps     xmm3, xmm1    ; 2nd approximation for 1/intra
     mulps     xmm0, xmm3    ; / intra
-    cvttps2dq xmm0, xmm0    ; truncation isn't really desired, but matches the integer implementation
-    movdqa [r0+r5*2], xmm0
-    add r5, 8
+    cvtps2dq  xmm0, xmm0
+    movdqa [r0+r6*2], xmm0
+    add         r6, 8
     jl .loop
     REP_RET
 
diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c
index 881f2d77..cdd9d572 100644
--- a/common/x86/mc-c.c
+++ b/common/x86/mc-c.c
@@ -124,7 +124,7 @@ void x264_integral_init8v_mmx( uint16_t *sum8, int stride );
 void x264_integral_init8v_sse2( uint16_t *sum8, int stride );
 void x264_integral_init4v_ssse3( uint16_t *sum8, uint16_t *sum4, int stride );
 void x264_mbtree_propagate_cost_sse2( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
-                                      uint16_t *inter_costs, uint16_t *inv_qscales, int len );
+                                      uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
 
 #define MC_CHROMA(cpu)\
 void x264_mc_chroma_##cpu( pixel *dstu, pixel *dstv, int i_dst,\
diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c
index 4de6ab51..2d587150 100644
--- a/encoder/ratecontrol.c
+++ b/encoder/ratecontrol.c
@@ -1603,9 +1603,7 @@ int x264_ratecontrol_end( x264_t *h, int bits, int *filler )
             rc->cplxr_sum += bits * qp2qscale( rc->qpa_rc ) / (rc->last_rceq * fabs( h->param.rc.f_pb_factor ));
         }
         rc->cplxr_sum *= rc->cbr_decay;
-        double frame_duration = (double)h->fenc->i_duration * h->sps->vui.i_num_units_in_tick / h->sps->vui.i_time_scale;
-
-        rc->wanted_bits_window += frame_duration * rc->bitrate;
+        rc->wanted_bits_window += h->fenc->f_duration * rc->bitrate;
         rc->wanted_bits_window *= rc->cbr_decay;
     }
 
@@ -2184,7 +2182,7 @@ static float rate_estimate_qscale( x264_t *h )
             rcc->last_satd = x264_rc_analyse_slice( h );
             rcc->short_term_cplxsum *= 0.5;
             rcc->short_term_cplxcount *= 0.5;
-            rcc->short_term_cplxsum += rcc->last_satd;
+            rcc->short_term_cplxsum += rcc->last_satd / (CLIP_DURATION(h->fenc->f_duration) / BASE_FRAME_DURATION);
             rcc->short_term_cplxcount ++;
 
             rce.tex_bits = rcc->last_satd;
@@ -2541,10 +2539,11 @@ static int init_pass2( x264_t *h )
 {
     x264_ratecontrol_t *rcc = h->rc;
     uint64_t all_const_bits = 0;
+    double timescale = (double)h->sps->vui.i_num_units_in_tick / h->sps->vui.i_time_scale;
     double duration = 0;
     for( int i = 0; i < rcc->num_entries; i++ )
         duration += rcc->entry[i].i_duration;
-    duration *= (double)h->sps->vui.i_num_units_in_tick / h->sps->vui.i_time_scale;
+    duration *= timescale;
     uint64_t all_available_bits = h->param.rc.i_bitrate * 1000. * duration;
     double rate_factor, step_mult;
     double qblur = h->param.rc.f_qblur;
@@ -2583,21 +2582,23 @@ static int init_pass2( x264_t *h )
         for( int j = 1; j < cplxblur*2 && j < rcc->num_entries-i; j++ )
         {
             ratecontrol_entry_t *rcj = &rcc->entry[i+j];
+            double frame_duration = CLIP_DURATION(rcj->i_duration * timescale) / BASE_FRAME_DURATION;
             weight *= 1 - pow( (float)rcj->i_count / rcc->nmb, 2 );
             if( weight < .0001 )
                 break;
             gaussian_weight = weight * exp( -j*j/200.0 );
             weight_sum += gaussian_weight;
-            cplx_sum += gaussian_weight * (qscale2bits(rcj, 1) - rcj->misc_bits);
+            cplx_sum += gaussian_weight * (qscale2bits( rcj, 1 ) - rcj->misc_bits) / frame_duration;
         }
         /* weighted average of cplx of past frames */
         weight = 1.0;
         for( int j = 0; j <= cplxblur*2 && j <= i; j++ )
         {
             ratecontrol_entry_t *rcj = &rcc->entry[i-j];
+            double frame_duration = CLIP_DURATION(rcj->i_duration * timescale) / BASE_FRAME_DURATION;
             gaussian_weight = weight * exp( -j*j/200.0 );
             weight_sum += gaussian_weight;
-            cplx_sum += gaussian_weight * (qscale2bits( rcj, 1 ) - rcj->misc_bits);
+            cplx_sum += gaussian_weight * (qscale2bits( rcj, 1 ) - rcj->misc_bits) / frame_duration;
             weight *= 1 - pow( (float)rcj->i_count / rcc->nmb, 2 );
             if( weight < .0001 )
                 break;
diff --git a/encoder/ratecontrol.h b/encoder/ratecontrol.h
index 03c82cb0..28e6a3d7 100644
--- a/encoder/ratecontrol.h
+++ b/encoder/ratecontrol.h
@@ -27,6 +27,16 @@
 #ifndef X264_RATECONTROL_H
 #define X264_RATECONTROL_H
 
+/* Completely arbitrary.  Ratecontrol lowers relative quality at higher framerates
+ * and the reverse at lower framerates; this serves as the center of the curve. */
+#define BASE_FRAME_DURATION (0.04f)
+
+/* Arbitrary limitations as a sanity check. */
+#define MAX_FRAME_DURATION 1.00f
+#define MIN_FRAME_DURATION 0.01f
+
+#define CLIP_DURATION(f) x264_clip3f(f,MIN_FRAME_DURATION,MAX_FRAME_DURATION)
+
 int  x264_ratecontrol_new   ( x264_t * );
 void x264_ratecontrol_delete( x264_t * );
 
diff --git a/encoder/slicetype.c b/encoder/slicetype.c
index cc0193bb..97cf61e3 100644
--- a/encoder/slicetype.c
+++ b/encoder/slicetype.c
@@ -748,9 +748,10 @@ static int x264_slicetype_frame_cost_recalculate( x264_t *h, x264_frame_t **fram
     return i_score;
 }
 
-static void x264_macroblock_tree_finish( x264_t *h, x264_frame_t *frame, int ref0_distance )
+static void x264_macroblock_tree_finish( x264_t *h, x264_frame_t *frame, float average_duration, int ref0_distance )
 {
-    x264_emms();
+    int fps_factor_intra     = round( CLIP_DURATION(frame->f_duration) / BASE_FRAME_DURATION * 256 );
+    int fps_factor_propagate = round( CLIP_DURATION( average_duration) / BASE_FRAME_DURATION * 256 );
     float weightdelta = 0.0;
     if( ref0_distance && frame->f_weighted_cost_delta[ref0_distance-1] > 0 )
         weightdelta = (1.0 - frame->f_weighted_cost_delta[ref0_distance-1]);
@@ -760,17 +761,18 @@ static void x264_macroblock_tree_finish( x264_t *h, x264_frame_t *frame, int ref
     float strength = 5.0f * (1.0f - h->param.rc.f_qcompress);
     for( int mb_index = 0; mb_index < h->mb.i_mb_count; mb_index++ )
     {
-        int intra_cost = (frame->i_intra_cost[mb_index] * frame->i_inv_qscale_factor[mb_index]+128)>>8;
+        int intra_cost = (frame->i_intra_cost[mb_index] * frame->i_inv_qscale_factor[mb_index] + 128) >> 8;
+        int intra_cost_scaled = (intra_cost * fps_factor_intra + 128) >> 8;
         if( intra_cost )
         {
-            int propagate_cost = frame->i_propagate_cost[mb_index];
-            float log2_ratio = x264_log2(intra_cost + propagate_cost) - x264_log2(intra_cost) + weightdelta;
+            int propagate_cost = (frame->i_propagate_cost[mb_index] * fps_factor_propagate + 128) >> 8;
+            float log2_ratio = x264_log2(intra_cost_scaled + propagate_cost) - x264_log2(intra_cost) + weightdelta;
             frame->f_qp_offset[mb_index] = frame->f_qp_offset_aq[mb_index] - strength * log2_ratio;
         }
     }
 }
 
-static void x264_macroblock_tree_propagate( x264_t *h, x264_frame_t **frames, int p0, int p1, int b, int referenced )
+static void x264_macroblock_tree_propagate( x264_t *h, x264_frame_t **frames, float average_duration, int p0, int p1, int b, int referenced )
 {
     uint16_t *ref_costs[2] = {frames[p0]->i_propagate_cost,frames[p1]->i_propagate_cost};
     int dist_scale_factor = ( ((b-p0) << 8) + ((p1-p0) >> 1) ) / (p1-p0);
@@ -780,6 +782,9 @@ static void x264_macroblock_tree_propagate( x264_t *h, x264_frame_t **frames, in
     int *buf = h->scratch_buffer;
     uint16_t *propagate_cost = frames[b]->i_propagate_cost;
 
+    x264_emms();
+    float fps_factor = CLIP_DURATION(frames[b]->f_duration) / CLIP_DURATION(average_duration);
+
     /* For non-reffed frames the source costs are always zero, so just memset one row and re-use it. */
     if( !referenced )
         memset( frames[b]->i_propagate_cost, 0, h->mb.i_mb_width * sizeof(uint16_t) );
@@ -789,7 +794,7 @@ static void x264_macroblock_tree_propagate( x264_t *h, x264_frame_t **frames, in
         int mb_index = h->mb.i_mb_y*h->mb.i_mb_stride;
         h->mc.mbtree_propagate_cost( buf, propagate_cost,
             frames[b]->i_intra_cost+mb_index, frames[b]->lowres_costs[b-p0][p1-b]+mb_index,
-            frames[b]->i_inv_qscale_factor+mb_index, h->mb.i_mb_width );
+            frames[b]->i_inv_qscale_factor+mb_index, &fps_factor, h->mb.i_mb_width );
         if( referenced )
             propagate_cost += h->mb.i_mb_width;
         for( h->mb.i_mb_x = 0; h->mb.i_mb_x < h->mb.i_mb_width; h->mb.i_mb_x++, mb_index++ )
@@ -858,7 +863,7 @@ static void x264_macroblock_tree_propagate( x264_t *h, x264_frame_t **frames, in
     }
 
     if( h->param.rc.i_vbv_buffer_size && h->param.rc.i_lookahead && referenced )
-        x264_macroblock_tree_finish( h, frames[b], b == p1 ? b - p0 : 0 );
+        x264_macroblock_tree_finish( h, frames[b], average_duration, b == p1 ? b - p0 : 0 );
 }
 
 static void x264_macroblock_tree( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, int num_frames, int b_intra )
@@ -866,6 +871,13 @@ static void x264_macroblock_tree( x264_t *h, x264_mb_analysis_t *a, x264_frame_t
     int idx = !b_intra;
     int last_nonb, cur_nonb = 1;
     int bframes = 0;
+
+    x264_emms();
+    float total_duration = 0.0;
+    for( int j = 0; j <= num_frames; j++ )
+        total_duration += frames[j]->f_duration;
+    float average_duration = total_duration / (num_frames + 1);
+
     int i = num_frames;
 
     if( b_intra )
@@ -918,34 +930,34 @@ static void x264_macroblock_tree( x264_t *h, x264_mb_analysis_t *a, x264_frame_t
                 if( i != middle )
                 {
                     x264_slicetype_frame_cost( h, a, frames, p0, p1, i, 0 );
-                    x264_macroblock_tree_propagate( h, frames, p0, p1, i, 0 );
+                    x264_macroblock_tree_propagate( h, frames, average_duration, p0, p1, i, 0 );
                 }
                 i--;
             }
-            x264_macroblock_tree_propagate( h, frames, cur_nonb, last_nonb, middle, 1 );
+            x264_macroblock_tree_propagate( h, frames, average_duration, cur_nonb, last_nonb, middle, 1 );
         }
         else
         {
             while( i > cur_nonb )
             {
                 x264_slicetype_frame_cost( h, a, frames, cur_nonb, last_nonb, i, 0 );
-                x264_macroblock_tree_propagate( h, frames, cur_nonb, last_nonb, i, 0 );
+                x264_macroblock_tree_propagate( h, frames, average_duration, cur_nonb, last_nonb, i, 0 );
                 i--;
             }
         }
-        x264_macroblock_tree_propagate( h, frames, cur_nonb, last_nonb, last_nonb, 1 );
+        x264_macroblock_tree_propagate( h, frames, average_duration, cur_nonb, last_nonb, last_nonb, 1 );
         last_nonb = cur_nonb;
     }
 
     if( !h->param.rc.i_lookahead )
     {
-        x264_macroblock_tree_propagate( h, frames, 0, last_nonb, last_nonb, 1 );
+        x264_macroblock_tree_propagate( h, frames, average_duration, 0, last_nonb, last_nonb, 1 );
         XCHG( uint16_t*, frames[last_nonb]->i_propagate_cost, frames[0]->i_propagate_cost );
     }
 
-    x264_macroblock_tree_finish( h, frames[last_nonb], last_nonb );
+    x264_macroblock_tree_finish( h, frames[last_nonb], average_duration, last_nonb );
     if( h->param.i_bframe_pyramid && bframes > 1 && !h->param.rc.i_vbv_buffer_size )
-        x264_macroblock_tree_finish( h, frames[last_nonb+(bframes+1)/2], 0 );
+        x264_macroblock_tree_finish( h, frames[last_nonb+(bframes+1)/2], average_duration, 0 );
 }
 
 static int x264_vbv_frame_cost( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, int p0, int p1, int b )
diff --git a/tools/checkasm.c b/tools/checkasm.c
index c552ab9b..7c7faa72 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -1236,29 +1236,34 @@ static int check_mc( int cpu_ref, int cpu_new )
 
     if( mc_a.mbtree_propagate_cost != mc_ref.mbtree_propagate_cost )
     {
-        ok = 1; used_asm = 1;
-        set_func_name( "mbtree_propagate" );
-        int *dsta = (int*)buf3;
-        int *dstc = dsta+400;
-        uint16_t *prop = (uint16_t*)buf1;
-        uint16_t *intra = (uint16_t*)buf4;
-        uint16_t *inter = intra+400;
-        uint16_t *qscale = inter+400;
-        uint16_t *rnd = (uint16_t*)buf2;
         x264_emms();
-        for( int i = 0; i < 400; i++ )
+        for( int i = 0; i < 10; i++ )
         {
-            intra[i]  = *rnd++ & 0x7fff;
-            intra[i] += !intra[i];
-            inter[i]  = *rnd++ & 0x7fff;
-            qscale[i] = *rnd++ & 0x7fff;
+            float fps_factor = (rand()&65535) / 256.;
+            ok = 1; used_asm = 1;
+            set_func_name( "mbtree_propagate" );
+            int *dsta = (int*)buf3;
+            int *dstc = dsta+400;
+            uint16_t *prop = (uint16_t*)buf1;
+            uint16_t *intra = (uint16_t*)buf4;
+            uint16_t *inter = intra+100;
+            uint16_t *qscale = inter+100;
+            uint16_t *rnd = (uint16_t*)buf2;
+            x264_emms();
+            for( int j = 0; j < 100; j++ )
+            {
+                intra[j]  = *rnd++ & 0x7fff;
+                intra[j] += !intra[j];
+                inter[j]  = *rnd++ & 0x7fff;
+                qscale[j] = *rnd++ & 0x7fff;
+            }
+            call_c( mc_c.mbtree_propagate_cost, dstc, prop, intra, inter, qscale, &fps_factor, 100 );
+            call_a( mc_a.mbtree_propagate_cost, dsta, prop, intra, inter, qscale, &fps_factor, 100 );
+            // I don't care about exact rounding, this is just how close the floating-point implementation happens to be
+            x264_emms();
+            for( int j = 0; j < 100; j++ )
+                ok &= abs( dstc[j]-dsta[j] ) <= 1 || fabs( (double)dstc[j]/dsta[j]-1 ) < 1e-4;
         }
-        call_c( mc_c.mbtree_propagate_cost, dstc, prop, intra, inter, qscale, 400 );
-        call_a( mc_a.mbtree_propagate_cost, dsta, prop, intra, inter, qscale, 400 );
-        // I don't care about exact rounding, this is just how close the floating-point implementation happens to be
-        x264_emms();
-        for( int i = 0; i < 400; i++ )
-            ok &= abs( dstc[i]-dsta[i] ) <= 1 || fabs( (double)dstc[i]/dsta[i]-1 ) < 1e-6;
         report( "mbtree propagate :" );
     }
 
-- 
2.40.0