From 80458ffcd62f0852e7092176b7b155bdfd3d5a82 Mon Sep 17 00:00:00 2001
From: Fiona Glaser <fiona@x264.com>
Date: Sat, 13 Sep 2008 14:03:12 -0700
Subject: [PATCH] Move adaptive quantization to before ratecontrol, eliminate
 qcomp bias This change improves VBV accuracy and improves bit distribution in
 CRF and 2pass. Instead of being applied after ratecontrol, AQ becomes part of
 the complexity measure that ratecontrol uses. This allows for modularity for
 changes to AQ; a new AQ algorithm can be introduced simply by introducing a
 new aq_mode and a corresponding if in adaptive_quant_frame. This also allows
 quantizer field smoothing, since quantizers are calculated beforehand rather
 than during encoding. Since there is no more reason for it, aq_mode 1 is removed. 
 The new mode 1 is in a sense a merger of the old modes 1 and 2. WARNING: This
 change redefines CRF when using AQ, so output bitrate for a given CRF may be
 significantly different from before this change!

---
 common/common.c       |  2 +-
 common/frame.c        |  3 ++
 common/frame.h        |  2 ++
 encoder/encoder.c     |  8 ++---
 encoder/ratecontrol.c | 84 +++++++++++--------------------------------
 encoder/ratecontrol.h |  3 +-
 encoder/slicetype.c   | 47 ++++++++++++++++++------
 x264.c                |  5 ++-
 x264.h                |  5 ++-
 9 files changed, 73 insertions(+), 86 deletions(-)

diff --git a/common/common.c b/common/common.c
index 96b6f2bd..c25dea7c 100644
--- a/common/common.c
+++ b/common/common.c
@@ -93,7 +93,7 @@ void    x264_param_default( x264_param_t *param )
     param->rc.i_qp_step = 4;
     param->rc.f_ip_factor = 1.4;
     param->rc.f_pb_factor = 1.3;
-    param->rc.i_aq_mode = X264_AQ_GLOBAL;
+    param->rc.i_aq_mode = X264_AQ_VARIANCE;
     param->rc.f_aq_strength = 1.0;
 
     param->rc.b_stat_write = 0;
diff --git a/common/frame.c b/common/frame.c
index bba0da9b..c824a76d 100644
--- a/common/frame.c
+++ b/common/frame.c
@@ -114,6 +114,9 @@ x264_frame_t *x264_frame_new( x264_t *h )
         for( j = 0; j < h->param.i_bframe + 2; j++ )
             CHECKED_MALLOC( frame->i_row_satds[i][j], i_lines/16 * sizeof(int) );
 
+    if( h->param.rc.i_aq_mode )
+        CHECKED_MALLOC( frame->f_qp_offset, h->mb.i_mb_count * sizeof(float) );
+
     x264_pthread_mutex_init( &frame->mutex, NULL );
     x264_pthread_cond_init( &frame->cv, NULL );
 
diff --git a/common/frame.h b/common/frame.h
index 523689fc..6e96da65 100644
--- a/common/frame.h
+++ b/common/frame.h
@@ -71,12 +71,14 @@ typedef struct
      * contains the SATD cost of the lowres frame encoded in various modes
      * FIXME: how big an array do we need? */
     int     i_cost_est[X264_BFRAME_MAX+2][X264_BFRAME_MAX+2];
+    int     i_cost_est_aq[X264_BFRAME_MAX+2][X264_BFRAME_MAX+2];
     int     i_satd; // the i_cost_est of the selected frametype
     int     i_intra_mbs[X264_BFRAME_MAX+2];
     int     *i_row_satds[X264_BFRAME_MAX+2][X264_BFRAME_MAX+2];
     int     *i_row_satd;
     int     *i_row_bits;
     int     *i_row_qp;
+    float   *f_qp_offset;
 
     /* threading */
     int     i_lines_completed; /* in pixels */
diff --git a/encoder/encoder.c b/encoder/encoder.c
index 92bc699c..4141d095 100644
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -487,12 +487,9 @@ static int x264_validate_parameters( x264_t *h )
     if( !h->param.b_cabac )
         h->param.analyse.i_trellis = 0;
     h->param.analyse.i_trellis = x264_clip3( h->param.analyse.i_trellis, 0, 2 );
-    h->param.rc.i_aq_mode = x264_clip3( h->param.rc.i_aq_mode, 0, 2 );
+    h->param.rc.i_aq_mode = x264_clip3( h->param.rc.i_aq_mode, 0, 1 );
     if( h->param.rc.f_aq_strength <= 0 )
         h->param.rc.i_aq_mode = 0;
-    /* VAQ effectively replaces qcomp, so qcomp is raised towards 1 to compensate. */
-    if( h->param.rc.i_aq_mode == X264_AQ_GLOBAL )
-        h->param.rc.f_qcompress = x264_clip3f(h->param.rc.f_qcompress + h->param.rc.f_aq_strength / 0.7, 0, 1);
     h->param.analyse.i_noise_reduction = x264_clip3( h->param.analyse.i_noise_reduction, 0, 1<<16 );
 
     {
@@ -1362,6 +1359,9 @@ int     x264_encoder_encode( x264_t *h,
         if( h->frames.b_have_lowres )
             x264_frame_init_lowres( h, fenc );
 
+        if( h->param.rc.i_aq_mode )
+            x264_adaptive_quant_frame( h, fenc );
+
         if( h->frames.i_input <= h->frames.i_delay + 1 - h->param.i_threads )
         {
             /* Nothing yet to encode */
diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c
index 4c425e35..555404f9 100644
--- a/encoder/ratecontrol.c
+++ b/encoder/ratecontrol.c
@@ -127,10 +127,6 @@ struct x264_ratecontrol_t
     int bframes;                /* # consecutive B-frames before this P-frame */
     int bframe_bits;            /* total cost of those frames */
 
-    /* AQ stuff */
-    float aq_threshold;
-    int *ac_energy;
-
     int i_zones;
     x264_zone_t *zones;
     x264_zone_t *prev_zone;
@@ -172,64 +168,40 @@ static inline double qscale2bits(ratecontrol_entry_t *rce, double qscale)
 }
 
 // Find the total AC energy of the block in all planes.
-static NOINLINE int ac_energy_mb( x264_t *h, int mb_x, int mb_y, int *satd )
+static NOINLINE int ac_energy_mb( x264_t *h, int mb_x, int mb_y, x264_frame_t *frame )
 {
     /* This function contains annoying hacks because GCC has a habit of reordering emms
      * and putting it after floating point ops.  As a result, we put the emms at the end of the
      * function and make sure that its always called before the float math.  Noinline makes
      * sure no reordering goes on. */
-    /* FIXME: This array is larger than necessary because a bug in GCC causes an all-zero
-    * array to be placed in .bss despite .bss not being correctly aligned on some platforms (win32?) */
-    DECLARE_ALIGNED_16( static uint8_t zero[17] ) = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1};
     unsigned int var=0, sad, i;
-    if( satd || h->param.rc.i_aq_mode == X264_AQ_GLOBAL )
+    for( i=0; i<3; i++ )
     {
-        for( i=0; i<3; i++ )
-        {
-            int w = i ? 8 : 16;
-            int stride = h->fenc->i_stride[i];
-            int offset = h->mb.b_interlaced
-                ? w * (mb_x + (mb_y&~1) * stride) + (mb_y&1) * stride
-                : w * (mb_x + mb_y * stride);
-            int pix = i ? PIXEL_8x8 : PIXEL_16x16;
-            stride <<= h->mb.b_interlaced;
-            var += h->pixf.var[pix]( h->fenc->plane[i]+offset, stride, &sad );
-            // SATD to represent the block's overall complexity (bit cost) for intra encoding.
-            // exclude the DC coef, because nothing short of an actual intra prediction will estimate DC cost.
-            if( var && satd )
-                *satd += h->pixf.satd[pix]( zero, 0, h->fenc->plane[i]+offset, stride ) - sad/2;
-        }
-        var = X264_MAX(var,1);
+        int w = i ? 8 : 16;
+        int stride = frame->i_stride[i];
+        int offset = h->mb.b_interlaced
+            ? w * (mb_x + (mb_y&~1) * stride) + (mb_y&1) * stride
+            : w * (mb_x + mb_y * stride);
+        int pix = i ? PIXEL_8x8 : PIXEL_16x16;
+        stride <<= h->mb.b_interlaced;
+        var += h->pixf.var[pix]( frame->plane[i]+offset, stride, &sad );
     }
-    else var = h->rc->ac_energy[h->mb.i_mb_xy];
+    var = X264_MAX(var,1);
     x264_emms();
     return var;
 }
 
-static void x264_autosense_aq( x264_t *h )
+void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame )
 {
-    double total = 0;
-    double n = 0;
     int mb_x, mb_y;
-    // FIXME: Some of the SATDs might be already calculated elsewhere (ratecontrol?). Can we reuse them?
-    // FIXME: Is chroma SATD necessary?
     for( mb_y=0; mb_y<h->sps->i_mb_height; mb_y++ )
         for( mb_x=0; mb_x<h->sps->i_mb_width; mb_x++ )
         {
-            int satd=0;
-            int energy = ac_energy_mb( h, mb_x, mb_y, &satd );
-            h->rc->ac_energy[mb_x + mb_y * h->sps->i_mb_width] = energy;
-            /* Weight the energy value by the SATD value of the MB.
-             * This represents the fact that the more complex blocks in a frame should
-             * be weighted more when calculating the optimal threshold. This also helps
-             * diminish the negative effect of large numbers of simple blocks in a frame,
-             * such as in the case of a letterboxed film. */
-            total += logf(energy) * satd;
-            n += satd;
+            int energy = ac_energy_mb( h, mb_x, mb_y, frame );
+            /* 10 constant chosen to result in approximately the same overall bitrate as without AQ. */
+            float qp_adj = h->param.rc.f_aq_strength * 1.5 * (logf(energy) - 10.0);
+            frame->f_qp_offset[mb_x + mb_y*h->mb.i_mb_stride] = qp_adj;
         }
-    x264_emms();
-    /* Calculate and store the threshold. */
-    h->rc->aq_threshold = n ? total/n : 15;
 }
 
 /*****************************************************************************
@@ -241,13 +213,11 @@ static void x264_autosense_aq( x264_t *h )
 *****************************************************************************/
 void x264_adaptive_quant( x264_t *h )
 {
-    int energy = ac_energy_mb( h, h->mb.i_mb_x, h->mb.i_mb_y, NULL );
-    /* Adjust the QP based on the AC energy of the macroblock. */
-    float qp = h->rc->f_qpm;
-    float qp_adj = 1.5 * (logf(energy) - h->rc->aq_threshold);
-    if( h->param.rc.i_aq_mode == X264_AQ_LOCAL )
-        qp_adj = x264_clip3f( qp_adj, -5, 5 );
-    h->mb.i_qp = x264_clip3( qp + qp_adj * h->param.rc.f_aq_strength + .5, h->param.rc.i_qp_min, h->param.rc.i_qp_max );
+    float qp, qp_adj;
+    x264_emms();
+    qp = h->rc->f_qpm;
+    qp_adj = h->fenc->f_qp_offset[h->mb.i_mb_x + h->mb.i_mb_y*h->mb.i_mb_stride];
+    h->mb.i_qp = x264_clip3( qp + qp_adj + .5, h->param.rc.i_qp_min, h->param.rc.i_qp_max );
     /* If the QP of this MB is within 1 of the previous MB, code the same QP as the previous MB,
      * to lower the bit cost of the qp_delta. */
     if( abs(h->mb.i_qp - h->mb.i_last_qp) == 1 )
@@ -554,8 +524,6 @@ int x264_ratecontrol_new( x264_t *h )
         h->thread[i]->rc = rc+i;
         if( i )
             rc[i] = rc[0];
-        if( h->param.rc.i_aq_mode == X264_AQ_LOCAL )
-            rc[i].ac_energy = x264_malloc( h->mb.i_mb_count * sizeof(int) );
     }
 
     return 0;
@@ -717,8 +685,6 @@ void x264_ratecontrol_delete( x264_t *h )
                     x264_free( rc->zones[i].param );
         x264_free( rc->zones );
     }
-    for( i=0; i<h->param.i_threads; i++ )
-        x264_free( rc[i].ac_energy );
     x264_free( rc );
 }
 
@@ -842,14 +808,6 @@ void x264_ratecontrol_start( x264_t *h, int i_force_qp )
 
     if( h->sh.i_type != SLICE_TYPE_B )
         rc->last_non_b_pict_type = h->sh.i_type;
-
-    /* Adaptive AQ thresholding algorithm. */
-    if( h->param.rc.i_aq_mode == X264_AQ_GLOBAL )
-        /* Arbitrary value for "center" of the AQ curve.
-         * Chosen so that any given value of CRF has on average similar bitrate with and without AQ. */
-        h->rc->aq_threshold = logf(5000);
-    else if( h->param.rc.i_aq_mode == X264_AQ_LOCAL )
-        x264_autosense_aq(h);
 }
 
 static double predict_row_size( x264_t *h, int y, int qp )
diff --git a/encoder/ratecontrol.h b/encoder/ratecontrol.h
index d5e3371a..3310d3c2 100644
--- a/encoder/ratecontrol.h
+++ b/encoder/ratecontrol.h
@@ -27,6 +27,8 @@
 int  x264_ratecontrol_new   ( x264_t * );
 void x264_ratecontrol_delete( x264_t * );
 
+void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame );
+void x264_adaptive_quant( x264_t * );
 void x264_thread_sync_ratecontrol( x264_t *cur, x264_t *prev, x264_t *next );
 void x264_ratecontrol_start( x264_t *, int i_force_qp );
 int  x264_ratecontrol_slice_type( x264_t *, int i_frame );
@@ -34,7 +36,6 @@ void x264_ratecontrol_mb( x264_t *, int bits );
 int  x264_ratecontrol_qp( x264_t * );
 void x264_ratecontrol_end( x264_t *, int bits );
 void x264_ratecontrol_summary( x264_t * );
-void x264_adaptive_quant( x264_t * );
 void x264_ratecontrol_set_estimated_size( x264_t *, int bits );
 int  x264_ratecontrol_get_estimated_size( x264_t const *);
 int  x264_rc_analyse_slice( x264_t *h );
diff --git a/encoder/slicetype.c b/encoder/slicetype.c
index e1b42f35..e4585e84 100644
--- a/encoder/slicetype.c
+++ b/encoder/slicetype.c
@@ -248,6 +248,8 @@ static int x264_slicetype_frame_cost( x264_t *h, x264_mb_analysis_t *a,
                                int b_intra_penalty )
 {
     int i_score = 0;
+    /* Don't use the AQ'd scores for slicetype decision. */
+    int i_score_aq = 0;
 
     /* Check whether we already evaluated this frame
      * If we have tried this frame as P, then we have also tried
@@ -276,9 +278,27 @@ static int x264_slicetype_frame_cost( x264_t *h, x264_mb_analysis_t *a,
         if( p1 != p0 )
             dist_scale_factor = ( ((b-p0) << 8) + ((p1-p0) >> 1) ) / (p1-p0);
 
+        /* Frames without interior MBs: score every MB, and accumulate the
+         * AQ-weighted total as well so ratecontrol gets a valid cost. */
+        if( h->sps->i_mb_width <= 2 || h->sps->i_mb_height <= 2 )
+        {
+            for( h->mb.i_mb_y = 0; h->mb.i_mb_y < h->sps->i_mb_height; h->mb.i_mb_y++ )
+                for( h->mb.i_mb_x = 0; h->mb.i_mb_x < h->sps->i_mb_width; h->mb.i_mb_x++ )
+                {
+                    int i_mb_cost = x264_slicetype_mb_cost( h, a, frames, p0, p1, b, dist_scale_factor );
+                    int i_mb_cost_aq = i_mb_cost;
+                    if( h->param.rc.i_aq_mode )
+                    {
+                        x264_emms();
+                        i_mb_cost_aq *= pow(2.0,-(frames[b]->f_qp_offset[h->mb.i_mb_x + h->mb.i_mb_y*h->mb.i_mb_stride])/6.0);
+                    }
+                    i_score += i_mb_cost;
+                    i_score_aq += i_mb_cost_aq;
+                }
+        }
         /* the edge mbs seem to reduce the predictive quality of the
          * whole frame's score, but are needed for a spatial distribution. */
-        if( h->param.rc.i_vbv_buffer_size )
+        else if( h->param.rc.i_vbv_buffer_size )
         {
             for( h->mb.i_mb_y = 0; h->mb.i_mb_y < h->sps->i_mb_height; h->mb.i_mb_y++ )
             {
@@ -286,33 +306,45 @@ static int x264_slicetype_frame_cost( x264_t *h, x264_mb_analysis_t *a,
                 for( h->mb.i_mb_x = 0; h->mb.i_mb_x < h->sps->i_mb_width; h->mb.i_mb_x++ )
                 {
                     int i_mb_cost = x264_slicetype_mb_cost( h, a, frames, p0, p1, b, dist_scale_factor );
-                    row_satd[ h->mb.i_mb_y ] += i_mb_cost;
+                    int i_mb_cost_aq = i_mb_cost;
+                    if( h->param.rc.i_aq_mode )
+                    {
+                        x264_emms();
+                        i_mb_cost_aq *= pow(2.0,-(frames[b]->f_qp_offset[h->mb.i_mb_x + h->mb.i_mb_y*h->mb.i_mb_stride])/6.0);
+                    }
+                    row_satd[ h->mb.i_mb_y ] += i_mb_cost_aq;
                     if( h->mb.i_mb_y > 0 && h->mb.i_mb_y < h->sps->i_mb_height - 1 &&
                         h->mb.i_mb_x > 0 && h->mb.i_mb_x < h->sps->i_mb_width - 1 )
                     {
+                        /* Don't use AQ-weighted costs for slicetype decision, only for ratecontrol. */
                         i_score += i_mb_cost;
+                        i_score_aq += i_mb_cost_aq;
                     }
                 }
             }
         }
-        else if( h->sps->i_mb_width > 2 && h->sps->i_mb_height > 2 )
+        else
         {
             for( h->mb.i_mb_y = 1; h->mb.i_mb_y < h->sps->i_mb_height - 1; h->mb.i_mb_y++ )
                 for( h->mb.i_mb_x = 1; h->mb.i_mb_x < h->sps->i_mb_width - 1; h->mb.i_mb_x++ )
-                    i_score += x264_slicetype_mb_cost( h, a, frames, p0, p1, b, dist_scale_factor );
-        }
-        else
-        {
-            for( h->mb.i_mb_y = 0; h->mb.i_mb_y < h->sps->i_mb_height; h->mb.i_mb_y++ )
-                for( h->mb.i_mb_x = 0; h->mb.i_mb_x < h->sps->i_mb_width; h->mb.i_mb_x++ )
-                    i_score += x264_slicetype_mb_cost( h, a, frames, p0, p1, b, dist_scale_factor );
+                {
+                    int i_mb_cost = x264_slicetype_mb_cost( h, a, frames, p0, p1, b, dist_scale_factor );
+                    int i_mb_cost_aq = i_mb_cost;
+                    if( h->param.rc.i_aq_mode )
+                    {
+                        x264_emms();
+                        i_mb_cost_aq *= pow(2.0,-(frames[b]->f_qp_offset[h->mb.i_mb_x + h->mb.i_mb_y*h->mb.i_mb_stride])/6.0);
+                    }
+                    i_score += i_mb_cost;
+                    i_score_aq += i_mb_cost_aq;
+                }
         }
 
-
         if( b != p1 )
             i_score = i_score * 100 / (120 + h->param.i_bframe_bias);
 
         frames[b]->i_cost_est[b-p0][p1-b] = i_score;
+        frames[b]->i_cost_est_aq[b-p0][p1-b] = i_score_aq;
 //      fprintf( stderr, "frm %d %c(%d,%d): %6d %6d imb:%d  \n", frames[b]->i_frame,
 //               (p1==0?'I':b<p1?'B':'P'), b-p0, p1-b, i_score, frames[b]->i_cost_est[0][0], frames[b]->i_intra_mbs[b-p0] );
         x264_emms();
@@ -538,6 +570,11 @@ int x264_rc_analyse_slice( x264_t *h )
     frames[b] = h->fenc;
 
     cost = x264_slicetype_frame_cost( h, &a, frames, p0, p1, b, 0 );
+
+    /* With AQ enabled, ratecontrol uses the AQ-weighted frame cost instead. */
+    if( h->param.rc.i_aq_mode )
+        cost = frames[b]->i_cost_est_aq[b-p0][p1-b];
+
     h->fenc->i_row_satd = h->fenc->i_row_satds[b-p0][p1-b];
     h->fdec->i_row_satd = h->fdec->i_row_satds[b-p0][p1-b];
     h->fdec->i_satd = cost;
diff --git a/x264.c b/x264.c
index f472fd78..e57c297a 100644
--- a/x264.c
+++ b/x264.c
@@ -194,10 +194,9 @@ static void Help( x264_param_t *defaults, int b_longhelp )
     H0( "      --ipratio <float>       QP factor between I and P [%.2f]\n", defaults->rc.f_ip_factor );
     H0( "      --pbratio <float>       QP factor between P and B [%.2f]\n", defaults->rc.f_pb_factor );
     H1( "      --chroma-qp-offset <integer>  QP difference between chroma and luma [%d]\n", defaults->analyse.i_chroma_qp_offset );
-    H0( "      --aq-mode <integer>     How AQ distributes bits [%d]\n"
+    H1( "      --aq-mode <integer>     AQ method [%d]\n"
         "                                  - 0: Disabled\n"
-        "                                  - 1: Avoid moving bits between frames\n"
-        "                                  - 2: Move bits between frames\n", defaults->rc.i_aq_mode );
+        "                                  - 1: Variance AQ (complexity mask)\n", defaults->rc.i_aq_mode );
     H0( "      --aq-strength <float>   Reduces blocking and blurring in flat and\n"
         "                              textured areas. [%.1f]\n"
         "                                  - 0.5: weak AQ\n"
diff --git a/x264.h b/x264.h
index 538e9a74..c16e597b 100644
--- a/x264.h
+++ b/x264.h
@@ -35,7 +35,7 @@
 
 #include <stdarg.h>
 
-#define X264_BUILD 61
+#define X264_BUILD 62
 
 /* x264_t:
  *      opaque handler for encoder */
@@ -85,8 +85,7 @@ typedef struct x264_t x264_t;
 #define X264_RC_CRF                  1
 #define X264_RC_ABR                  2
 #define X264_AQ_NONE                 0
-#define X264_AQ_LOCAL                1
-#define X264_AQ_GLOBAL               2
+#define X264_AQ_VARIANCE             1
 
 static const char * const x264_direct_pred_names[] = { "none", "spatial", "temporal", "auto", 0 };
 static const char * const x264_motion_est_names[] = { "dia", "hex", "umh", "esa", "tesa", 0 };
-- 
2.40.0