From 6abf5d67010f8c3889f3184769e09f12fbe473c2 Mon Sep 17 00:00:00 2001
From: Loren Merritt <pengvado@akuvian.org>
Date: Wed, 10 Dec 2008 20:54:17 -0800
Subject: [PATCH] use lookup tables instead of actual exp/pow for AQ
 Significant speed boost, especially on CPUs with atrociously slow floating
 point units (e.g. Pentium 4 saves 800 clocks per MB with this change). Add
 x264_clz function as part of the LUT system: this may be useful later. Note
 this changes output somewhat as the numbers from the lookup table are not
 exact.

---
 common/osdep.h        | 16 ++++++++++++
 encoder/ratecontrol.c | 59 +++++++++++++++++++++++++++++++++++--------
 2 files changed, 64 insertions(+), 11 deletions(-)
diff --git a/common/osdep.h b/common/osdep.h
index 416a76ee..25bb1380 100644
--- a/common/osdep.h
+++ b/common/osdep.h
@@ -169,4 +169,20 @@ static ALWAYS_INLINE intptr_t endian_fix( intptr_t x )
 }
 #endif
 
+#ifdef __GNUC__
+#define x264_clz(x) __builtin_clz(x)
+#else
+static int ALWAYS_INLINE x264_clz( uint32_t x )
+{
+    static uint8_t lut[16] = {4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0};
+    int y, z = ((x - 0x10000) >> 27) & 16;
+    x >>= z^16;
+    z += y = ((x - 0x100) >> 28) & 8;
+    x >>= y^8;
+    z += y = ((x - 0x10) >> 29) & 4;
+    x >>= y^4;
+    return z + lut[x];
+}
+#endif
+
 #endif /* X264_OSDEP_H */
diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c
index bb3797d8..c25495d8 100644
--- a/encoder/ratecontrol.c
+++ b/encoder/ratecontrol.c
@@ -191,21 +191,62 @@ static NOINLINE int ac_energy_mb( x264_t *h, int mb_x, int mb_y, x264_frame_t *f
     return var;
 }
 
+static const float log2_lut[128] = {
+    0.00000, 0.01123, 0.02237, 0.03342, 0.04439, 0.05528, 0.06609, 0.07682,
+    0.08746, 0.09803, 0.10852, 0.11894, 0.12928, 0.13955, 0.14975, 0.15987,
+    0.16993, 0.17991, 0.18982, 0.19967, 0.20945, 0.21917, 0.22882, 0.23840,
+    0.24793, 0.25739, 0.26679, 0.27612, 0.28540, 0.29462, 0.30378, 0.31288,
+    0.32193, 0.33092, 0.33985, 0.34873, 0.35755, 0.36632, 0.37504, 0.38370,
+    0.39232, 0.40088, 0.40939, 0.41785, 0.42626, 0.43463, 0.44294, 0.45121,
+    0.45943, 0.46761, 0.47573, 0.48382, 0.49185, 0.49985, 0.50779, 0.51570,
+    0.52356, 0.53138, 0.53916, 0.54689, 0.55459, 0.56224, 0.56986, 0.57743,
+    0.58496, 0.59246, 0.59991, 0.60733, 0.61471, 0.62205, 0.62936, 0.63662,
+    0.64386, 0.65105, 0.65821, 0.66534, 0.67243, 0.67948, 0.68650, 0.69349,
+    0.70044, 0.70736, 0.71425, 0.72110, 0.72792, 0.73471, 0.74147, 0.74819,
+    0.75489, 0.76155, 0.76818, 0.77479, 0.78136, 0.78790, 0.79442, 0.80090,
+    0.80735, 0.81378, 0.82018, 0.82655, 0.83289, 0.83920, 0.84549, 0.85175,
+    0.85798, 0.86419, 0.87036, 0.87652, 0.88264, 0.88874, 0.89482, 0.90087,
+    0.90689, 0.91289, 0.91886, 0.92481, 0.93074, 0.93664, 0.94251, 0.94837,
+    0.95420, 0.96000, 0.96578, 0.97154, 0.97728, 0.98299, 0.98868, 0.99435,
+};
+
+static const uint8_t exp2_lut[64] = {
+      1,   4,   7,  10,  13,  16,  19,  22,  25,  28,  31,  34,  37,  40,  44,  47,
+     50,  53,  57,  60,  64,  67,  71,  74,  78,  81,  85,  89,  93,  96, 100, 104,
+    108, 112, 116, 120, 124, 128, 132, 137, 141, 145, 150, 154, 159, 163, 168, 172,
+    177, 182, 186, 191, 196, 201, 206, 211, 216, 221, 226, 232, 237, 242, 248, 253,
+};
+
+static int x264_exp2fix8( float x )
+{
+    int i, f;
+    x += 8;
+    if( x <= 0 ) return 0;
+    if( x >= 16 ) return 0xffff;
+    i = x;
+    f = (x-i)*64;
+    return (exp2_lut[f]+256) << i >> 8;
+}
+
 void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame )
 {
+    /* constants chosen to result in approximately the same overall bitrate as without AQ.
+     * FIXME: while they're written in 5 significant digits, they're only tuned to 2. */
+    float strength = h->param.rc.f_aq_strength * 1.0397;
     int mb_x, mb_y;
-    for( mb_y=0; mb_y<h->sps->i_mb_height; mb_y++ )
-        for( mb_x=0; mb_x<h->sps->i_mb_width; mb_x++ )
+    for( mb_y = 0; mb_y < h->sps->i_mb_height; mb_y++ )
+        for( mb_x = 0; mb_x < h->sps->i_mb_width; mb_x++ )
         {
-            int energy = ac_energy_mb( h, mb_x, mb_y, frame );
-            /* 10 constant chosen to result in approximately the same overall bitrate as without AQ. */
-            float qp_adj = h->param.rc.f_aq_strength * 1.5 * (logf(energy) - 10.0);
+            uint32_t energy = ac_energy_mb( h, mb_x, mb_y, frame );
+            int lz = x264_clz( energy );
+            float qp_adj = strength * (log2_lut[(energy<<lz>>24)&0x7f] - lz + 16.573f);
             frame->f_qp_offset[mb_x + mb_y*h->mb.i_mb_stride] = qp_adj;
             if( h->frames.b_have_lowres )
-                frame->i_inv_qscale_factor[mb_x+mb_y*h->mb.i_mb_stride] = FIX8(pow(2.0,-qp_adj/6.0));
+                frame->i_inv_qscale_factor[mb_x + mb_y*h->mb.i_mb_stride] = x264_exp2fix8(qp_adj*(-1.f/6.f));
         }
 }
 
+
 /*****************************************************************************
 * x264_adaptive_quant:
  * adjust macroblock QP based on variance (AC energy) of the MB.
@@ -215,16 +256,12 @@ void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame )
 *****************************************************************************/
 void x264_adaptive_quant( x264_t *h )
 {
-    float qp, qp_adj;
     x264_emms();
-    qp = h->rc->f_qpm;
-    qp_adj = h->fenc->f_qp_offset[h->mb.i_mb_x + h->mb.i_mb_y*h->mb.i_mb_stride];
-    h->mb.i_qp = x264_clip3( qp + qp_adj + .5, h->param.rc.i_qp_min, h->param.rc.i_qp_max );
+    h->mb.i_qp = x264_clip3( h->rc->f_qpm + h->fenc->f_qp_offset[h->mb.i_mb_xy] + .5, h->param.rc.i_qp_min, h->param.rc.i_qp_max );
     /* If the QP of this MB is within 1 of the previous MB, code the same QP as the previous MB,
      * to lower the bit cost of the qp_delta. */
     if( abs(h->mb.i_qp - h->mb.i_last_qp) == 1 )
         h->mb.i_qp = h->mb.i_last_qp;
-    h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
 }
 
 int x264_ratecontrol_new( x264_t *h )
-- 
2.40.0