From 6abf5d67010f8c3889f3184769e09f12fbe473c2 Mon Sep 17 00:00:00 2001 From: Loren Merritt Date: Wed, 10 Dec 2008 20:54:17 -0800 Subject: [PATCH] use lookup tables instead of actual exp/pow for AQ Significant speed boost, especially on CPUs with atrociously slow floating point units (e.g. Pentium 4 saves 800 clocks per MB with this change). Add x264_clz function as part of the LUT system: this may be useful later. Note this changes output somewhat as the numbers from the lookup table are not exact. --- common/osdep.h | 16 ++++++++++++ encoder/ratecontrol.c | 59 +++++++++++++++++++++++++++++++++++-------- 2 files changed, 64 insertions(+), 11 deletions(-) diff --git a/common/osdep.h b/common/osdep.h index 416a76ee..25bb1380 100644 --- a/common/osdep.h +++ b/common/osdep.h @@ -169,4 +169,20 @@ static ALWAYS_INLINE intptr_t endian_fix( intptr_t x ) } #endif +#ifdef __GNUC__ +#define x264_clz(x) __builtin_clz(x) +#else +static int ALWAYS_INLINE x264_clz( uint32_t x ) +{ + static uint8_t lut[16] = {4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0}; + int y, z = ((x - 0x10000) >> 27) & 16; + x >>= z^16; + z += y = ((x - 0x100) >> 28) & 8; + x >>= y^8; + z += y = ((x - 0x10) >> 29) & 4; + x >>= y^4; + return z + lut[x]; +} +#endif + #endif /* X264_OSDEP_H */ diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c index bb3797d8..c25495d8 100644 --- a/encoder/ratecontrol.c +++ b/encoder/ratecontrol.c @@ -191,21 +191,62 @@ static NOINLINE int ac_energy_mb( x264_t *h, int mb_x, int mb_y, x264_frame_t *f return var; } +static const float log2_lut[128] = { + 0.00000, 0.01123, 0.02237, 0.03342, 0.04439, 0.05528, 0.06609, 0.07682, + 0.08746, 0.09803, 0.10852, 0.11894, 0.12928, 0.13955, 0.14975, 0.15987, + 0.16993, 0.17991, 0.18982, 0.19967, 0.20945, 0.21917, 0.22882, 0.23840, + 0.24793, 0.25739, 0.26679, 0.27612, 0.28540, 0.29462, 0.30378, 0.31288, + 0.32193, 0.33092, 0.33985, 0.34873, 0.35755, 0.36632, 0.37504, 0.38370, + 0.39232, 0.40088, 0.40939, 0.41785, 0.42626, 0.43463, 0.44294, 
0.45121, + 0.45943, 0.46761, 0.47573, 0.48382, 0.49185, 0.49985, 0.50779, 0.51570, + 0.52356, 0.53138, 0.53916, 0.54689, 0.55459, 0.56224, 0.56986, 0.57743, + 0.58496, 0.59246, 0.59991, 0.60733, 0.61471, 0.62205, 0.62936, 0.63662, + 0.64386, 0.65105, 0.65821, 0.66534, 0.67243, 0.67948, 0.68650, 0.69349, + 0.70044, 0.70736, 0.71425, 0.72110, 0.72792, 0.73471, 0.74147, 0.74819, + 0.75489, 0.76155, 0.76818, 0.77479, 0.78136, 0.78790, 0.79442, 0.80090, + 0.80735, 0.81378, 0.82018, 0.82655, 0.83289, 0.83920, 0.84549, 0.85175, + 0.85798, 0.86419, 0.87036, 0.87652, 0.88264, 0.88874, 0.89482, 0.90087, + 0.90689, 0.91289, 0.91886, 0.92481, 0.93074, 0.93664, 0.94251, 0.94837, + 0.95420, 0.96000, 0.96578, 0.97154, 0.97728, 0.98299, 0.98868, 0.99435, +}; + +static const uint8_t exp2_lut[64] = { + 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 34, 37, 40, 44, 47, + 50, 53, 57, 60, 64, 67, 71, 74, 78, 81, 85, 89, 93, 96, 100, 104, + 108, 112, 116, 120, 124, 128, 132, 137, 141, 145, 150, 154, 159, 163, 168, 172, + 177, 182, 186, 191, 196, 201, 206, 211, 216, 221, 226, 232, 237, 242, 248, 253, +}; + +static int x264_exp2fix8( float x ) +{ + int i, f; + x += 8; + if( x <= 0 ) return 0; + if( x >= 16 ) return 0xffff; + i = x; + f = (x-i)*64; + return (exp2_lut[f]+256) << i >> 8; +} + void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame ) { + /* constants chosen to result in approximately the same overall bitrate as without AQ. + * FIXME: while they're written in 5 significant digits, they're only tuned to 2. */ + float strength = h->param.rc.f_aq_strength * 1.0397; int mb_x, mb_y; - for( mb_y=0; mb_y<h->sps->i_mb_height; mb_y++ ) - for( mb_x=0; mb_x<h->sps->i_mb_width; mb_x++ ) + for( mb_y = 0; mb_y < h->sps->i_mb_height; mb_y++ ) + for( mb_x = 0; mb_x < h->sps->i_mb_width; mb_x++ ) { - int energy = ac_energy_mb( h, mb_x, mb_y, frame ); - /* 10 constant chosen to result in approximately the same overall bitrate as without AQ. 
*/ - float qp_adj = h->param.rc.f_aq_strength * 1.5 * (logf(energy) - 10.0); + uint32_t energy = ac_energy_mb( h, mb_x, mb_y, frame ); + int lz = x264_clz( energy ); + float qp_adj = strength * (log2_lut[(energy<<lz>>24)&0x7f] - lz + 16.573f); frame->f_qp_offset[mb_x + mb_y*h->mb.i_mb_stride] = qp_adj; if( h->frames.b_have_lowres ) - frame->i_inv_qscale_factor[mb_x+mb_y*h->mb.i_mb_stride] = FIX8(pow(2.0,-qp_adj/6.0)); + frame->i_inv_qscale_factor[mb_x + mb_y*h->mb.i_mb_stride] = x264_exp2fix8(qp_adj*(-1.f/6.f)); } } + /***************************************************************************** * x264_adaptive_quant: * adjust macroblock QP based on variance (AC energy) of the MB. @@ -215,16 +256,12 @@ void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame ) *****************************************************************************/ void x264_adaptive_quant( x264_t *h ) { - float qp, qp_adj; x264_emms(); - qp = h->rc->f_qpm; - qp_adj = h->fenc->f_qp_offset[h->mb.i_mb_x + h->mb.i_mb_y*h->mb.i_mb_stride]; - h->mb.i_qp = x264_clip3( qp + qp_adj + .5, h->param.rc.i_qp_min, h->param.rc.i_qp_max ); + h->mb.i_qp = x264_clip3( h->rc->f_qpm + h->fenc->f_qp_offset[h->mb.i_mb_xy] + .5, h->param.rc.i_qp_min, h->param.rc.i_qp_max ); /* If the QP of this MB is within 1 of the previous MB, code the same QP as the previous MB, * to lower the bit cost of the qp_delta. */ if( abs(h->mb.i_qp - h->mb.i_last_qp) == 1 ) h->mb.i_qp = h->mb.i_last_qp; - h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp]; } int x264_ratecontrol_new( x264_t *h ) -- 2.40.0