From b1eac26510d0532ae9202249767e5f3ba22443ef Mon Sep 17 00:00:00 2001
From: Fiona Glaser <fiona@x264.com>
Date: Sun, 13 Sep 2009 01:02:37 -0700
Subject: [PATCH] Make MV costs global instead of static. Fixes some extremely
 rare threading race conditions and makes the code cleaner. Downside: slightly
 higher memory usage when calling multiple encoders from the same application.

---
 common/common.h       |  8 ++++
 common/osdep.h        |  2 +
 encoder/analyse.c     | 92 +++++++++++++++++++++++--------------------
 encoder/analyse.h     |  5 ++-
 encoder/encoder.c     | 20 +++++-----
 encoder/me.c          | 24 +++++------
 encoder/me.h          |  2 +-
 encoder/ratecontrol.c |  2 +
 encoder/slicetype.c   | 14 ++-----
 x264.c                |  5 ++-
 10 files changed, 94 insertions(+), 80 deletions(-)

diff --git a/common/common.h b/common/common.h
index 81c7b003..6271bc59 100644
--- a/common/common.h
+++ b/common/common.h
@@ -52,6 +52,8 @@ do {\
 #define X264_THREAD_MAX 128
 #define X264_PCM_COST (386*8)
 #define X264_LOOKAHEAD_MAX 250
+// arbitrary, but low because SATD scores are 1/4 normal
+#define X264_LOOKAHEAD_QP 12
 
 // number of pixels (per thread) in progress at any given time.
 // 16 for the macroblock in progress + 3 for deblocking + 3 for motion compensation filter + 2 for extra safety
@@ -347,6 +349,12 @@ struct x264_t
     uint16_t        (*quant4_bias[4])[16];   /* [4][52][16] */
     uint16_t        (*quant8_bias[2])[64];   /* [2][52][64] */
 
+    /* mv/ref cost arrays.  Indexed by lambda instead of
+     * qp because, due to rounding, some quantizers share
+     * lambdas.  This saves memory. */
+    uint16_t *cost_mv[92];
+    uint16_t *cost_mv_fpel[92][4];
+
     const uint8_t   *chroma_qp_table; /* includes both the nonlinear luma->chroma mapping and chroma_qp_offset */
 
     ALIGNED_16( uint32_t nr_residual_sum[2][64] );
diff --git a/common/osdep.h b/common/osdep.h
index cb61b14b..71780a24 100644
--- a/common/osdep.h
+++ b/common/osdep.h
@@ -140,6 +140,7 @@ static inline int x264_pthread_create( x264_pthread_t *t, void *a, void *(*f)(vo
 #define x264_pthread_attr_t          pthread_attr_t
 #define x264_pthread_attr_init       pthread_attr_init
 #define x264_pthread_attr_destroy    pthread_attr_destroy
+#define X264_PTHREAD_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER
 #else
 #define x264_pthread_mutex_t         int
 #define x264_pthread_mutex_init(m,f) 0
@@ -154,6 +155,7 @@ static inline int x264_pthread_create( x264_pthread_t *t, void *a, void *(*f)(vo
 #define x264_pthread_attr_t          int
 #define x264_pthread_attr_init(a)    0
 #define x264_pthread_attr_destroy(a)
+#define X264_PTHREAD_MUTEX_INITIALIZER 0
 #endif
 
 #define WORD_SIZE sizeof(void*)
diff --git a/encoder/analyse.c b/encoder/analyse.c
index 68d0e948..70f8d0f5 100644
--- a/encoder/analyse.c
+++ b/encoder/analyse.c
@@ -77,7 +77,7 @@ typedef struct
     int i_lambda;
     int i_lambda2;
     int i_qp;
-    int16_t *p_cost_mv;
+    uint16_t *p_cost_mv;
     uint16_t *p_cost_ref0;
     uint16_t *p_cost_ref1;
     int i_mbrd;
@@ -237,46 +237,36 @@ static const int i_sub_mb_p_cost_table[4] = {
 
 static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a );
 
-/* Indexed by lambda instead of qp because, due to rounding,
- * some quantizers share lambdas.  This saves memory. */
-uint16_t *x264_cost_mv_fpel[92][4];
-uint16_t x264_cost_ref[92][3][33];
+static uint16_t x264_cost_ref[92][3][33];
+static x264_pthread_mutex_t cost_ref_mutex = X264_PTHREAD_MUTEX_INITIALIZER;
 
-/* initialize an array of lambda*nbits for all possible mvs */
-static int x264_mb_analyse_load_costs( x264_t *h, x264_mb_analysis_t *a )
+int x264_analyse_init_costs( x264_t *h, int qp )
 {
-    static int16_t *p_cost_mv[92];
     int i, j;
-
-    if( !p_cost_mv[a->i_lambda] )
-    {
-        x264_emms();
-        /* could be faster, but isn't called many times */
-        /* factor of 4 from qpel, 2 from sign, and 2 because mv can be opposite from mvp */
-        CHECKED_MALLOC( p_cost_mv[a->i_lambda], (4*4*2048 + 1) * sizeof(int16_t) );
-        p_cost_mv[a->i_lambda] += 2*4*2048;
-        for( i = 0; i <= 2*4*2048; i++ )
-        {
-            p_cost_mv[a->i_lambda][-i] =
-            p_cost_mv[a->i_lambda][i]  = a->i_lambda * (log2f(i+1)*2 + 0.718f + !!i) + .5f;
-        }
-        for( i = 0; i < 3; i++ )
-            for( j = 0; j < 33; j++ )
-                x264_cost_ref[a->i_lambda][i][j] = i ? a->i_lambda * bs_size_te( i, j ) : 0;
-    }
-    a->p_cost_mv = p_cost_mv[a->i_lambda];
-    a->p_cost_ref0 = x264_cost_ref[a->i_lambda][x264_clip3(h->sh.i_num_ref_idx_l0_active-1,0,2)];
-    a->p_cost_ref1 = x264_cost_ref[a->i_lambda][x264_clip3(h->sh.i_num_ref_idx_l1_active-1,0,2)];
-
-    /* FIXME is this useful for all me methods? */
-    if( h->param.analyse.i_me_method >= X264_ME_ESA && !x264_cost_mv_fpel[a->i_lambda][0] )
+    int lambda = x264_lambda_tab[qp];
+    if( h->cost_mv[lambda] )
+        return 0;
+    /* factor of 4 from qpel, 2 from sign, and 2 because mv can be opposite from mvp */
+    CHECKED_MALLOC( h->cost_mv[lambda], (4*4*2048 + 1) * sizeof(uint16_t) );
+    h->cost_mv[lambda] += 2*4*2048;
+    for( i = 0; i <= 2*4*2048; i++ )
+    {
+        h->cost_mv[lambda][-i] =
+        h->cost_mv[lambda][i]  = lambda * (log2f(i+1)*2 + 0.718f + !!i) + .5f;
+    }
+    x264_pthread_mutex_lock( &cost_ref_mutex );
+    for( i = 0; i < 3; i++ )
+        for( j = 0; j < 33; j++ )
+            x264_cost_ref[lambda][i][j] = i ? lambda * bs_size_te( i, j ) : 0;
+    x264_pthread_mutex_unlock( &cost_ref_mutex );
+    if( h->param.analyse.i_me_method >= X264_ME_ESA && !h->cost_mv_fpel[lambda][0] )
     {
         for( j=0; j<4; j++ )
         {
-            CHECKED_MALLOC( x264_cost_mv_fpel[a->i_lambda][j], (4*2048 + 1) * sizeof(int16_t) );
-            x264_cost_mv_fpel[a->i_lambda][j] += 2*2048;
+            CHECKED_MALLOC( h->cost_mv_fpel[lambda][j], (4*2048 + 1) * sizeof(uint16_t) );
+            h->cost_mv_fpel[lambda][j] += 2*2048;
             for( i = -2*2048; i < 2*2048; i++ )
-                x264_cost_mv_fpel[a->i_lambda][j][i] = p_cost_mv[a->i_lambda][i*4+j];
+                h->cost_mv_fpel[lambda][j][i] = h->cost_mv[lambda][i*4+j];
         }
     }
     return 0;
@@ -284,6 +274,27 @@ fail:
     return -1;
 }
 
+void x264_analyse_free_costs( x264_t *h )
+{
+    int i, j;
+    for( i = 0; i < 92; i++ )
+    {
+        if( h->cost_mv[i] )
+            x264_free( h->cost_mv[i] - 2*4*2048 );
+        if( h->cost_mv_fpel[i][0] )
+            for( j = 0; j < 4; j++ )
+                x264_free( h->cost_mv_fpel[i][j] - 2*2048 );
+    }
+}
+
+/* initialize an array of lambda*nbits for all possible mvs */
+static void x264_mb_analyse_load_costs( x264_t *h, x264_mb_analysis_t *a )
+{
+    a->p_cost_mv = h->cost_mv[a->i_lambda];
+    a->p_cost_ref0 = x264_cost_ref[a->i_lambda][x264_clip3(h->sh.i_num_ref_idx_l0_active-1,0,2)];
+    a->p_cost_ref1 = x264_cost_ref[a->i_lambda][x264_clip3(h->sh.i_num_ref_idx_l1_active-1,0,2)];
+}
+
 static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
 {
     int i = h->param.analyse.i_subpel_refine - (h->sh.i_type == SLICE_TYPE_B);
@@ -2317,7 +2328,7 @@ static inline void x264_mb_analyse_qp_rd( x264_t *h, x264_mb_analysis_t *a )
 /*****************************************************************************
  * x264_macroblock_analyse:
  *****************************************************************************/
-int x264_macroblock_analyse( x264_t *h )
+void x264_macroblock_analyse( x264_t *h )
 {
     x264_mb_analysis_t analysis;
     int i_cost = COST_MAX;
@@ -2392,13 +2403,12 @@ int x264_macroblock_analyse( x264_t *h )
             int i_thresh16x8;
             int i_satd_inter, i_satd_intra;
 
-            if( x264_mb_analyse_load_costs( h, &analysis ) )
-                return -1;
+            x264_mb_analyse_load_costs( h, &analysis );
 
             x264_mb_analyse_inter_p16x16( h, &analysis );
 
             if( h->mb.i_type == P_SKIP )
-                return 0;
+                return;
 
             if( flags & X264_ANALYSE_PSUB16x16 )
             {
@@ -2686,8 +2696,7 @@ int x264_macroblock_analyse( x264_t *h )
             int i_satd_inter;
             h->mb.b_skip_mc = 0;
 
-            if( x264_mb_analyse_load_costs( h, &analysis ) )
-                return -1;
+            x264_mb_analyse_load_costs( h, &analysis );
 
             /* select best inter mode */
             /* direct must be first */
@@ -2713,7 +2722,7 @@ int x264_macroblock_analyse( x264_t *h )
                 {
                     h->mb.i_type = B_SKIP;
                     x264_analyse_update_cache( h, &analysis );
-                    return 0;
+                    return;
                 }
             }
 
@@ -2945,7 +2954,6 @@ int x264_macroblock_analyse( x264_t *h )
         x264_psy_trellis_init( h, 0 );
     if( h->mb.b_trellis == 1 || h->mb.b_noise_reduction )
         h->mb.i_skip_intra = 0;
-    return 0;
 }
 
 /*-------------------- Update MB from the analysis ----------------------*/
diff --git a/encoder/analyse.h b/encoder/analyse.h
index 05aae40d..5342d04b 100644
--- a/encoder/analyse.h
+++ b/encoder/analyse.h
@@ -24,9 +24,10 @@
 #ifndef X264_ANALYSE_H
 #define X264_ANALYSE_H
 
-int  x264_macroblock_analyse( x264_t *h );
+int x264_analyse_init_costs( x264_t *h, int qp );
+void x264_analyse_free_costs( x264_t *h );
+void x264_macroblock_analyse( x264_t *h );
 void x264_slicetype_decide( x264_t *h );
-int  x264_lowres_context_alloc( x264_t *h );
 
 void x264_slicetype_analyse( x264_t *h, int keyframe );
 
diff --git a/encoder/encoder.c b/encoder/encoder.c
index c6b33980..8e614a2b 100644
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -748,7 +748,7 @@ x264_t *x264_encoder_open( x264_param_t *param )
 {
     x264_t *h;
     char buf[1000], *p;
-    int i, i_slicetype_length;
+    int i, qp, i_slicetype_length;
 
     CHECKED_MALLOCZERO( h, sizeof(x264_t) );
 
@@ -869,6 +869,12 @@ x264_t *x264_encoder_open( x264_param_t *param )
         p += sprintf( p, " none!" );
     x264_log( h, X264_LOG_INFO, "%s\n", buf );
 
+    for( qp = h->param.rc.i_qp_min; qp <= h->param.rc.i_qp_max; qp++ )
+        if( x264_analyse_init_costs( h, qp ) )
+            goto fail;
+    if( x264_analyse_init_costs( h, X264_LOOKAHEAD_QP ) )
+        goto fail;
+
     h->out.i_nal = 0;
     h->out.i_bitstream = X264_MAX( 1000000, h->param.i_width * h->param.i_height * 4
         * ( h->param.rc.i_rc_method == X264_RC_ABR ? pow( 0.95, h->param.rc.i_qp_min )
@@ -900,9 +906,6 @@ x264_t *x264_encoder_open( x264_param_t *param )
     if( x264_ratecontrol_new( h ) < 0 )
         goto fail;
 
-    if( x264_lowres_context_alloc( h ) )
-        goto fail;
-
     if( h->param.psz_dump_yuv )
     {
         /* create or truncate the reconstructed video file */
@@ -1332,12 +1335,7 @@ static int x264_slice_write( x264_t *h )
         /* load cache */
         x264_macroblock_cache_load( h, i_mb_x, i_mb_y );
 
-        /* analyse parameters
-         * Slice I: choose I_4x4 or I_16x16 mode
-         * Slice P: choose between using P mode or intra (4x4 or 16x16)
-         * */
-        if( x264_macroblock_analyse( h ) )
-            return -1;
+        x264_macroblock_analyse( h );
 
         /* encode this macroblock -> be careful it can change the mb type to P_SKIP if needed */
         x264_macroblock_encode( h );
@@ -2230,6 +2228,8 @@ void    x264_encoder_close  ( x264_t *h )
 
     x264_cqm_delete( h );
 
+    x264_analyse_free_costs( h );
+
     if( h->param.i_threads > 1)
         h = h->thread[ h->i_thread_phase % h->param.i_threads ];
 
diff --git a/encoder/me.c b/encoder/me.c
index eb0fd5a0..d7c71653 100644
--- a/encoder/me.c
+++ b/encoder/me.c
@@ -195,8 +195,8 @@ void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc,
 
 #define CHECK_MVRANGE(mx,my) ( mx >= mv_x_min && mx <= mv_x_max && my >= mv_y_min && my <= mv_y_max )
 
-    const int16_t *p_cost_mvx = m->p_cost_mv - m->mvp[0];
-    const int16_t *p_cost_mvy = m->p_cost_mv - m->mvp[1];
+    const uint16_t *p_cost_mvx = m->p_cost_mv - m->mvp[0];
+    const uint16_t *p_cost_mvy = m->p_cost_mv - m->mvp[1];
 
     bmx = x264_clip3( m->mvp[0], mv_x_min*4, mv_x_max*4 );
     bmy = x264_clip3( m->mvp[1], mv_y_min*4, mv_y_max*4 );
@@ -452,8 +452,8 @@ me_hex2:
 
             /* hexagon grid */
             omx = bmx; omy = bmy;
-            const int16_t *p_cost_omvx = p_cost_mvx + omx*4;
-            const int16_t *p_cost_omvy = p_cost_mvy + omy*4;
+            const uint16_t *p_cost_omvx = p_cost_mvx + omx*4;
+            const uint16_t *p_cost_omvy = p_cost_mvy + omy*4;
             i = 1;
             do
             {
@@ -569,7 +569,7 @@ me_hex2:
             int delta = x264_pixel_size[sad_size].w;
             int16_t *xs = h->scratch_buffer;
             int xn;
-            uint16_t *cost_fpel_mvx = x264_cost_mv_fpel[x264_lambda_tab[h->mb.i_qp]][-m->mvp[0]&3] + (-m->mvp[0]>>2);
+            uint16_t *cost_fpel_mvx = h->cost_mv_fpel[x264_lambda_tab[h->mb.i_qp]][-m->mvp[0]&3] + (-m->mvp[0]>>2);
 
             h->pixf.sad_x4[sad_size]( zero, p_fenc, p_fenc+delta,
                 p_fenc+delta*FENC_STRIDE, p_fenc+delta+delta*FENC_STRIDE,
@@ -768,8 +768,8 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite
 {
     const int bw = x264_pixel_size[m->i_pixel].w;
     const int bh = x264_pixel_size[m->i_pixel].h;
-    const int16_t *p_cost_mvx = m->p_cost_mv - m->mvp[0];
-    const int16_t *p_cost_mvy = m->p_cost_mv - m->mvp[1];
+    const uint16_t *p_cost_mvx = m->p_cost_mv - m->mvp[0];
+    const uint16_t *p_cost_mvy = m->p_cost_mv - m->mvp[1];
     const int i_pixel = m->i_pixel;
     const int b_chroma_me = h->mb.b_chroma_me && i_pixel <= PIXEL_8x8;
 
@@ -942,10 +942,10 @@ static void ALWAYS_INLINE x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_m
     const int i_pixel = m0->i_pixel;
     const int bw = x264_pixel_size[i_pixel].w;
     const int bh = x264_pixel_size[i_pixel].h;
-    const int16_t *p_cost_m0x = m0->p_cost_mv - m0->mvp[0];
-    const int16_t *p_cost_m0y = m0->p_cost_mv - m0->mvp[1];
-    const int16_t *p_cost_m1x = m1->p_cost_mv - m1->mvp[0];
-    const int16_t *p_cost_m1y = m1->p_cost_mv - m1->mvp[1];
+    const uint16_t *p_cost_m0x = m0->p_cost_mv - m0->mvp[0];
+    const uint16_t *p_cost_m0y = m0->p_cost_mv - m0->mvp[1];
+    const uint16_t *p_cost_m1x = m1->p_cost_mv - m1->mvp[0];
+    const uint16_t *p_cost_m1y = m1->p_cost_mv - m1->mvp[1];
     ALIGNED_ARRAY_16( uint8_t, pixy_buf,[2],[9][16*16] );
     ALIGNED_8( uint8_t pixu_buf[2][9][8*8] );
     ALIGNED_8( uint8_t pixv_buf[2][9][8*8] );
@@ -1073,7 +1073,7 @@ void x264_me_refine_qpel_rd( x264_t *h, x264_me_t *m, int i_lambda2, int i4, int
     static const int pixel_mv_offs[] = { 0, 4, 4*8, 0, 2, 2*8, 0 };
     int16_t *cache_mv = h->mb.cache.mv[i_list][x264_scan8[i4]];
     int16_t *cache_mv2 = cache_mv + pixel_mv_offs[m->i_pixel];
-    const int16_t *p_cost_mvx, *p_cost_mvy;
+    const uint16_t *p_cost_mvx, *p_cost_mvy;
     const int bw = x264_pixel_size[m->i_pixel].w>>2;
     const int bh = x264_pixel_size[m->i_pixel].h>>2;
     const int i_pixel = m->i_pixel;
diff --git a/encoder/me.h b/encoder/me.h
index 8bdee2e5..0122b8b7 100644
--- a/encoder/me.h
+++ b/encoder/me.h
@@ -31,7 +31,7 @@ typedef struct
 {
     /* input */
     int      i_pixel;   /* PIXEL_WxH */
-    int16_t *p_cost_mv; /* lambda * nbits for each possible mv */
+    uint16_t *p_cost_mv; /* lambda * nbits for each possible mv */
     int      i_ref_cost;
     int      i_ref;
 
diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c
index cb7fd3b8..b11c7c43 100644
--- a/encoder/ratecontrol.c
+++ b/encoder/ratecontrol.c
@@ -953,6 +953,8 @@ void x264_ratecontrol_start( x264_t *h, int i_force_qp )
         }
     }
 
+    q = x264_clip3f( q, h->param.rc.i_qp_min, h->param.rc.i_qp_max );
+
     rc->qpa_rc =
     rc->qpa_aq = 0;
     h->fdec->f_qp_avg_rc =
diff --git a/encoder/slicetype.c b/encoder/slicetype.c
index 56b56e1b..985dfd65 100644
--- a/encoder/slicetype.c
+++ b/encoder/slicetype.c
@@ -29,22 +29,14 @@
 #include "me.h"
 
 
-static int x264_lowres_context_init( x264_t *h, x264_mb_analysis_t *a )
+static void x264_lowres_context_init( x264_t *h, x264_mb_analysis_t *a )
 {
-    a->i_qp = 12; // arbitrary, but low because SATD scores are 1/4 normal
+    a->i_qp = X264_LOOKAHEAD_QP;
     a->i_lambda = x264_lambda_tab[ a->i_qp ];
-    if( x264_mb_analyse_load_costs( h, a ) )
-        return -1;
+    x264_mb_analyse_load_costs( h, a );
     h->mb.i_me_method = X264_MIN( X264_ME_HEX, h->param.analyse.i_me_method ); // maybe dia?
     h->mb.i_subpel_refine = 4; // 3 should be enough, but not tweaking for speed now
     h->mb.b_chroma_me = 0;
-    return 0;
-}
-
-int x264_lowres_context_alloc( x264_t *h )
-{
-    x264_mb_analysis_t a;
-    return x264_lowres_context_init( h, &a );
 }
 
 static int x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
diff --git a/x264.c b/x264.c
index 0fdde7d8..3d43eb55 100644
--- a/x264.c
+++ b/x264.c
@@ -262,9 +262,10 @@ static void Help( x264_param_t *defaults, int longhelp )
         "                                  where <option> is either\n"
         "                                      q=<integer> (force QP)\n"
         "                                  or  b=<float> (bitrate multiplier)\n" );
-    H1( "      --qpfile <string>       Force frametypes and QPs for some or all frames\n"
+    H2( "      --qpfile <string>       Force frametypes and QPs for some or all frames\n"
         "                              Format of each line: framenumber frametype QP\n"
-        "                              QP of -1 lets x264 choose. Frametypes: I,i,P,B,b.\n" );
+        "                              QP of -1 lets x264 choose. Frametypes: I,i,P,B,b.\n"
+        "                              QPs are restricted by qpmin/qpmax.\n" );
     H1( "\n" );
     H1( "Analysis:\n" );
     H1( "\n" );
-- 
2.40.0