Make MV costs global instead of static

author Fiona Glaser <fiona@x264.com>
Sun, 13 Sep 2009 08:02:37 +0000 (01:02 -0700)
committer Fiona Glaser <fiona@x264.com>
Mon, 14 Sep 2009 19:27:38 +0000 (12:27 -0700)
diff --git a/common/common.h b/common/common.h

index 81c7b003334e50bfee0e5a0fe93d504116936aa4..6271bc5978bdcf674e09d2d2d500ce1bc6a63591 100644 (file)
--- a/common/common.h
+++ b/common/common.h
@@ -52,6 +52,8 @@ do {\
  #define X264_THREAD_MAX 128
  #define X264_PCM_COST (386*8)
  #define X264_LOOKAHEAD_MAX 250
+// arbitrary, but low because SATD scores are 1/4 normal
+#define X264_LOOKAHEAD_QP 12
  
  // number of pixels (per thread) in progress at any given time.
  // 16 for the macroblock in progress + 3 for deblocking + 3 for motion compensation filter + 2 for extra safety
@@ -347,6 +349,12 @@ struct x264_t
      uint16_t        (*quant4_bias[4])[16];   /* [4][52][16] */
      uint16_t        (*quant8_bias[2])[64];   /* [2][52][64] */
  
+    /* mv/ref cost arrays.  Indexed by lambda instead of
+     * qp because, due to rounding, some quantizers share
+     * lambdas.  This saves memory. */
+    uint16_t *cost_mv[92];
+    uint16_t *cost_mv_fpel[92][4];
+
      const uint8_t   *chroma_qp_table; /* includes both the nonlinear luma->chroma mapping and chroma_qp_offset */
  
      ALIGNED_16( uint32_t nr_residual_sum[2][64] );
diff --git a/common/osdep.h b/common/osdep.h

index cb61b14b7f353f8f9ac7a5247e72bd1845e63207..71780a24f1abc7ceb00dcc6eba340da48a705bc7 100644 (file)
--- a/common/osdep.h
+++ b/common/osdep.h
@@ -140,6 +140,7 @@ static inline int x264_pthread_create( x264_pthread_t *t, void *a, void *(*f)(vo
  #define x264_pthread_attr_t          pthread_attr_t
  #define x264_pthread_attr_init       pthread_attr_init
  #define x264_pthread_attr_destroy    pthread_attr_destroy
+#define X264_PTHREAD_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER
  #else
  #define x264_pthread_mutex_t         int
  #define x264_pthread_mutex_init(m,f) 0
@@ -154,6 +155,7 @@ static inline int x264_pthread_create( x264_pthread_t *t, void *a, void *(*f)(vo
  #define x264_pthread_attr_t          int
  #define x264_pthread_attr_init(a)    0
  #define x264_pthread_attr_destroy(a)
+#define X264_PTHREAD_MUTEX_INITIALIZER 0
  #endif
  
  #define WORD_SIZE sizeof(void*)
diff --git a/encoder/analyse.c b/encoder/analyse.c

index 68d0e9485b1ee207d27d3d6e2114d7ee537258fa..70f8d0f5cc256f0b848cd08de8996b2034c1fa5c 100644 (file)
--- a/encoder/analyse.c
+++ b/encoder/analyse.c
@@ -77,7 +77,7 @@ typedef struct
      int i_lambda;
      int i_lambda2;
      int i_qp;
-    int16_t *p_cost_mv;
+    uint16_t *p_cost_mv;
      uint16_t *p_cost_ref0;
      uint16_t *p_cost_ref1;
      int i_mbrd;
@@ -237,46 +237,36 @@ static const int i_sub_mb_p_cost_table[4] = {
  
  static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a );
  
-/* Indexed by lambda instead of qp because, due to rounding,
- * some quantizers share lambdas.  This saves memory. */
-uint16_t *x264_cost_mv_fpel[92][4];
-uint16_t x264_cost_ref[92][3][33];
+static uint16_t x264_cost_ref[92][3][33];
+static x264_pthread_mutex_t cost_ref_mutex = X264_PTHREAD_MUTEX_INITIALIZER;
  
-/* initialize an array of lambda*nbits for all possible mvs */
-static int x264_mb_analyse_load_costs( x264_t *h, x264_mb_analysis_t *a )
+int x264_analyse_init_costs( x264_t *h, int qp )
  {
-    static int16_t *p_cost_mv[92];
      int i, j;
-
-    if( !p_cost_mv[a->i_lambda] )
-    {
-        x264_emms();
-        /* could be faster, but isn't called many times */
-        /* factor of 4 from qpel, 2 from sign, and 2 because mv can be opposite from mvp */
-        CHECKED_MALLOC( p_cost_mv[a->i_lambda], (4*4*2048 + 1) * sizeof(int16_t) );
-        p_cost_mv[a->i_lambda] += 2*4*2048;
-        for( i = 0; i <= 2*4*2048; i++ )
-        {
-            p_cost_mv[a->i_lambda][-i] =
-            p_cost_mv[a->i_lambda][i]  = a->i_lambda * (log2f(i+1)*2 + 0.718f + !!i) + .5f;
-        }
-        for( i = 0; i < 3; i++ )
-            for( j = 0; j < 33; j++ )
-                x264_cost_ref[a->i_lambda][i][j] = i ? a->i_lambda * bs_size_te( i, j ) : 0;
-    }
-    a->p_cost_mv = p_cost_mv[a->i_lambda];
-    a->p_cost_ref0 = x264_cost_ref[a->i_lambda][x264_clip3(h->sh.i_num_ref_idx_l0_active-1,0,2)];
-    a->p_cost_ref1 = x264_cost_ref[a->i_lambda][x264_clip3(h->sh.i_num_ref_idx_l1_active-1,0,2)];
-
-    /* FIXME is this useful for all me methods? */
-    if( h->param.analyse.i_me_method >= X264_ME_ESA && !x264_cost_mv_fpel[a->i_lambda][0] )
+    int lambda = x264_lambda_tab[qp];
+    if( h->cost_mv[lambda] )
+        return 0;
+    /* factor of 4 from qpel, 2 from sign, and 2 because mv can be opposite from mvp */
+    CHECKED_MALLOC( h->cost_mv[lambda], (4*4*2048 + 1) * sizeof(uint16_t) );
+    h->cost_mv[lambda] += 2*4*2048;
+    for( i = 0; i <= 2*4*2048; i++ )
+    {
+        h->cost_mv[lambda][-i] =
+        h->cost_mv[lambda][i]  = lambda * (log2f(i+1)*2 + 0.718f + !!i) + .5f;
+    }
+    x264_pthread_mutex_lock( &cost_ref_mutex );
+    for( i = 0; i < 3; i++ )
+        for( j = 0; j < 33; j++ )
+            x264_cost_ref[lambda][i][j] = i ? lambda * bs_size_te( i, j ) : 0;
+    x264_pthread_mutex_unlock( &cost_ref_mutex );
+    if( h->param.analyse.i_me_method >= X264_ME_ESA && !h->cost_mv_fpel[lambda][0] )
      {
          for( j=0; j<4; j++ )
          {
-            CHECKED_MALLOC( x264_cost_mv_fpel[a->i_lambda][j], (4*2048 + 1) * sizeof(int16_t) );
-            x264_cost_mv_fpel[a->i_lambda][j] += 2*2048;
+            CHECKED_MALLOC( h->cost_mv_fpel[lambda][j], (4*2048 + 1) * sizeof(uint16_t) );
+            h->cost_mv_fpel[lambda][j] += 2*2048;
              for( i = -2*2048; i < 2*2048; i++ )
-                x264_cost_mv_fpel[a->i_lambda][j][i] = p_cost_mv[a->i_lambda][i*4+j];
+                h->cost_mv_fpel[lambda][j][i] = h->cost_mv[lambda][i*4+j];
          }
      }
      return 0;
@@ -284,6 +274,27 @@ fail:
      return -1;
  }
  
+void x264_analyse_free_costs( x264_t *h )
+{
+    int i, j;
+    for( i = 0; i < 92; i++ )
+    {
+        if( h->cost_mv[i] )
+            x264_free( h->cost_mv[i] - 2*4*2048 );
+        if( h->cost_mv_fpel[i][0] )
+            for( j = 0; j < 4; j++ )
+                x264_free( h->cost_mv_fpel[i][j] - 2*2048 );
+    }
+}
+
+/* initialize an array of lambda*nbits for all possible mvs */
+static void x264_mb_analyse_load_costs( x264_t *h, x264_mb_analysis_t *a )
+{
+    a->p_cost_mv = h->cost_mv[a->i_lambda];
+    a->p_cost_ref0 = x264_cost_ref[a->i_lambda][x264_clip3(h->sh.i_num_ref_idx_l0_active-1,0,2)];
+    a->p_cost_ref1 = x264_cost_ref[a->i_lambda][x264_clip3(h->sh.i_num_ref_idx_l1_active-1,0,2)];
+}
+
  static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
  {
      int i = h->param.analyse.i_subpel_refine - (h->sh.i_type == SLICE_TYPE_B);
@@ -2317,7 +2328,7 @@ static inline void x264_mb_analyse_qp_rd( x264_t *h, x264_mb_analysis_t *a )
  /*****************************************************************************
   * x264_macroblock_analyse:
   *****************************************************************************/
-int x264_macroblock_analyse( x264_t *h )
+void x264_macroblock_analyse( x264_t *h )
  {
      x264_mb_analysis_t analysis;
      int i_cost = COST_MAX;
@@ -2392,13 +2403,12 @@ int x264_macroblock_analyse( x264_t *h )
              int i_thresh16x8;
              int i_satd_inter, i_satd_intra;
  
-            if( x264_mb_analyse_load_costs( h, &analysis ) )
-                return -1;
+            x264_mb_analyse_load_costs( h, &analysis );
  
              x264_mb_analyse_inter_p16x16( h, &analysis );
  
              if( h->mb.i_type == P_SKIP )
-                return 0;
+                return;
  
              if( flags & X264_ANALYSE_PSUB16x16 )
              {
@@ -2686,8 +2696,7 @@ int x264_macroblock_analyse( x264_t *h )
              int i_satd_inter;
              h->mb.b_skip_mc = 0;
  
-            if( x264_mb_analyse_load_costs( h, &analysis ) )
-                return -1;
+            x264_mb_analyse_load_costs( h, &analysis );
  
              /* select best inter mode */
              /* direct must be first */
@@ -2713,7 +2722,7 @@ int x264_macroblock_analyse( x264_t *h )
                  {
                      h->mb.i_type = B_SKIP;
                      x264_analyse_update_cache( h, &analysis );
-                    return 0;
+                    return;
                  }
              }
  
@@ -2945,7 +2954,6 @@ int x264_macroblock_analyse( x264_t *h )
          x264_psy_trellis_init( h, 0 );
      if( h->mb.b_trellis == 1 || h->mb.b_noise_reduction )
          h->mb.i_skip_intra = 0;
-    return 0;
  }
  
  /*-------------------- Update MB from the analysis ----------------------*/
diff --git a/encoder/analyse.h b/encoder/analyse.h

index 05aae40d00546e03a5a6ba15335a507e8173c2ac..5342d04bd3015c72abb811b30fdcdeb35867c78d 100644 (file)
--- a/encoder/analyse.h
+++ b/encoder/analyse.h
@@ -24,9 +24,10 @@
  #ifndef X264_ANALYSE_H
  #define X264_ANALYSE_H
  
-int  x264_macroblock_analyse( x264_t *h );
+int x264_analyse_init_costs( x264_t *h, int qp );
+void x264_analyse_free_costs( x264_t *h );
+void x264_macroblock_analyse( x264_t *h );
  void x264_slicetype_decide( x264_t *h );
-int  x264_lowres_context_alloc( x264_t *h );
  
  void x264_slicetype_analyse( x264_t *h, int keyframe );
  
diff --git a/encoder/encoder.c b/encoder/encoder.c

index c6b3398049cfd284a7b0e1368347f3f897bd97bd..8e614a2bbb584e184826b05028491ad0e5d9c374 100644 (file)
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -748,7 +748,7 @@ x264_t *x264_encoder_open( x264_param_t *param )
  {
      x264_t *h;
      char buf[1000], *p;
-    int i, i_slicetype_length;
+    int i, qp, i_slicetype_length;
  
      CHECKED_MALLOCZERO( h, sizeof(x264_t) );
  
@@ -869,6 +869,12 @@ x264_t *x264_encoder_open( x264_param_t *param )
          p += sprintf( p, " none!" );
      x264_log( h, X264_LOG_INFO, "%s\n", buf );
  
+    for( qp = h->param.rc.i_qp_min; qp <= h->param.rc.i_qp_max; qp++ )
+        if( x264_analyse_init_costs( h, qp ) )
+            goto fail;
+    if( x264_analyse_init_costs( h, X264_LOOKAHEAD_QP ) )
+        goto fail;
+
      h->out.i_nal = 0;
      h->out.i_bitstream = X264_MAX( 1000000, h->param.i_width * h->param.i_height * 4
          * ( h->param.rc.i_rc_method == X264_RC_ABR ? pow( 0.95, h->param.rc.i_qp_min )
@@ -900,9 +906,6 @@ x264_t *x264_encoder_open( x264_param_t *param )
      if( x264_ratecontrol_new( h ) < 0 )
          goto fail;
  
-    if( x264_lowres_context_alloc( h ) )
-        goto fail;
-
      if( h->param.psz_dump_yuv )
      {
          /* create or truncate the reconstructed video file */
@@ -1332,12 +1335,7 @@ static int x264_slice_write( x264_t *h )
          /* load cache */
          x264_macroblock_cache_load( h, i_mb_x, i_mb_y );
  
-        /* analyse parameters
-         * Slice I: choose I_4x4 or I_16x16 mode
-         * Slice P: choose between using P mode or intra (4x4 or 16x16)
-         * */
-        if( x264_macroblock_analyse( h ) )
-            return -1;
+        x264_macroblock_analyse( h );
  
          /* encode this macroblock -> be careful it can change the mb type to P_SKIP if needed */
          x264_macroblock_encode( h );
@@ -2230,6 +2228,8 @@ void    x264_encoder_close  ( x264_t *h )
  
      x264_cqm_delete( h );
  
+    x264_analyse_free_costs( h );
+
      if( h->param.i_threads > 1)
          h = h->thread[ h->i_thread_phase % h->param.i_threads ];
  
diff --git a/encoder/me.c b/encoder/me.c

index eb0fd5a02a032597e50e02488c8d2ff98350968a..d7c716539ac557de9aeb9355021addb70b75a1d3 100644 (file)
--- a/encoder/me.c
+++ b/encoder/me.c
@@ -195,8 +195,8 @@ void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc,
  
  #define CHECK_MVRANGE(mx,my) ( mx >= mv_x_min && mx <= mv_x_max && my >= mv_y_min && my <= mv_y_max )
  
-    const int16_t *p_cost_mvx = m->p_cost_mv - m->mvp[0];
-    const int16_t *p_cost_mvy = m->p_cost_mv - m->mvp[1];
+    const uint16_t *p_cost_mvx = m->p_cost_mv - m->mvp[0];
+    const uint16_t *p_cost_mvy = m->p_cost_mv - m->mvp[1];
  
      bmx = x264_clip3( m->mvp[0], mv_x_min*4, mv_x_max*4 );
      bmy = x264_clip3( m->mvp[1], mv_y_min*4, mv_y_max*4 );
@@ -452,8 +452,8 @@ me_hex2:
  
              /* hexagon grid */
              omx = bmx; omy = bmy;
-            const int16_t *p_cost_omvx = p_cost_mvx + omx*4;
-            const int16_t *p_cost_omvy = p_cost_mvy + omy*4;
+            const uint16_t *p_cost_omvx = p_cost_mvx + omx*4;
+            const uint16_t *p_cost_omvy = p_cost_mvy + omy*4;
              i = 1;
              do
              {
@@ -569,7 +569,7 @@ me_hex2:
              int delta = x264_pixel_size[sad_size].w;
              int16_t *xs = h->scratch_buffer;
              int xn;
-            uint16_t *cost_fpel_mvx = x264_cost_mv_fpel[x264_lambda_tab[h->mb.i_qp]][-m->mvp[0]&3] + (-m->mvp[0]>>2);
+            uint16_t *cost_fpel_mvx = h->cost_mv_fpel[x264_lambda_tab[h->mb.i_qp]][-m->mvp[0]&3] + (-m->mvp[0]>>2);
  
              h->pixf.sad_x4[sad_size]( zero, p_fenc, p_fenc+delta,
                  p_fenc+delta*FENC_STRIDE, p_fenc+delta+delta*FENC_STRIDE,
@@ -768,8 +768,8 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite
  {
      const int bw = x264_pixel_size[m->i_pixel].w;
      const int bh = x264_pixel_size[m->i_pixel].h;
-    const int16_t *p_cost_mvx = m->p_cost_mv - m->mvp[0];
-    const int16_t *p_cost_mvy = m->p_cost_mv - m->mvp[1];
+    const uint16_t *p_cost_mvx = m->p_cost_mv - m->mvp[0];
+    const uint16_t *p_cost_mvy = m->p_cost_mv - m->mvp[1];
      const int i_pixel = m->i_pixel;
      const int b_chroma_me = h->mb.b_chroma_me && i_pixel <= PIXEL_8x8;
  
@@ -942,10 +942,10 @@ static void ALWAYS_INLINE x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_m
      const int i_pixel = m0->i_pixel;
      const int bw = x264_pixel_size[i_pixel].w;
      const int bh = x264_pixel_size[i_pixel].h;
-    const int16_t *p_cost_m0x = m0->p_cost_mv - m0->mvp[0];
-    const int16_t *p_cost_m0y = m0->p_cost_mv - m0->mvp[1];
-    const int16_t *p_cost_m1x = m1->p_cost_mv - m1->mvp[0];
-    const int16_t *p_cost_m1y = m1->p_cost_mv - m1->mvp[1];
+    const uint16_t *p_cost_m0x = m0->p_cost_mv - m0->mvp[0];
+    const uint16_t *p_cost_m0y = m0->p_cost_mv - m0->mvp[1];
+    const uint16_t *p_cost_m1x = m1->p_cost_mv - m1->mvp[0];
+    const uint16_t *p_cost_m1y = m1->p_cost_mv - m1->mvp[1];
      ALIGNED_ARRAY_16( uint8_t, pixy_buf,[2],[9][16*16] );
      ALIGNED_8( uint8_t pixu_buf[2][9][8*8] );
      ALIGNED_8( uint8_t pixv_buf[2][9][8*8] );
@@ -1073,7 +1073,7 @@ void x264_me_refine_qpel_rd( x264_t *h, x264_me_t *m, int i_lambda2, int i4, int
      static const int pixel_mv_offs[] = { 0, 4, 4*8, 0, 2, 2*8, 0 };
      int16_t *cache_mv = h->mb.cache.mv[i_list][x264_scan8[i4]];
      int16_t *cache_mv2 = cache_mv + pixel_mv_offs[m->i_pixel];
-    const int16_t *p_cost_mvx, *p_cost_mvy;
+    const uint16_t *p_cost_mvx, *p_cost_mvy;
      const int bw = x264_pixel_size[m->i_pixel].w>>2;
      const int bh = x264_pixel_size[m->i_pixel].h>>2;
      const int i_pixel = m->i_pixel;
diff --git a/encoder/me.h b/encoder/me.h

index 8bdee2e55f28a4d8fb9f522c323c654447e9a3a8..0122b8b70b8b50cc9f31b58a571d9ad92ea75345 100644 (file)
--- a/encoder/me.h
+++ b/encoder/me.h
@@ -31,7 +31,7 @@ typedef struct
  {
      /* input */
      int      i_pixel;   /* PIXEL_WxH */
-    int16_t *p_cost_mv; /* lambda * nbits for each possible mv */
+    uint16_t *p_cost_mv; /* lambda * nbits for each possible mv */
      int      i_ref_cost;
      int      i_ref;
  
diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c

index cb7fd3b86dc3c0d1d507eaf15255e7db2553265f..b11c7c4357f3ae98ce2f871d26a98c73d4058f84 100644 (file)
--- a/encoder/ratecontrol.c
+++ b/encoder/ratecontrol.c
@@ -953,6 +953,8 @@ void x264_ratecontrol_start( x264_t *h, int i_force_qp )
          }
      }
  
+    q = x264_clip3f( q, h->param.rc.i_qp_min, h->param.rc.i_qp_max );
+
      rc->qpa_rc =
      rc->qpa_aq = 0;
      h->fdec->f_qp_avg_rc =
diff --git a/encoder/slicetype.c b/encoder/slicetype.c

index 56b56e1bc78005c9e64b6c614a3f736ddb71ea15..985dfd655235e8fa0abd6adc937e8a9eda057cd8 100644 (file)
--- a/encoder/slicetype.c
+++ b/encoder/slicetype.c
@@ -29,22 +29,14 @@
  #include "me.h"
  
  
-static int x264_lowres_context_init( x264_t *h, x264_mb_analysis_t *a )
+static void x264_lowres_context_init( x264_t *h, x264_mb_analysis_t *a )
  {
-    a->i_qp = 12; // arbitrary, but low because SATD scores are 1/4 normal
+    a->i_qp = X264_LOOKAHEAD_QP;
      a->i_lambda = x264_lambda_tab[ a->i_qp ];
-    if( x264_mb_analyse_load_costs( h, a ) )
-        return -1;
+    x264_mb_analyse_load_costs( h, a );
      h->mb.i_me_method = X264_MIN( X264_ME_HEX, h->param.analyse.i_me_method ); // maybe dia?
      h->mb.i_subpel_refine = 4; // 3 should be enough, but not tweaking for speed now
      h->mb.b_chroma_me = 0;
-    return 0;
-}
-
-int x264_lowres_context_alloc( x264_t *h )
-{
-    x264_mb_analysis_t a;
-    return x264_lowres_context_init( h, &a );
  }
  
  static int x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
diff --git a/x264.c b/x264.c

index 0fdde7d8722d486d021418279a7916466f9b77e8..3d43eb5542a0a377877cd90f3877b4d1281fbcf0 100644 (file)
--- a/x264.c
+++ b/x264.c
@@ -262,9 +262,10 @@ static void Help( x264_param_t *defaults, int longhelp )
          "                                  where <option> is either\n"
          "                                      q=<integer> (force QP)\n"
          "                                  or  b=<float> (bitrate multiplier)\n" );
-    H1( "      --qpfile <string>       Force frametypes and QPs for some or all frames\n"
+    H2( "      --qpfile <string>       Force frametypes and QPs for some or all frames\n"
          "                              Format of each line: framenumber frametype QP\n"
-        "                              QP of -1 lets x264 choose. Frametypes: I,i,P,B,b.\n" );
+        "                              QP of -1 lets x264 choose. Frametypes: I,i,P,B,b.\n"
+        "                              QPs are restricted by qpmin/qpmax.\n" );
      H1( "\n" );
      H1( "Analysis:\n" );
      H1( "\n" );
author	Fiona Glaser <fiona@x264.com>
	Sun, 13 Sep 2009 08:02:37 +0000 (01:02 -0700)
committer	Fiona Glaser <fiona@x264.com>
	Mon, 14 Sep 2009 19:27:38 +0000 (12:27 -0700)
common/common.h		patch | blob | history
common/osdep.h		patch | blob | history
encoder/analyse.c		patch | blob | history
encoder/analyse.h		patch | blob | history
encoder/encoder.c		patch | blob | history
encoder/me.c		patch | blob | history
encoder/me.h		patch | blob | history
encoder/ratecontrol.c		patch | blob | history
encoder/slicetype.c		patch | blob | history
x264.c		patch | blob | history