Threaded lookahead

author Fiona Glaser <fiona@x264.com>

Tue, 8 May 2012 22:42:56 +0000 (15:42 -0700)

committer Fiona Glaser <fiona@x264.com>

Fri, 18 May 2012 23:15:14 +0000 (16:15 -0700)
author Fiona Glaser <fiona@x264.com>
Tue, 8 May 2012 22:42:56 +0000 (15:42 -0700)
committer Fiona Glaser <fiona@x264.com>
Fri, 18 May 2012 23:15:14 +0000 (16:15 -0700)
diff --git a/common/common.c b/common/common.c

index d03201d8f1ea2fcf3f8aa48b0757ecf382decd1a..3f40e66f11c205259ed9cc9226960bcb404043fb 100644 (file)
--- a/common/common.c
+++ b/common/common.c
@@ -50,6 +50,7 @@ void x264_param_default( x264_param_t *param )
      /* CPU autodetect */
      param->cpu = x264_cpu_detect();
      param->i_threads = X264_THREADS_AUTO;
+    param->i_lookahead_threads = X264_THREADS_AUTO;
      param->b_deterministic = 1;
      param->i_sync_lookahead = X264_SYNC_LOOKAHEAD_AUTO;
  
@@ -632,6 +633,13 @@ int x264_param_parse( x264_param_t *p, const char *name, const char *value )
          else
              p->i_threads = atoi(value);
      }
+    OPT("lookahead-threads")
+    {
+        if( !strcmp(value, "auto") )
+            p->i_lookahead_threads = X264_THREADS_AUTO;
+        else
+            p->i_lookahead_threads = atoi(value);
+    }
      OPT("sliced-threads")
          p->b_sliced_threads = atobool(value);
      OPT("sync-lookahead")
@@ -1285,6 +1293,7 @@ char *x264_param2string( x264_param_t *p, int b_res )
      s += sprintf( s, " fast_pskip=%d", p->analyse.b_fast_pskip );
      s += sprintf( s, " chroma_qp_offset=%d", p->analyse.i_chroma_qp_offset );
      s += sprintf( s, " threads=%d", p->i_threads );
+    s += sprintf( s, " lookahead_threads=%d", p->i_lookahead_threads );
      s += sprintf( s, " sliced_threads=%d", p->b_sliced_threads );
      if( p->i_slice_count )
          s += sprintf( s, " slices=%d", p->i_slice_count );
diff --git a/common/common.h b/common/common.h

index 5e3421291613becbd7e53fc5aa4fc830db5f3419..04ac11dae5274427c01153ceeb8933815ae13fdd 100644 (file)
--- a/common/common.h
+++ b/common/common.h
@@ -56,6 +56,7 @@ do {\
  #define X264_BFRAME_MAX 16
  #define X264_REF_MAX 16
  #define X264_THREAD_MAX 128
+#define X264_LOOKAHEAD_THREAD_MAX 16
  #define X264_PCM_COST (FRAME_SIZE(256*BIT_DEPTH)+16)
  #define X264_LOOKAHEAD_MAX 250
  #define QP_BD_OFFSET (6*(BIT_DEPTH-8))
@@ -469,6 +470,7 @@ struct x264_t
      x264_param_t    param;
  
      x264_t          *thread[X264_THREAD_MAX+1];
+    x264_t          *lookahead_thread[X264_LOOKAHEAD_THREAD_MAX];
      int             b_thread_active;
      int             i_thread_phase; /* which thread to use for the next frame */
      int             i_thread_idx;   /* which thread this is */
@@ -476,6 +478,7 @@ struct x264_t
      int             i_threadslice_end; /* row after the end of this thread slice */
      int             i_threadslice_pass; /* which pass of encoding we are on */
      x264_threadpool_t *threadpool;
+    x264_threadpool_t *lookaheadpool;
      x264_pthread_mutex_t mutex;
      x264_pthread_cond_t cv;
  
@@ -915,6 +918,7 @@ struct x264_t
  
      /* Buffers that are allocated per-thread even in sliced threads. */
      void *scratch_buffer; /* for any temporary storage that doesn't want repeated malloc */
+    void *scratch_buffer2; /* if the first one's already in use */
      pixel *intra_border_backup[5][3]; /* bottom pixels of the previous mb row, used for intra prediction after the framebuffer has been deblocked */
      /* Deblock strength values are stored for each 4x4 partition. In MBAFF
       * there are four extra values that need to be stored, located in [4][i]. */
diff --git a/common/macroblock.c b/common/macroblock.c

index 8216799c305d78ae1452938275fbdd25303473d7..abce8f68862c90856b2c0c9fab866cac3fc76cda 100644 (file)
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -401,6 +401,9 @@ int x264_macroblock_thread_allocate( x264_t *h, int b_lookahead )
      else
          h->scratch_buffer = NULL;
  
+    int buf_lookahead_threads = (h->mb.i_mb_height + (4 + 32) * h->param.i_lookahead_threads) * sizeof(int) * 2;
+    CHECKED_MALLOC( h->scratch_buffer2, buf_lookahead_threads );
+
      return 0;
  fail:
      return -1;
@@ -418,6 +421,7 @@ void x264_macroblock_thread_free( x264_t *h, int b_lookahead )
                  x264_free( h->intra_border_backup[i][j] - 16 );
      }
      x264_free( h->scratch_buffer );
+    x264_free( h->scratch_buffer2 );
  }
  
  void x264_macroblock_slice_init( x264_t *h )
diff --git a/common/threadpool.c b/common/threadpool.c

index f7a95fcce34ebd57d6342499c6f6a84827b1e4b7..a11bf9d259659f02e5589b39ab705b5d9883f327 100644 (file)
--- a/common/threadpool.c
+++ b/common/threadpool.c
@@ -66,7 +66,7 @@ static void x264_threadpool_thread( x264_threadpool_t *pool )
          x264_pthread_mutex_unlock( &pool->run.mutex );
          if( !job )
              continue;
-        job->ret = job->func( job->arg ); /* execute the function */
+        job->ret = (void*)x264_stack_align( job->func, job->arg ); /* execute the function */
          x264_sync_frame_list_push( &pool->done, (void*)job );
      }
  }
@@ -83,7 +83,7 @@ int x264_threadpool_init( x264_threadpool_t **p_pool, int threads,
  
      pool->init_func = init_func;
      pool->init_arg  = init_arg;
-    pool->threads   = X264_MIN( threads, X264_THREAD_MAX );
+    pool->threads   = threads;
  
      CHECKED_MALLOC( pool->thread_handle, pool->threads * sizeof(x264_pthread_t) );
  
diff --git a/encoder/encoder.c b/encoder/encoder.c

index 2ed1e75fb222cceddf4a645525dd2ed45dbb5c0a..f6246f91c5f0efdf286d6b1594d3337bb89072e6 100644 (file)
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -395,6 +395,15 @@ static void x264_encoder_thread_init( x264_t *h )
          x264_cpu_mask_misalign_sse();
  #endif
  }
+
+static void x264_lookahead_thread_init( x264_t *h )
+{
+#if HAVE_MMX
+    /* Misalign mask has to be set separately for each thread. This function is the init function given to x264_threadpool_init for the lookahead pool. */
+    if( h->param.cpu&X264_CPU_SSE_MISALIGN )
+        x264_cpu_mask_misalign_sse();
+#endif /* HAVE_MMX; when it is not set this function does nothing */
+}
  #endif
  
  /****************************************************************************
@@ -494,6 +503,9 @@ static int x264_validate_parameters( x264_t *h, int b_open )
  
      if( h->param.i_threads == X264_THREADS_AUTO )
          h->param.i_threads = x264_cpu_num_processors() * (h->param.b_sliced_threads?2:3)/2;
+    if( h->param.i_lookahead_threads == X264_THREADS_AUTO )
+        h->param.i_lookahead_threads = h->param.i_threads / (h->param.b_sliced_threads?1:6);
+    int max_sliced_threads = X264_MAX( 1, (h->param.i_height+15)/16 / 4 );
      if( h->param.i_threads > 1 )
      {
  #if !HAVE_THREAD
@@ -503,14 +515,15 @@ static int x264_validate_parameters( x264_t *h, int b_open )
          /* Avoid absurdly small thread slices as they can reduce performance
           * and VBV compliance.  Capped at an arbitrary 4 rows per thread. */
          if( h->param.b_sliced_threads )
-        {
-            int max_threads = (h->param.i_height+15)/16 / 4;
-            h->param.i_threads = X264_MIN( h->param.i_threads, max_threads );
-        }
+            h->param.i_threads = X264_MIN( h->param.i_threads, max_sliced_threads );
      }
      h->param.i_threads = x264_clip3( h->param.i_threads, 1, X264_THREAD_MAX );
+    h->param.i_lookahead_threads = x264_clip3( h->param.i_lookahead_threads, 1, X264_MIN( max_sliced_threads, X264_LOOKAHEAD_THREAD_MAX ) );
      if( h->param.i_threads == 1 )
+    {
          h->param.b_sliced_threads = 0;
+        h->param.i_lookahead_threads = 1;
+    }
      h->i_thread_frames = h->param.b_sliced_threads ? 1 : h->param.i_threads;
      if( h->i_thread_frames > 1 )
          h->param.nalu_process = NULL;
@@ -1271,10 +1284,19 @@ x264_t *x264_encoder_open( x264_param_t *param )
      if( h->param.i_threads > 1 &&
          x264_threadpool_init( &h->threadpool, h->param.i_threads, (void*)x264_encoder_thread_init, h ) )
          goto fail;
+    if( h->param.i_lookahead_threads > 1 &&
+        x264_threadpool_init( &h->lookaheadpool, h->param.i_lookahead_threads, (void*)x264_lookahead_thread_init, h ) )
+        goto fail;
  
      h->thread[0] = h;
      for( int i = 1; i < h->param.i_threads + !!h->param.i_sync_lookahead; i++ )
          CHECKED_MALLOC( h->thread[i], sizeof(x264_t) );
+    if( h->param.i_lookahead_threads > 1 )
+        for( int i = 0; i < h->param.i_lookahead_threads; i++ )
+        {
+            CHECKED_MALLOC( h->lookahead_thread[i], sizeof(x264_t) );
+            *h->lookahead_thread[i] = *h;
+        }
  
      for( int i = 0; i < h->param.i_threads; i++ )
      {
@@ -3457,6 +3479,8 @@ void    x264_encoder_close  ( x264_t *h )
          x264_threadpool_wait_all( h );
      if( h->param.i_threads > 1 )
          x264_threadpool_delete( h->threadpool );
+    if( h->param.i_lookahead_threads > 1 )
+        x264_threadpool_delete( h->lookaheadpool );
      if( h->i_thread_frames > 1 )
      {
          for( int i = 0; i < h->i_thread_frames; i++ )
@@ -3766,6 +3790,10 @@ void    x264_encoder_close  ( x264_t *h )
                  if( h->thread[i]->fref[0][j] && h->thread[i]->fref[0][j]->b_duplicate )
                      x264_frame_delete( h->thread[i]->fref[0][j] );
  
+    if( h->param.i_lookahead_threads > 1 )
+        for( int i = 0; i < h->param.i_lookahead_threads; i++ )
+            x264_free( h->lookahead_thread[i] );
+
      for( int i = h->param.i_threads - 1; i >= 0; i-- )
      {
          x264_frame_t **frame;
diff --git a/encoder/slicetype.c b/encoder/slicetype.c

index f1c207f3949cbeee179e98662911a5549619fa04..4968f4f5bc2a41ea72010a50460a340f990e1781 100644 (file)
--- a/encoder/slicetype.c
+++ b/encoder/slicetype.c
@@ -424,9 +424,21 @@ static void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *r
      }
  }
  
+/* Output buffers are separated by 128 bytes to avoid false sharing of cachelines
+ * in multithreaded lookahead. */
+#define PAD_SIZE 32 /* counted in ints: it is added to int pointers when the buffers are laid out */
+/* Header ints at the start of every output buffer: cost_est, cost_est_aq, intra_mbs, num rows */
+#define NUM_INTS 4
+#define COST_EST 0 /* slot summed into fenc->i_cost_est by x264_slicetype_frame_cost */
+#define COST_EST_AQ 1 /* slot summed into fenc->i_cost_est_aq */
+#define INTRA_MBS 2 /* slot summed into fenc->i_intra_mbs; only written in the inter buffer */
+#define NUM_ROWS 3 /* number of per-row entries that follow the header in this buffer */
+#define ROW_SATD (NUM_INTS + (h->mb.i_mb_y - h->i_threadslice_start)) /* slot of the current MB row, relative to the slice start; needs a variable named h in scope */
+
  static void x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
                                      x264_frame_t **frames, int p0, int p1, int b,
-                                    int dist_scale_factor, int do_search[2], const x264_weight_t *w )
+                                    int dist_scale_factor, int do_search[2], const x264_weight_t *w,
+                                    int *output_inter, int *output_intra )
  {
      x264_frame_t *fref0 = frames[p0];
      x264_frame_t *fref1 = frames[p1];
@@ -571,7 +583,7 @@ static void x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
  #define MVC(mv) { CP32( mvc[i_mvc], mv ); i_mvc++; }
              if( i_mb_x < h->mb.i_mb_width - 1 )
                  MVC( fenc_mv[1] );
-            if( i_mb_y < h->mb.i_mb_height - 1 )
+            if( i_mb_y < h->i_threadslice_end - 1 )
              {
                  MVC( fenc_mv[i_mb_stride] );
                  if( i_mb_x > 0 )
@@ -653,11 +665,11 @@ lowres_intra_mb:
          int i_icost_aq = i_icost;
          if( h->param.rc.i_aq_mode )
              i_icost_aq = (i_icost_aq * fenc->i_inv_qscale_factor[i_mb_xy] + 128) >> 8;
-        fenc->i_row_satds[0][0][h->mb.i_mb_y] += i_icost_aq;
+        output_intra[ROW_SATD] += i_icost_aq;
          if( b_frame_score_mb )
          {
-            fenc->i_cost_est[0][0] += i_icost;
-            fenc->i_cost_est_aq[0][0] += i_icost_aq;
+            output_intra[COST_EST] += i_icost;
+            output_intra[COST_EST_AQ] += i_icost_aq;
          }
      }
      i_bcost += lowres_penalty;
@@ -674,7 +686,7 @@ lowres_intra_mb:
              list_used = 0;
          }
          if( b_frame_score_mb )
-            fenc->i_intra_mbs[b-p0] += b_intra;
+            output_inter[INTRA_MBS] += b_intra;
      }
  
      /* In an I-frame, we've already added the results above in the intra section. */
@@ -683,12 +695,12 @@ lowres_intra_mb:
          int i_bcost_aq = i_bcost;
          if( h->param.rc.i_aq_mode )
              i_bcost_aq = (i_bcost_aq * fenc->i_inv_qscale_factor[i_mb_xy] + 128) >> 8;
-        fenc->i_row_satds[b-p0][p1-b][h->mb.i_mb_y] += i_bcost_aq;
+        output_inter[ROW_SATD] += i_bcost_aq;
          if( b_frame_score_mb )
          {
              /* Don't use AQ-weighted costs for slicetype decision, only for ratecontrol. */
-            fenc->i_cost_est[b-p0][p1-b] += i_bcost;
-            fenc->i_cost_est_aq[b-p0][p1-b] += i_bcost_aq;
+            output_inter[COST_EST] += i_bcost;
+            output_inter[COST_EST_AQ] += i_bcost_aq;
          }
      }
  
@@ -701,6 +713,43 @@ lowres_intra_mb:
     (h->mb.i_mb_width - 2) * (h->mb.i_mb_height - 2) :\
      h->mb.i_mb_width * h->mb.i_mb_height)
  
+typedef struct
+{
+    x264_t *h; /* context to run on; its i_threadslice_start/i_threadslice_end bound the rows processed */
+    x264_mb_analysis_t *a; /* forwarded unchanged to x264_slicetype_mb_cost */
+    x264_frame_t **frames; /* frame array indexed by p0, p1 and b */
+    int p0; /* index of fref0 in frames */
+    int p1; /* index of fref1 in frames */
+    int b; /* index in frames of the frame being costed */
+    int dist_scale_factor; /* 128 by default; derived from b-p0 and p1-p0 when p1 != p0 */
+    int *do_search; /* two flags, [0] for the p0 side and [1] for the p1 side: lowres MVs have not been searched yet */
+    const x264_weight_t *w; /* weights forwarded to x264_slicetype_mb_cost */
+    int *output_inter; /* inter accumulators: NUM_INTS header ints, then one SATD sum per row */
+    int *output_intra; /* intra accumulators, same layout as output_inter */
+} x264_slicetype_slice_t; /* argument bundle for x264_slicetype_slice_cost, so one pointer can be handed to the thread pool */
+
+static void x264_slicetype_slice_cost( x264_slicetype_slice_t *s )
+{
+    x264_t *h = s->h;
+
+    /* Runs x264_slicetype_mb_cost on every macroblock of the rows [i_threadslice_start, i_threadslice_end) of s->h, accumulating into s->output_inter / s->output_intra.
+     * Lowres lookahead goes backwards because the MVs are used as predictors in the main encode. This considerably improves MV prediction overall. */
+
+    /* The edge mbs seem to reduce the predictive quality of the
+     * whole frame's score, but are needed for a spatial distribution. */
+    int do_edges = h->param.rc.b_mb_tree || h->param.rc.i_vbv_buffer_size || h->mb.i_mb_width <= 2 || h->mb.i_mb_height <= 2; /* 1 = also visit the outermost rows and columns */
+
+    int start_y = X264_MIN( h->i_threadslice_end - 1, h->mb.i_mb_height - 2 + do_edges ); /* last row of this slice; skips the bottom frame row unless do_edges */
+    int end_y = X264_MAX( h->i_threadslice_start, 1 - do_edges ); /* first row of this slice; skips the top frame row unless do_edges */
+    int start_x = h->mb.i_mb_width - 2 + do_edges; /* rightmost column visited */
+    int end_x = 1 - do_edges; /* leftmost column visited */
+
+    for( h->mb.i_mb_y = start_y; h->mb.i_mb_y >= end_y; h->mb.i_mb_y-- ) /* the counters live in h->mb; ROW_SATD reads h->mb.i_mb_y */
+        for( h->mb.i_mb_x = start_x; h->mb.i_mb_x >= end_x; h->mb.i_mb_x-- )
+            x264_slicetype_mb_cost( h, s->a, s->frames, s->p0, s->p1, s->b, s->dist_scale_factor,
+                                    s->do_search, s->w, s->output_inter, s->output_intra );
+}
+
  static int x264_slicetype_frame_cost( x264_t *h, x264_mb_analysis_t *a,
                                        x264_frame_t **frames, int p0, int p1, int b,
                                        int b_intra_penalty )
@@ -708,77 +757,131 @@ static int x264_slicetype_frame_cost( x264_t *h, x264_mb_analysis_t *a,
      int i_score = 0;
      int do_search[2];
      const x264_weight_t *w = x264_weight_none;
+    x264_frame_t *fenc = frames[b];
+
      /* Check whether we already evaluated this frame
       * If we have tried this frame as P, then we have also tried
       * the preceding frames as B. (is this still true?) */
      /* Also check that we already calculated the row SATDs for the current frame. */
-    if( frames[b]->i_cost_est[b-p0][p1-b] >= 0 && (!h->param.rc.i_vbv_buffer_size || frames[b]->i_row_satds[b-p0][p1-b][0] != -1) )
-        i_score = frames[b]->i_cost_est[b-p0][p1-b];
+    if( fenc->i_cost_est[b-p0][p1-b] >= 0 && (!h->param.rc.i_vbv_buffer_size || fenc->i_row_satds[b-p0][p1-b][0] != -1) )
+        i_score = fenc->i_cost_est[b-p0][p1-b];
      else
      {
          int dist_scale_factor = 128;
-        int *row_satd = frames[b]->i_row_satds[b-p0][p1-b];
-        int *row_satd_intra = frames[b]->i_row_satds[0][0];
  
          /* For each list, check to see whether we have lowres motion-searched this reference frame before. */
-        do_search[0] = b != p0 && frames[b]->lowres_mvs[0][b-p0-1][0][0] == 0x7FFF;
-        do_search[1] = b != p1 && frames[b]->lowres_mvs[1][p1-b-1][0][0] == 0x7FFF;
+        do_search[0] = b != p0 && fenc->lowres_mvs[0][b-p0-1][0][0] == 0x7FFF;
+        do_search[1] = b != p1 && fenc->lowres_mvs[1][p1-b-1][0][0] == 0x7FFF;
          if( do_search[0] )
          {
              if( h->param.analyse.i_weighted_pred && b == p1 )
              {
                  x264_emms();
-                x264_weights_analyse( h, frames[b], frames[p0], 1 );
-                w = frames[b]->weight[0];
+                x264_weights_analyse( h, fenc, frames[p0], 1 );
+                w = fenc->weight[0];
              }
-            frames[b]->lowres_mvs[0][b-p0-1][0][0] = 0;
+            fenc->lowres_mvs[0][b-p0-1][0][0] = 0;
          }
-        if( do_search[1] ) frames[b]->lowres_mvs[1][p1-b-1][0][0] = 0;
+        if( do_search[1] ) fenc->lowres_mvs[1][p1-b-1][0][0] = 0;
  
-        if( b == p1 )
-            frames[b]->i_intra_mbs[b-p0] = 0;
-        if( !frames[b]->b_intra_calculated )
-        {
-            frames[b]->i_cost_est[0][0] = 0;
-            frames[b]->i_cost_est_aq[0][0] = 0;
-        }
          if( p1 != p0 )
              dist_scale_factor = ( ((b-p0) << 8) + ((p1-p0) >> 1) ) / (p1-p0);
  
-        frames[b]->i_cost_est[b-p0][p1-b] = 0;
-        frames[b]->i_cost_est_aq[b-p0][p1-b] = 0;
-
-        /* Lowres lookahead goes backwards because the MVs are used as predictors in the main encode.
-         * This considerably improves MV prediction overall. */
+        int output_buf_size = h->mb.i_mb_height + (NUM_INTS + PAD_SIZE) * h->param.i_lookahead_threads;
+        int *output_inter[X264_LOOKAHEAD_THREAD_MAX+1];
+        int *output_intra[X264_LOOKAHEAD_THREAD_MAX+1];
+        output_inter[0] = h->scratch_buffer2;
+        output_intra[0] = output_inter[0] + output_buf_size;
  
-        /* The edge mbs seem to reduce the predictive quality of the
-         * whole frame's score, but are needed for a spatial distribution. */
-        if( h->param.rc.b_mb_tree || h->param.rc.i_vbv_buffer_size ||
-            h->mb.i_mb_width <= 2 || h->mb.i_mb_height <= 2 )
+        if( h->param.i_lookahead_threads > 1 )
          {
-            for( h->mb.i_mb_y = h->mb.i_mb_height - 1; h->mb.i_mb_y >= 0; h->mb.i_mb_y-- )
+            x264_slicetype_slice_t s[X264_LOOKAHEAD_THREAD_MAX];
+
+            for( int i = 0; i < h->param.i_lookahead_threads; i++ )
              {
-                row_satd[h->mb.i_mb_y] = 0;
-                if( !frames[b]->b_intra_calculated )
-                    row_satd_intra[h->mb.i_mb_y] = 0;
-                for( h->mb.i_mb_x = h->mb.i_mb_width - 1; h->mb.i_mb_x >= 0; h->mb.i_mb_x-- )
-                    x264_slicetype_mb_cost( h, a, frames, p0, p1, b, dist_scale_factor, do_search, w );
+                x264_t *t = h->lookahead_thread[i];
+
+                /* FIXME move this somewhere else */
+                t->mb.i_me_method = h->mb.i_me_method;
+                t->mb.i_subpel_refine = h->mb.i_subpel_refine;
+                t->mb.b_chroma_me = h->mb.b_chroma_me;
+
+                s[i] = (x264_slicetype_slice_t){ t, a, frames, p0, p1, b, dist_scale_factor, do_search, w,
+                                                 output_inter[i], output_intra[i] };
+
+                t->i_threadslice_start = ((h->mb.i_mb_height *  i    + h->param.i_lookahead_threads/2) / h->param.i_lookahead_threads);
+                t->i_threadslice_end   = ((h->mb.i_mb_height * (i+1) + h->param.i_lookahead_threads/2) / h->param.i_lookahead_threads);
+
+                int thread_height = t->i_threadslice_end - t->i_threadslice_start;
+                int thread_output_size = thread_height + NUM_INTS;
+                memset( output_inter[i], 0, thread_output_size * sizeof(int) );
+                memset( output_intra[i], 0, thread_output_size * sizeof(int) );
+                output_inter[i][NUM_ROWS] = output_intra[i][NUM_ROWS] = thread_height;
+
+                output_inter[i+1] = output_inter[i] + thread_output_size + PAD_SIZE;
+                output_intra[i+1] = output_intra[i] + thread_output_size + PAD_SIZE;
+
+                x264_threadpool_run( h->lookaheadpool, (void*)x264_slicetype_slice_cost, &s[i] );
              }
+            for( int i = 0; i < h->param.i_lookahead_threads; i++ )
+                x264_threadpool_wait( h->lookaheadpool, &s[i] );
          }
          else
          {
-            for( h->mb.i_mb_y = h->mb.i_mb_height - 2; h->mb.i_mb_y >= 1; h->mb.i_mb_y-- )
-                for( h->mb.i_mb_x = h->mb.i_mb_width - 2; h->mb.i_mb_x >= 1; h->mb.i_mb_x-- )
-                    x264_slicetype_mb_cost( h, a, frames, p0, p1, b, dist_scale_factor, do_search, w );
+            h->i_threadslice_start = 0;
+            h->i_threadslice_end = h->mb.i_mb_height;
+            memset( output_inter[0], 0, (output_buf_size - PAD_SIZE) * sizeof(int) );
+            memset( output_intra[0], 0, (output_buf_size - PAD_SIZE) * sizeof(int) );
+            output_inter[0][NUM_ROWS] = output_intra[0][NUM_ROWS] = h->mb.i_mb_height;
+            x264_slicetype_slice_t s = (x264_slicetype_slice_t){ h, a, frames, p0, p1, b, dist_scale_factor, do_search, w,
+                                                                 output_inter[0], output_intra[0] };
+            x264_slicetype_slice_cost( &s );
+        }
+
+        /* Sum up accumulators */
+        if( b == p1 )
+            fenc->i_intra_mbs[b-p0] = 0;
+        if( !fenc->b_intra_calculated )
+        {
+            fenc->i_cost_est[0][0] = 0;
+            fenc->i_cost_est_aq[0][0] = 0;
+        }
+        fenc->i_cost_est[b-p0][p1-b] = 0;
+        fenc->i_cost_est_aq[b-p0][p1-b] = 0;
+
+        int *row_satd_inter = fenc->i_row_satds[b-p0][p1-b];
+        int *row_satd_intra = fenc->i_row_satds[0][0];
+        for( int i = 0; i < h->param.i_lookahead_threads; i++ )
+        {
+            if( b == p1 )
+                fenc->i_intra_mbs[b-p0] += output_inter[i][INTRA_MBS];
+            if( !fenc->b_intra_calculated )
+            {
+                fenc->i_cost_est[0][0] += output_intra[i][COST_EST];
+                fenc->i_cost_est_aq[0][0] += output_intra[i][COST_EST_AQ];
+            }
+
+            fenc->i_cost_est[b-p0][p1-b] += output_inter[i][COST_EST];
+            fenc->i_cost_est_aq[b-p0][p1-b] += output_inter[i][COST_EST_AQ];
+
+            if( h->param.rc.i_vbv_buffer_size )
+            {
+                int row_count = output_inter[i][NUM_ROWS];
+                memcpy( row_satd_inter, output_inter[i] + NUM_INTS, row_count * sizeof(int) );
+                if( !fenc->b_intra_calculated )
+                    memcpy( row_satd_intra, output_intra[i] + NUM_INTS, row_count * sizeof(int) );
+                row_satd_inter += row_count;
+                row_satd_intra += row_count;
+            }
          }
  
-        i_score = frames[b]->i_cost_est[b-p0][p1-b];
+        i_score = fenc->i_cost_est[b-p0][p1-b];
          if( b != p1 )
              i_score = (uint64_t)i_score * 100 / (120 + h->param.i_bframe_bias);
          else
-            frames[b]->b_intra_calculated = 1;
+            fenc->b_intra_calculated = 1;
  
-        frames[b]->i_cost_est[b-p0][p1-b] = i_score;
+        fenc->i_cost_est[b-p0][p1-b] = i_score;
          x264_emms();
      }
  
@@ -786,7 +889,7 @@ static int x264_slicetype_frame_cost( x264_t *h, x264_mb_analysis_t *a,
      {
          // arbitrary penalty for I-blocks after B-frames
          int nmb = NUM_MBS;
-        i_score += (uint64_t)i_score * frames[b]->i_intra_mbs[b-p0] / (nmb * 8);
+        i_score += (uint64_t)i_score * fenc->i_intra_mbs[b-p0] / (nmb * 8);
      }
      return i_score;
  }
diff --git a/x264.c b/x264.c

index 886ba370784cebb51c19403ca62cba7101fa5c56..b2b796673c05baaf9e43bec0f78ea90f565f9eee 100644 (file)
--- a/x264.c
+++ b/x264.c
@@ -797,6 +797,7 @@ static void help( x264_param_t *defaults, int longhelp )
      H1( "      --psnr                  Enable PSNR computation\n" );
      H1( "      --ssim                  Enable SSIM computation\n" );
      H1( "      --threads <integer>     Force a specific number of threads\n" );
+    H2( "      --lookahead-threads <integer> Force a specific number of lookahead threads\n" );
      H2( "      --sliced-threads        Low-latency but lower-efficiency threading\n" );
      H2( "      --thread-input          Run Avisynth in its own thread\n" );
      H2( "      --sync-lookahead <integer> Number of buffer frames for threaded lookahead\n" );
@@ -965,6 +966,7 @@ static struct option long_options[] =
      { "zones",       required_argument, NULL, 0 },
      { "qpfile",      required_argument, NULL, OPT_QPFILE },
      { "threads",     required_argument, NULL, 0 },
+    { "lookahead-threads", required_argument, NULL, 0 },
      { "sliced-threads",    no_argument, NULL, 0 },
      { "no-sliced-threads", no_argument, NULL, 0 },
      { "slice-max-size",    required_argument, NULL, 0 },
diff --git a/x264.h b/x264.h

index 3dcb386d898f5fbfd0f97b3f727cd6f7f9ce7986..f150151efa7e593015bb169045f2d8903828667e 100644 (file)
--- a/x264.h
+++ b/x264.h
@@ -41,7 +41,7 @@
  
  #include "x264_config.h"
  
-#define X264_BUILD 124
+#define X264_BUILD 125
  
  /* Application developers planning to link against a shared library version of
   * libx264 from a Microsoft Visual Studio or similar development environment
@@ -254,7 +254,8 @@ typedef struct x264_param_t
  {
      /* CPU flags */
      unsigned int cpu;
-    int         i_threads;       /* encode multiple frames in parallel */
+    int         i_threads;           /* encode multiple frames in parallel */
+    int         i_lookahead_threads; /* multiple threads for lookahead analysis */
      int         b_sliced_threads;  /* Whether to use slice-based threading. */
      int         b_deterministic; /* whether to allow non-deterministic optimizations when threaded */
      int         b_cpu_independent; /* force canonical behavior rather than cpu-dependent optimal algorithms */
author	Fiona Glaser <fiona@x264.com>
	Tue, 8 May 2012 22:42:56 +0000 (15:42 -0700)
committer	Fiona Glaser <fiona@x264.com>
	Fri, 18 May 2012 23:15:14 +0000 (16:15 -0700)
common/common.c		patch \| blob \| history
common/common.h		patch \| blob \| history
common/macroblock.c		patch \| blob \| history
common/threadpool.c		patch \| blob \| history
encoder/encoder.c		patch \| blob \| history
encoder/slicetype.c		patch \| blob \| history
x264.c		patch \| blob \| history
x264.h		patch \| blob \| history